diff --git a/API/HydrusNetwork.py b/API/HydrusNetwork.py index ace54c8..630fa42 100644 --- a/API/HydrusNetwork.py +++ b/API/HydrusNetwork.py @@ -12,6 +12,7 @@ import sys import time from SYS.logger import log, debug +from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS as GLOBAL_SUPPORTED_EXTENSIONS import tempfile import logging from dataclasses import dataclass, field @@ -1103,9 +1104,7 @@ SUPPORTED_FILETYPES = { } # Flatten to get all supported extensions -ALL_SUPPORTED_EXTENSIONS = set() -for category_extensions in SUPPORTED_FILETYPES.values(): - ALL_SUPPORTED_EXTENSIONS.update(category_extensions.keys()) +ALL_SUPPORTED_EXTENSIONS = set(GLOBAL_SUPPORTED_EXTENSIONS) # Global Hydrus client cache to reuse session keys diff --git a/API/archive_client.py b/API/archive_client.py deleted file mode 100644 index 91ec609..0000000 --- a/API/archive_client.py +++ /dev/null @@ -1,584 +0,0 @@ -"""Archive.org API client for borrowing and downloading books. - -This module provides low-level functions for interacting with Archive.org: -- Authentication (login, credential management) -- Borrowing (loan, return_loan) -- Book metadata extraction (get_book_infos, get_book_metadata) -- Image downloading and deobfuscation -- PDF creation with metadata - -Used by Provider/openlibrary.py for the borrowing workflow. -""" -from __future__ import annotations - -import base64 -import hashlib -import logging -import os -import re -import sys -import time -from concurrent import futures -from typing import Any, Dict, List, Optional, Sequence, Tuple - -import requests - -from SYS.logger import log, debug - -try: - from Crypto.Cipher import AES # type: ignore - from Crypto.Util import Counter # type: ignore -except ImportError: - AES = None # type: ignore - Counter = None # type: ignore - -try: - from tqdm import tqdm # type: ignore -except ImportError: - tqdm = None # type: ignore - - -def credential_openlibrary(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]: - """Get OpenLibrary/Archive.org email and password from config. - - Supports both formats: - - New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}} - - Old: {"Archive": {"email": "...", "password": "..."}} - {"archive_org_email": "...", "archive_org_password": "..."} - - Returns: (email, password) tuple, each can be None - """ - if not isinstance(config, dict): - return None, None - - # Try new format first - provider_config = config.get("provider", {}) - if isinstance(provider_config, dict): - openlibrary_config = provider_config.get("openlibrary", {}) - if isinstance(openlibrary_config, dict): - email = openlibrary_config.get("email") - password = openlibrary_config.get("password") - if email or password: - return email, password - - # Try old nested format - archive_config = config.get("Archive") - if isinstance(archive_config, dict): - email = archive_config.get("email") - password = archive_config.get("password") - if email or password: - return email, password - - # Fall back to old flat format - email = config.get("archive_org_email") - password = config.get("archive_org_password") - return email, password - - -class BookNotAvailableError(Exception): - """Raised when a book is not available for borrowing (waitlisted/in use).""" - pass - - -def display_error(response: requests.Response, message: str) -> None: - """Display error and exit.""" - log(message, file=sys.stderr) - log(response.text, file=sys.stderr) - sys.exit(1) - - -def login(email: str, password: str) -> requests.Session: - """Login to archive.org. 
- - Args: - email: Archive.org email - password: Archive.org password - - Returns: - Authenticated requests.Session - - Raises: - SystemExit on login failure - """ - session = requests.Session() - session.get("https://archive.org/account/login", timeout=30) - - data = {"username": email, "password": password} - response = session.post("https://archive.org/account/login", data=data, timeout=30) - - if "bad_login" in response.text: - log("Invalid credentials!", file=sys.stderr) - sys.exit(1) - if "Successful login" in response.text: - debug("Successful login") - return session - display_error(response, "[-] Error while login:") - sys.exit(1) # Unreachable but satisfies type checker - - -def loan(session: requests.Session, book_id: str, verbose: bool = True) -> requests.Session: - """Borrow a book from archive.org (14-day loan). - - Args: - session: Authenticated requests.Session from login() - book_id: Archive.org book identifier (e.g., 'ia_book_id') - verbose: Whether to log messages - - Returns: - Session with active loan - - Raises: - SystemExit on loan failure - """ - data = {"action": "grant_access", "identifier": book_id} - response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30) - data["action"] = "browse_book" - response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30) - - if response.status_code == 400: - try: - if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.": - debug("Book is not available for borrowing (waitlisted or in use)") - raise BookNotAvailableError("Book is waitlisted or in use") - display_error(response, "Something went wrong when trying to borrow the book.") - except BookNotAvailableError: - raise - except: - display_error(response, "The book cannot be borrowed") - - data["action"] = "create_token" - response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30) - - if "token" in response.text: - if verbose: - debug("Successful loan") - return session - display_error(response, "Something went wrong when trying to borrow the book.") - sys.exit(1) # Unreachable but satisfies type checker - - -def return_loan(session: requests.Session, book_id: str) -> None: - """Return a borrowed book. - - Args: - session: Authenticated requests.Session with active loan - book_id: Archive.org book identifier - """ - data = {"action": "return_loan", "identifier": book_id} - response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30) - if response.status_code == 200 and response.json()["success"]: - debug("Book returned") - else: - display_error(response, "Something went wrong when trying to return the book") - - -def get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]: - """Extract book information and page links from archive.org viewer. - - Args: - session: Authenticated requests.Session - url: Book URL (e.g., https://archive.org/borrow/book_id or /details/book_id) - - Returns: - Tuple of (title, page_links, metadata) - - Raises: - RuntimeError: If page data cannot be extracted - """ - r = session.get(url, timeout=30).text - - # Try to extract the infos URL from the response - try: - # Look for the "url" field in the response using regex - # Matches "url":"//archive.org/..." 
- import re - match = re.search(r'"url"\s*:\s*"([^"]+)"', r) - if not match: - raise ValueError("No 'url' field found in response") - - url_path = match.group(1) - if url_path.startswith("//"): - infos_url = "https:" + url_path - else: - infos_url = url_path - - infos_url = infos_url.replace("\\u0026", "&") - except (IndexError, ValueError, AttributeError) as e: - # If URL extraction fails, raise with better error message - raise RuntimeError(f"Failed to extract book info URL from response: {e}") - - response = session.get(infos_url, timeout=30) - data = response.json()["data"] - title = data["brOptions"]["bookTitle"].strip().replace(" ", "_") - title = "".join(c for c in title if c not in '<>:"/\\|?*') # Filter forbidden chars - title = title[:150] # Trim to avoid long file names - metadata = data["metadata"] - links = [] - - # Safely extract page links from brOptions data - try: - br_data = data.get("brOptions", {}).get("data", []) - for item in br_data: - if isinstance(item, list): - for page in item: - if isinstance(page, dict) and "uri" in page: - links.append(page["uri"]) - elif isinstance(item, dict) and "uri" in item: - links.append(item["uri"]) - except (KeyError, IndexError, TypeError) as e: - log(f"Warning: Error parsing page links: {e}", file=sys.stderr) - # Continue with whatever links we found - - if len(links) > 1: - debug(f"Found {len(links)} pages") - return title, links, metadata - elif len(links) == 1: - debug(f"Found {len(links)} page") - return title, links, metadata - else: - log("Error while getting image links - no pages found", file=sys.stderr) - raise RuntimeError("No pages found in book data") - - -def image_name(pages: int, page: int, directory: str) -> str: - """Generate image filename for page. - - Args: - pages: Total number of pages - page: Current page number (0-indexed) - directory: Directory to save to - - Returns: - Full path to image file - """ - return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg" - - -def deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes: - """Decrypt obfuscated image data using AES-CTR. - - This handles Archive.org's image obfuscation for borrowed books. 
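In outline, the scheme below works like this (a sketch assuming pycryptodome; the header and URL values are placeholders, not taken from the diff):

import base64
import hashlib
import re

from Crypto.Cipher import AES
from Crypto.Util import Counter

def deobfuscate_sketch(encrypted: bytes, link: str, obf_header: str) -> bytes:
    # The header has the form "1|<base64 counter>"; only version "1" is handled.
    version, counter_b64 = obf_header.split("|")
    assert version == "1"
    # AES key: first 16 bytes of SHA-1 over the URL path (scheme and host stripped).
    path = re.sub(r"^https?://.*?/", "/", link)
    key = hashlib.sha1(path.encode("utf-8")).digest()[:16]
    # The decoded 16-byte counter splits into an 8-byte prefix and an
    # 8-byte big-endian initial value for AES-CTR.
    counter_bytes = base64.b64decode(counter_b64)
    ctr = Counter.new(64, prefix=counter_bytes[:8],
                      initial_value=int.from_bytes(counter_bytes[8:], "big"))
    cipher = AES.new(key, AES.MODE_CTR, counter=ctr)
    # Only the first 1024 bytes are obfuscated; the remainder is plain image data.
    return cipher.decrypt(encrypted[:1024]) + encrypted[1024:]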
- Based on: https://github.com/justimm - - Args: - image_data: Encrypted image bytes - link: Image URL (used to derive AES key) - obf_header: X-Obfuscate header value (format: "1|BASE64_COUNTER") - - Returns: - Decrypted image bytes - """ - if not AES or not Counter: - raise RuntimeError("Crypto library not available") - - try: - version, counter_b64 = obf_header.split("|") - except Exception as e: - raise ValueError("Invalid X-Obfuscate header format") from e - - if version != "1": - raise ValueError("Unsupported obfuscation version: " + version) - - # Derive AES key from URL - aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link) - sha1_digest = hashlib.sha1(aesKey.encode("utf-8")).digest() - key = sha1_digest[:16] - - # Decode counter - counter_bytes = base64.b64decode(counter_b64) - if len(counter_bytes) != 16: - raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}") - - prefix = counter_bytes[:8] - initial_value = int.from_bytes(counter_bytes[8:], byteorder="big") - - # Create AES-CTR cipher - ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore - cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore - - decrypted_part = cipher.decrypt(image_data[:1024]) - new_data = decrypted_part + image_data[1024:] - return new_data - - -def download_one_image( - session: requests.Session, - link: str, - i: int, - directory: str, - book_id: str, - pages: int, -) -> None: - """Download a single book page image. - - Handles obfuscated images and re-borrowing on 403 errors. - - Args: - session: Authenticated requests.Session - link: Direct image URL - i: Page index (0-based) - directory: Directory to save to - book_id: Archive.org book ID (for re-borrowing on 403) - pages: Total number of pages - """ - headers = { - "Referer": "https://archive.org/", - "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", - "Sec-Fetch-Site": "same-site", - "Sec-Fetch-Mode": "no-cors", - "Sec-Fetch-Dest": "image", - } - retry = True - response = None - while retry: - try: - response = session.get(link, headers=headers, timeout=30) - if response.status_code == 403: - session = loan(session, book_id, verbose=False) - raise Exception("Borrow again") - if response.status_code == 200: - retry = False - except: - time.sleep(1) - - image = image_name(pages, i, directory) - - if response is None: - log(f"Failed to download page {i}", file=sys.stderr) - return - - obf_header = response.headers.get("X-Obfuscate") - image_content = None - if obf_header: - try: - image_content = deobfuscate_image(response.content, link, obf_header) - except Exception as e: - log(f"Deobfuscation failed: {e}", file=sys.stderr) - return - else: - image_content = response.content - - with open(image, "wb") as f: - f.write(image_content) - - -def download( - session: requests.Session, - n_threads: int, - directory: str, - links: List[str], - scale: int, - book_id: str, -) -> List[str]: - """Download all book pages as images. - - Uses thread pool for parallel downloads. 
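Typical call sequence, as the provider's borrow path uses it (illustrative only; the Archive.org identifier, credentials and paths are placeholders, and the output directory must already exist):

session = login("user@example.com", "app-password")
session = loan(session, "examplebook00ietext", verbose=False)
title, links, metadata = get_book_infos(session, "https://archive.org/borrow/examplebook00ietext")
images = download(session, n_threads=10, directory="/tmp/examplebook", links=links,
                  scale=3, book_id="examplebook00ietext")
# The page images are then assembled into a PDF (e.g. with img2pdf), and the loan
# can be given back with return_loan(session, "examplebook00ietext").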
- - Args: - session: Authenticated requests.Session - n_threads: Number of download threads - directory: Directory to save images to - links: List of image url - scale: Image resolution (0=highest, 10=lowest) - book_id: Archive.org book ID (for re-borrowing) - - Returns: - List of downloaded image file paths - """ - debug("Downloading pages...") - links = [f"{link}&rotate=0&scale={scale}" for link in links] - pages = len(links) - - tasks = [] - with futures.ThreadPoolExecutor(max_workers=n_threads) as executor: - for link in links: - i = links.index(link) - tasks.append( - executor.submit( - download_one_image, - session=session, - link=link, - i=i, - directory=directory, - book_id=book_id, - pages=pages, - ) - ) - if tqdm: - for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore - pass - else: - for _ in futures.as_completed(tasks): - pass - - images = [image_name(pages, i, directory) for i in range(len(links))] - return images - - -def check_direct_download(book_id: str) -> Tuple[bool, str]: - """Check if a book can be downloaded directly without borrowing. - - Searches Archive.org metadata for downloadable PDF files. - - Args: - book_id: Archive.org book identifier - - Returns: - Tuple of (can_download: bool, pdf_url: str) - """ - try: - # First, try to get the metadata to find the actual PDF filename - metadata_url = f"https://archive.org/metadata/{book_id}" - response = requests.get(metadata_url, timeout=10) - response.raise_for_status() - metadata = response.json() - - # Find PDF file in files list - if "files" in metadata: - for file_info in metadata["files"]: - filename = file_info.get("name", "") - if filename.endswith(".pdf") and file_info.get("source") == "original": - # Found the original PDF - pdf_filename = filename - pdf_url = f"https://archive.org/download/{book_id}/{pdf_filename.replace(' ', '%20')}" - - # Verify it's accessible - check_response = requests.head(pdf_url, timeout=5, allow_redirects=True) - if check_response.status_code == 200: - return True, pdf_url - - return False, "" - - except Exception as e: - log(f"Error checking direct download: {e}", file=sys.stderr) - return False, "" - - -def get_openlibrary_by_isbn(isbn: str) -> Dict[str, Any]: - """Fetch book data from OpenLibrary using ISBN. - - Args: - isbn: ISBN-10 or ISBN-13 to search for - - Returns: - Dictionary with book metadata from OpenLibrary - """ - try: - # Try ISBN API first - api_url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json" - response = requests.get(api_url, timeout=10) - response.raise_for_status() - data = response.json() - - if data: - # Get first result - key = list(data.keys())[0] - return data[key] - return {} - except Exception as e: - log(f"Error fetching OpenLibrary data by ISBN: {e}", file=sys.stderr) - return {} - - -def extract_isbn_from_metadata(metadata: Dict[str, Any]) -> str: - """Extract ISBN from archive.org metadata. - - Looks for ISBN in various metadata fields. 
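For example (illustrative inputs; hyphens and spaces are stripped, and only 10- or 13-character results are accepted):

extract_isbn_from_metadata({"isbn": ["978-0-306-40615-7"]})   # -> "9780306406157"
extract_isbn_from_metadata({"ISBN": "0-306-40615-2"})         # -> "0306406152"
extract_isbn_from_metadata({"title": "no identifiers here"})  # -> ""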
- - Args: - metadata: Archive.org metadata dictionary - - Returns: - ISBN string (clean, no hyphens) or empty string if not found - """ - # Try various common metadata fields - isbn_fields = [ - "isbn", "ISBN", "isbn_13", "isbn_10", "isbns", - "isbn-10", "isbn-13", "identifer_isbn" - ] - - for field in isbn_fields: - if field in metadata: - isbn_val = metadata[field] - if isinstance(isbn_val, list): - isbn_val = isbn_val[0] if isbn_val else None - if isbn_val and isinstance(isbn_val, str): - # Clean ISBN (remove hyphens, spaces) - isbn_clean = isbn_val.replace("-", "").replace(" ", "") - if len(isbn_clean) in [10, 13]: - return isbn_clean - - return "" - - -def normalize_url(url: str) -> str: - """Convert openlibrary.org URL to archive.org URL. - - Looks up the actual Archive.org ID from OpenLibrary API. - - Args: - url: Book URL (archive.org or openlibrary.org format) - - Returns: - Normalized archive.org URL - """ - url = url.strip() - - # Already archive.org format - if url.startswith("https://archive.org/details/"): - return url - - # Convert openlibrary.org format by querying the OpenLibrary API - if "openlibrary.org/books/" in url: - try: - # Extract the book ID (e.g., OL6796852M) - parts = url.split("/books/") - if len(parts) > 1: - book_id = parts[1].split("/")[0] - - # Query OpenLibrary API to get the book metadata - api_url = f"https://openlibrary.org/books/{book_id}.json" - response = requests.get(api_url, timeout=10) - response.raise_for_status() - data = response.json() - - # Look for identifiers including internet_archive or ocaid - # First try ocaid (Open Content Alliance ID) - this is most common - if "ocaid" in data: - ocaid = data["ocaid"] - return f"https://archive.org/details/{ocaid}" - - # Check for identifiers object - if "identifiers" in data: - identifiers = data["identifiers"] - - # Look for internet_archive ID - if "internet_archive" in identifiers: - ia_ids = identifiers["internet_archive"] - if isinstance(ia_ids, list) and ia_ids: - ia_id = ia_ids[0] - else: - ia_id = ia_ids - return f"https://archive.org/details/{ia_id}" - - # If no IA identifier found, use the book ID as fallback - log(f"No Internet Archive ID found for {book_id}. Attempting with OpenLibrary ID.", file=sys.stderr) - return f"https://archive.org/details/{book_id}" - - except requests.RequestException as e: - log(f"Could not fetch OpenLibrary metadata: {e}", file=sys.stderr) - # Fallback to using the book ID directly - parts = url.split("/books/") - if len(parts) > 1: - book_id = parts[1].split("/")[0] - return f"https://archive.org/details/{book_id}" - except (KeyError, IndexError) as e: - log(f"Error parsing OpenLibrary response: {e}", file=sys.stderr) - # Fallback to using the book ID directly - parts = url.split("/books/") - if len(parts) > 1: - book_id = parts[1].split("/")[0] - return f"https://archive.org/details/{book_id}" - - # Return original if can't parse - return url diff --git a/API/folder.py b/API/folder.py index 9fbc561..27d65c5 100644 --- a/API/folder.py +++ b/API/folder.py @@ -407,38 +407,53 @@ class API_folder_store: logger.error(f"Error clearing worker log for {worker_id}: {exc}", exc_info=True) def _migrate_metadata_schema(self, cursor) -> None: - """Import legacy metadata from old schema if present. Existing hash-based schema is ready to use.""" + """Ensure metadata schema is up-to-date. + + - If a legacy schema is detected, attempt to import/upgrade (best-effort). + - If the hash-based schema exists, add any missing columns expected by current code. 
+ """ try: # Check if this is a fresh new database (hash-based schema) cursor.execute('PRAGMA table_info(metadata)') existing_columns = {row[1] for row in cursor.fetchall()} - - # If hash column exists, we're already on the new schema - if 'hash' in existing_columns: - logger.info("Database is already using hash-based schema - no migration needed") - return - - # Legacy migration: If old schema exists, try to import data + + # Legacy migration: If old schema exists, try to import data. # Old schema would have had: id (INTEGER PRIMARY KEY), file_hash (TEXT), etc. - if 'id' in existing_columns and 'file_hash' in existing_columns: - logger.info("Detected legacy metadata schema - importing to new hash-based schema") - # This would be complex legacy migration - for now just note it - logger.info("Legacy metadata table detected but import not yet implemented") + if 'hash' not in existing_columns: + if 'id' in existing_columns and 'file_hash' in existing_columns: + logger.info("Detected legacy metadata schema - importing to new hash-based schema") + # This would be complex legacy migration - for now just note it. + logger.info("Legacy metadata table detected but import not yet implemented") + return + + # Unknown/unsupported schema; nothing we can safely do here. return - - # Add any missing columns to the new schema - for col_name, col_def in [('size', 'INTEGER'), ('ext', 'TEXT'), - ('type', 'TEXT'), - ('time_imported', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP'), - ('time_modified', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP')]: + + # Hash-based schema exists: add any missing columns expected by current code. + # These are safe ALTER TABLE additions for older DBs. + column_specs = { + 'size': 'INTEGER', + 'ext': 'TEXT', + 'type': 'TEXT', + 'url': 'TEXT', + 'relationships': 'TEXT', + 'duration': 'REAL', + 'time_imported': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP', + 'time_modified': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP', + 'created_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP', + 'updated_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP', + } + + for col_name, col_def in column_specs.items(): if col_name not in existing_columns: try: cursor.execute(f"ALTER TABLE metadata ADD COLUMN {col_name} {col_def}") + existing_columns.add(col_name) logger.info(f"Added '{col_name}' column to metadata table") except Exception as e: logger.debug(f"Column '{col_name}' may already exist: {e}") - - # Populate type column from ext if not already populated + + # Populate type column from ext if not already populated. if 'type' in existing_columns and 'ext' in existing_columns: try: from SYS.utils_constant import get_type_from_ext @@ -451,7 +466,7 @@ class API_folder_store: logger.info(f"Populated type column for {len(rows)} metadata entries") except Exception as e: logger.debug(f"Could not populate type column: {e}") - + self.connection.commit() except Exception as e: logger.debug(f"Note: Schema import/migration completed with status: {e}") @@ -929,6 +944,13 @@ class API_folder_store: if not fields: return + # Ensure a metadata row exists so updates don't silently no-op. + # This can happen for older DBs or entries created without explicit metadata. + cursor.execute( + "INSERT OR IGNORE INTO metadata (hash) VALUES (?)", + (file_hash,), + ) + values.append(file_hash) sql = f"UPDATE metadata SET {', '.join(fields)}, time_modified = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP WHERE hash = ?" 
@@ -1681,6 +1703,84 @@ class DatabaseAPI: ) return {row[0] for row in cursor.fetchall()} + def get_file_hashes_with_any_url(self, limit: Optional[int] = None) -> Set[str]: + """Get hashes of files that have any non-empty URL metadata.""" + cursor = self.get_cursor() + cursor.execute( + """ + SELECT DISTINCT f.hash + FROM files f + JOIN metadata m ON f.hash = m.hash + WHERE m.url IS NOT NULL + AND TRIM(m.url) != '' + AND TRIM(m.url) != '[]' + LIMIT ? + """, + (limit or 10000,), + ) + return {row[0] for row in cursor.fetchall()} + + def get_file_hashes_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> Set[str]: + """Get hashes of files whose URL metadata contains a substring (case-insensitive).""" + cursor = self.get_cursor() + cursor.execute( + """ + SELECT DISTINCT f.hash + FROM files f + JOIN metadata m ON f.hash = m.hash + WHERE m.url IS NOT NULL + AND LOWER(m.url) LIKE ? + LIMIT ? + """, + (like_pattern.lower(), limit or 10000), + ) + return {row[0] for row in cursor.fetchall()} + + def get_files_with_any_url(self, limit: Optional[int] = None) -> List[tuple]: + """Get files that have any non-empty URL metadata. + + Returns (hash, file_path, size, ext) tuples. + """ + cursor = self.get_cursor() + cursor.execute( + """ + SELECT f.hash, f.file_path, + COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size, + COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext + FROM files f + JOIN metadata m ON f.hash = m.hash + WHERE m.url IS NOT NULL + AND TRIM(m.url) != '' + AND TRIM(m.url) != '[]' + ORDER BY f.file_path + LIMIT ? + """, + (limit or 10000,), + ) + return cursor.fetchall() + + def get_files_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> List[tuple]: + """Get files whose URL metadata contains a substring (case-insensitive). + + Returns (hash, file_path, size, ext) tuples. + """ + cursor = self.get_cursor() + cursor.execute( + """ + SELECT f.hash, f.file_path, + COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size, + COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext + FROM files f + JOIN metadata m ON f.hash = m.hash + WHERE m.url IS NOT NULL + AND LOWER(m.url) LIKE ? + ORDER BY f.file_path + LIMIT ? + """, + (like_pattern.lower(), limit or 10000), + ) + return cursor.fetchall() + def get_file_metadata(self, file_hashes: Set[str], limit: Optional[int] = None) -> List[tuple]: """Get metadata for files given their hashes. Returns (hash, file_path, size, extension) tuples.""" if not file_hashes: diff --git a/CLI.py b/CLI.py index fce4e7e..da7e441 100644 --- a/CLI.py +++ b/CLI.py @@ -1498,6 +1498,9 @@ def _execute_pipeline(tokens: list): elif table_type == 'soulseek': print(f"Auto-piping Soulseek selection to download-file") stages.append(['download-file']) + elif table_type == 'openlibrary': + print(f"Auto-piping OpenLibrary selection to download-file") + stages.append(['download-file']) elif source_cmd == 'search-file' and source_args and 'youtube' in source_args: # Legacy check print(f"Auto-piping YouTube selection to .pipe") @@ -1667,6 +1670,35 @@ def _execute_pipeline(tokens: list): filtered_pipe_objs = [coerce_to_pipe_object(item) for item in filtered] piped_result = filtered_pipe_objs if len(filtered_pipe_objs) > 1 else filtered_pipe_objs[0] print(f"Selected {len(filtered)} item(s) using {cmd_name}") + + # If selection is the last stage and looks like a provider result, + # auto-initiate the borrow/download flow. 
+ if stage_index + 1 >= len(stages): + try: + from ProviderCore.registry import get_search_provider as _get_search_provider + except Exception: + _get_search_provider = None + + if _get_search_provider is not None: + selected_list = filtered_pipe_objs + provider_table: Optional[str] = None + try: + for obj in selected_list: + extra = getattr(obj, "extra", None) + if isinstance(extra, dict) and extra.get("table"): + provider_table = str(extra.get("table")) + break + except Exception: + provider_table = None + + if provider_table: + try: + provider = _get_search_provider(provider_table, config) + except Exception: + provider = None + if provider is not None: + print("Auto-downloading selection via download-file") + stages.append(["download-file"]) continue else: print(f"No items matched selection {cmd_name}\n") @@ -1736,13 +1768,14 @@ def _execute_pipeline(tokens: list): } # Display-only commands (just show data, don't modify or search) display_only_commands = { - 'get-url', 'get_url', 'get-note', 'get_note', + 'get-note', 'get_note', 'get-relationship', 'get_relationship', 'get-file', 'get_file', 'check-file-status', 'check_file_status' } # Commands that manage their own table/history state (e.g. get-tag) self_managing_commands = { 'get-tag', 'get_tag', 'tags', + 'get-url', 'get_url', 'search-file', 'search_file' } diff --git a/Provider/openlibrary.py b/Provider/openlibrary.py index 38f842a..35519ff 100644 --- a/Provider/openlibrary.py +++ b/Provider/openlibrary.py @@ -1,19 +1,38 @@ from __future__ import annotations +import base64 +from concurrent import futures +import hashlib +import json as json_module +import re import shutil import sys import tempfile +import time from pathlib import Path from typing import Any, Dict, List, Optional, Tuple import requests +from API.HTTP import HTTPClient from ProviderCore.base import SearchProvider, SearchResult from ProviderCore.download import download_file, sanitize_filename from cli_syntax import get_field, get_free_text, parse_query from SYS.logger import log from SYS.utils import unique_path +try: + from Crypto.Cipher import AES # type: ignore + from Crypto.Util import Counter # type: ignore +except ImportError: + AES = None # type: ignore + Counter = None # type: ignore + +try: + from tqdm import tqdm # type: ignore +except ImportError: + tqdm = None # type: ignore + def _looks_like_isbn(text: str) -> bool: t = (text or "").replace("-", "").strip() @@ -38,6 +57,13 @@ def _resolve_edition_id(doc: Dict[str, Any]) -> str: edition_key = doc.get("edition_key") if isinstance(edition_key, list) and edition_key: return str(edition_key[0]).strip() + if isinstance(edition_key, str) and edition_key.strip(): + return edition_key.strip() + + # Often present even when edition_key is missing. + cover_edition_key = doc.get("cover_edition_key") + if isinstance(cover_edition_key, str) and cover_edition_key.strip(): + return cover_edition_key.strip() # Fallback: sometimes key can be /books/OL...M key = doc.get("key") @@ -54,7 +80,7 @@ def _check_lendable(session: requests.Session, edition_id: str) -> Tuple[bool, s return False, "not-an-edition" url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{edition_id}" - resp = session.get(url, timeout=10) + resp = session.get(url, timeout=6) resp.raise_for_status() data = resp.json() or {} wrapped = data.get(f"OLID:{edition_id}") @@ -88,7 +114,7 @@ def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidate # Otherwise query the edition JSON. 
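For reference, the edition JSON usually exposes the Archive.org item id as "ocaid", and sometimes under identifiers.internet_archive. A minimal sketch of that lookup (the OLID default is a placeholder):

import requests

def resolve_archive_id_sketch(edition_id: str = "OL12345678M") -> str:
    data = requests.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=6).json()
    ocaid = data.get("ocaid")
    if isinstance(ocaid, str) and ocaid.strip():
        return ocaid.strip()
    ia = (data.get("identifiers") or {}).get("internet_archive")
    if isinstance(ia, list) and ia:
        return str(ia[0])
    return str(ia) if isinstance(ia, str) and ia else ""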
try: - resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=10) + resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=6) resp.raise_for_status() data = resp.json() or {} @@ -116,6 +142,522 @@ class OpenLibrary(SearchProvider): super().__init__(config) self._session = requests.Session() + class BookNotAvailableError(Exception): + """Raised when a book is not available for borrowing (waitlisted/in use).""" + + @staticmethod + def _credential_archive(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]: + """Get Archive.org email/password from config. + + Supports: + - New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}} + - Old: {"Archive": {"email": "...", "password": "..."}} + {"archive_org_email": "...", "archive_org_password": "..."} + """ + if not isinstance(config, dict): + return None, None + + provider_config = config.get("provider", {}) + if isinstance(provider_config, dict): + openlibrary_config = provider_config.get("openlibrary", {}) + if isinstance(openlibrary_config, dict): + email = openlibrary_config.get("email") + password = openlibrary_config.get("password") + if email or password: + return str(email) if email is not None else None, str(password) if password is not None else None + + archive_config = config.get("Archive") + if isinstance(archive_config, dict): + email = archive_config.get("email") + password = archive_config.get("password") + if email or password: + return str(email) if email is not None else None, str(password) if password is not None else None + + email = config.get("archive_org_email") + password = config.get("archive_org_password") + return str(email) if email is not None else None, str(password) if password is not None else None + + @staticmethod + def _archive_error_body(response: requests.Response) -> str: + try: + body = response.text or "" + except Exception: + return "" + if len(body) > 2000: + return body[:1200] + "\n... 
(truncated) ...\n" + body[-400:] + return body + + @classmethod + def _archive_login(cls, email: str, password: str) -> requests.Session: + """Login to archive.org using the token-based services endpoint (matches test-login.py).""" + session = requests.Session() + + token_resp = session.get("https://archive.org/services/account/login/", timeout=30) + try: + token_json = token_resp.json() + except Exception as exc: + raise RuntimeError(f"Archive login token parse failed: {exc}\n{cls._archive_error_body(token_resp)}") + + if not token_json.get("success"): + raise RuntimeError(f"Archive login token fetch failed\n{cls._archive_error_body(token_resp)}") + + token = (token_json.get("value") or {}).get("token") + if not token: + raise RuntimeError("Archive login token missing") + + headers = {"Content-Type": "application/x-www-form-urlencoded"} + payload = {"username": email, "password": password, "t": token} + + login_resp = session.post( + "https://archive.org/services/account/login/", + headers=headers, + data=json_module.dumps(payload), + timeout=30, + ) + + try: + login_json = login_resp.json() + except Exception as exc: + raise RuntimeError(f"Archive login parse failed: {exc}\n{cls._archive_error_body(login_resp)}") + + if login_json.get("success") is False: + if login_json.get("value") == "bad_login": + raise RuntimeError("Invalid Archive.org credentials") + raise RuntimeError(f"Archive login failed: {login_json}") + + return session + + @classmethod + def _archive_loan(cls, session: requests.Session, book_id: str, *, verbose: bool = True) -> requests.Session: + data = {"action": "grant_access", "identifier": book_id} + session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30) + data["action"] = "browse_book" + response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30) + + if response.status_code == 400: + try: + err = (response.json() or {}).get("error") + if err == "This book is not available to borrow at this time. Please try again later.": + raise cls.BookNotAvailableError("Book is waitlisted or in use") + raise RuntimeError(f"Borrow failed: {err or response.text}") + except cls.BookNotAvailableError: + raise + except Exception: + raise RuntimeError("The book cannot be borrowed") + + data["action"] = "create_token" + response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30) + if "token" in (response.text or ""): + return session + raise RuntimeError("Something went wrong when trying to borrow the book") + + @staticmethod + def _archive_return_loan(session: requests.Session, book_id: str) -> None: + data = {"action": "return_loan", "identifier": book_id} + response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30) + if response.status_code == 200: + try: + if (response.json() or {}).get("success"): + return + except Exception: + pass + raise RuntimeError("Something went wrong when trying to return the book") + + @staticmethod + def _archive_get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]: + """Extract page links from Archive.org book reader.""" + r = session.get(url, timeout=30).text + + # Matches: "url":"//archive.org/..." 
(allow whitespace) + match = re.search(r'"url"\s*:\s*"([^"]+)"', r) + if not match: + raise RuntimeError("Failed to extract book info URL from response") + + url_path = match.group(1) + infos_url = ("https:" + url_path) if url_path.startswith("//") else url_path + infos_url = infos_url.replace("\\u0026", "&") + + response = session.get(infos_url, timeout=30) + payload = response.json() + data = payload["data"] + + title = str(data["brOptions"]["bookTitle"]).strip().replace(" ", "_") + title = "".join(c for c in title if c not in '<>:"/\\|?*') + title = title[:150] + + metadata = data.get("metadata") or {} + links: List[str] = [] + br_data = (data.get("brOptions") or {}).get("data", []) + if isinstance(br_data, list): + for item in br_data: + if isinstance(item, list): + for page in item: + if isinstance(page, dict) and "uri" in page: + links.append(page["uri"]) + elif isinstance(item, dict) and "uri" in item: + links.append(item["uri"]) + + if not links: + raise RuntimeError("No pages found in book data") + return title, links, metadata if isinstance(metadata, dict) else {} + + @staticmethod + def _archive_image_name(pages: int, page: int, directory: str) -> str: + return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg" + + @staticmethod + def _archive_deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes: + if not AES or not Counter: + raise RuntimeError("Crypto library not available") + + try: + version, counter_b64 = obf_header.split("|") + except Exception as exc: + raise ValueError("Invalid X-Obfuscate header format") from exc + + if version != "1": + raise ValueError("Unsupported obfuscation version: " + version) + + aes_key = re.sub(r"^https?:\/\/.*?\/", "/", link) + sha1_digest = hashlib.sha1(aes_key.encode("utf-8")).digest() + key = sha1_digest[:16] + + counter_bytes = base64.b64decode(counter_b64) + if len(counter_bytes) != 16: + raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}") + + prefix = counter_bytes[:8] + initial_value = int.from_bytes(counter_bytes[8:], byteorder="big") + ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore + cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore + + decrypted_part = cipher.decrypt(image_data[:1024]) + return decrypted_part + image_data[1024:] + + @classmethod + def _archive_download_one_image( + cls, + session: requests.Session, + link: str, + i: int, + directory: str, + book_id: str, + pages: int, + ) -> None: + headers = { + "Referer": "https://archive.org/", + "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", + "Sec-Fetch-Site": "same-site", + "Sec-Fetch-Mode": "no-cors", + "Sec-Fetch-Dest": "image", + } + + while True: + try: + response = session.get(link, headers=headers, timeout=30) + if response.status_code == 403: + cls._archive_loan(session, book_id, verbose=False) + raise RuntimeError("Borrow again") + if response.status_code == 200: + break + except Exception: + time.sleep(1) + + image = cls._archive_image_name(pages, i, directory) + obf_header = response.headers.get("X-Obfuscate") + if obf_header: + image_content = cls._archive_deobfuscate_image(response.content, link, obf_header) + else: + image_content = response.content + + with open(image, "wb") as f: + f.write(image_content) + + @classmethod + def _archive_download( + cls, + session: requests.Session, + n_threads: int, + directory: str, + links: List[str], + scale: int, + book_id: str, + ) -> List[str]: + links_scaled = 
[f"{link}&rotate=0&scale={scale}" for link in links] + pages = len(links_scaled) + + tasks = [] + with futures.ThreadPoolExecutor(max_workers=n_threads) as executor: + for i, link in enumerate(links_scaled): + tasks.append( + executor.submit( + cls._archive_download_one_image, + session=session, + link=link, + i=i, + directory=directory, + book_id=book_id, + pages=pages, + ) + ) + if tqdm: + for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore + pass + else: + for _ in futures.as_completed(tasks): + pass + + return [cls._archive_image_name(pages, i, directory) for i in range(pages)] + + @staticmethod + def _archive_check_direct_download(book_id: str) -> Tuple[bool, str]: + """Check for a directly downloadable original PDF in Archive.org metadata.""" + try: + metadata_url = f"https://archive.org/metadata/{book_id}" + response = requests.get(metadata_url, timeout=6) + response.raise_for_status() + metadata = response.json() + files = metadata.get("files") if isinstance(metadata, dict) else None + if isinstance(files, list): + for file_info in files: + if not isinstance(file_info, dict): + continue + filename = str(file_info.get("name", "")) + if filename.endswith(".pdf") and file_info.get("source") == "original": + pdf_url = f"https://archive.org/download/{book_id}/{filename.replace(' ', '%20')}" + check_response = requests.head(pdf_url, timeout=4, allow_redirects=True) + if check_response.status_code == 200: + return True, pdf_url + return False, "" + except Exception: + return False, "" + + @staticmethod + def scrape_isbn_metadata(isbn: str) -> List[str]: + """Scrape tags for an ISBN using Open Library API. + + Returns tags such as: + - title:<...>, author:<...>, publish_date:<...>, publisher:<...>, description:<...>, pages:<...> + - identifiers: openlibrary:<...>, lccn:<...>, oclc:<...>, goodreads:<...>, librarything:<...>, doi:<...>, internet_archive:<...> + """ + new_tags: List[str] = [] + + isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip() + if not isbn_clean: + return [] + + url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json" + try: + with HTTPClient() as client: + response = client.get(url) + response.raise_for_status() + data = json_module.loads(response.content.decode("utf-8")) + except Exception as exc: + log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr) + return [] + + if not data: + log(f"No ISBN metadata found for: {isbn}") + return [] + + book_data = next(iter(data.values()), None) + if not isinstance(book_data, dict): + return [] + + if "title" in book_data: + new_tags.append(f"title:{book_data['title']}") + + authors = book_data.get("authors") + if isinstance(authors, list): + for author in authors[:3]: + if isinstance(author, dict) and author.get("name"): + new_tags.append(f"author:{author['name']}") + + if book_data.get("publish_date"): + new_tags.append(f"publish_date:{book_data['publish_date']}") + + publishers = book_data.get("publishers") + if isinstance(publishers, list) and publishers: + pub = publishers[0] + if isinstance(pub, dict) and pub.get("name"): + new_tags.append(f"publisher:{pub['name']}") + + if "description" in book_data: + desc = book_data.get("description") + if isinstance(desc, dict) and "value" in desc: + desc = desc.get("value") + if desc: + desc_str = str(desc).strip() + if desc_str: + new_tags.append(f"description:{desc_str[:200]}") + + page_count = book_data.get("number_of_pages") + if isinstance(page_count, int) and page_count > 0: + 
new_tags.append(f"pages:{page_count}") + + identifiers = book_data.get("identifiers") + if isinstance(identifiers, dict): + + def _first(value: Any) -> Any: + if isinstance(value, list) and value: + return value[0] + return value + + for key, ns in ( + ("openlibrary", "openlibrary"), + ("lccn", "lccn"), + ("oclc", "oclc"), + ("goodreads", "goodreads"), + ("librarything", "librarything"), + ("doi", "doi"), + ("internet_archive", "internet_archive"), + ): + val = _first(identifiers.get(key)) + if val: + new_tags.append(f"{ns}:{val}") + + log(f"Found {len(new_tags)} tag(s) from ISBN lookup") + return new_tags + + @staticmethod + def scrape_openlibrary_metadata(olid: str) -> List[str]: + """Scrape tags for an OpenLibrary ID using the .json API endpoint.""" + new_tags: List[str] = [] + + olid_text = str(olid or "").strip() + if not olid_text: + return [] + + # Normalize OLID to the common "OLM" form when possible. + olid_norm = olid_text + try: + if not olid_norm.startswith("OL"): + olid_norm = f"OL{olid_norm}" + if not olid_norm.endswith("M"): + olid_norm = f"{olid_norm}M" + except Exception: + olid_norm = olid_text + + # Ensure we always include a scrapeable identifier tag. + new_tags.append(f"openlibrary:{olid_norm}") + + # Accept OL9674499M, 9674499M, or just digits. + olid_clean = olid_text.replace("OL", "").replace("M", "") + if not olid_clean.isdigit(): + olid_clean = olid_text + + if not olid_text.startswith("OL"): + url = f"https://openlibrary.org/books/OL{olid_clean}M.json" + else: + url = f"https://openlibrary.org/books/{olid_text}.json" + + try: + with HTTPClient() as client: + response = client.get(url) + response.raise_for_status() + data = json_module.loads(response.content.decode("utf-8")) + except Exception as exc: + log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr) + return [] + + if not isinstance(data, dict) or not data: + log(f"No OpenLibrary metadata found for: {olid_text}") + return [] + + if "title" in data: + new_tags.append(f"title:{data['title']}") + + authors = data.get("authors") + if isinstance(authors, list): + for author in authors[:3]: + if isinstance(author, dict) and author.get("name"): + new_tags.append(f"author:{author['name']}") + continue + + # Common OL shape: {"key": "/authors/OL...A"} or {"author": {"key": ...}} + author_key = None + if isinstance(author, dict): + if isinstance(author.get("author"), dict): + author_key = author.get("author", {}).get("key") + if not author_key: + author_key = author.get("key") + + if isinstance(author_key, str) and author_key.startswith("/"): + try: + author_url = f"https://openlibrary.org{author_key}.json" + with HTTPClient(timeout=10) as client: + author_resp = client.get(author_url) + author_resp.raise_for_status() + author_data = json_module.loads(author_resp.content.decode("utf-8")) + if isinstance(author_data, dict) and author_data.get("name"): + new_tags.append(f"author:{author_data['name']}") + continue + except Exception: + pass + + if isinstance(author, str) and author: + new_tags.append(f"author:{author}") + + if data.get("publish_date"): + new_tags.append(f"publish_date:{data['publish_date']}") + + publishers = data.get("publishers") + if isinstance(publishers, list) and publishers: + pub = publishers[0] + if isinstance(pub, dict) and pub.get("name"): + new_tags.append(f"publisher:{pub['name']}") + elif isinstance(pub, str) and pub: + new_tags.append(f"publisher:{pub}") + + if "description" in data: + desc = data.get("description") + if isinstance(desc, dict) and "value" in desc: + desc = 
desc.get("value") + if desc: + desc_str = str(desc).strip() + if desc_str: + new_tags.append(f"description:{desc_str[:200]}") + + page_count = data.get("number_of_pages") + if isinstance(page_count, int) and page_count > 0: + new_tags.append(f"pages:{page_count}") + + subjects = data.get("subjects") + if isinstance(subjects, list): + for subject in subjects[:10]: + if isinstance(subject, str): + subject_clean = subject.strip() + if subject_clean and subject_clean not in new_tags: + new_tags.append(subject_clean) + + identifiers = data.get("identifiers") + if isinstance(identifiers, dict): + + def _first(value: Any) -> Any: + if isinstance(value, list) and value: + return value[0] + return value + + for key, ns in ( + ("isbn_10", "isbn_10"), + ("isbn_13", "isbn_13"), + ("lccn", "lccn"), + ("oclc_numbers", "oclc"), + ("goodreads", "goodreads"), + ("internet_archive", "internet_archive"), + ): + val = _first(identifiers.get(key)) + if val: + new_tags.append(f"{ns}:{val}") + + # Some editions expose a direct Archive.org identifier as "ocaid". + ocaid = data.get("ocaid") + if isinstance(ocaid, str) and ocaid.strip(): + new_tags.append(f"internet_archive:{ocaid.strip()}") + + log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup") + return new_tags + def search( self, query: str, @@ -155,7 +697,70 @@ class OpenLibrary(SearchProvider): if not isinstance(docs, list): return [] - for doc in docs[: int(limit)]: + # Availability enrichment can be slow if done sequentially (it may require multiple + # network calls per row). Do it concurrently to keep the pipeline responsive. + docs = docs[: int(limit)] + + def _compute_availability(doc_dict: Dict[str, Any]) -> Tuple[str, str, str, str]: + edition_id_local = _resolve_edition_id(doc_dict) + if not edition_id_local: + return "no-olid", "", "", "" + + ia_val_local = doc_dict.get("ia") or [] + if isinstance(ia_val_local, str): + ia_val_local = [ia_val_local] + if not isinstance(ia_val_local, list): + ia_val_local = [] + ia_ids_local = [str(x) for x in ia_val_local if x] + + session_local = requests.Session() + + try: + archive_id_local = _resolve_archive_id(session_local, edition_id_local, ia_ids_local) + except Exception: + archive_id_local = "" + + if not archive_id_local: + return "no-archive", "", "", "" + + # Prefer the fastest signal first: OpenLibrary lendable status. + lendable_local, reason_local = _check_lendable(session_local, edition_id_local) + if lendable_local: + return "borrow", reason_local, archive_id_local, "" + + # Not lendable: check whether it's directly downloadable (public domain uploads, etc.). 
+ try: + can_direct, pdf_url = self._archive_check_direct_download(archive_id_local) + if can_direct and pdf_url: + return "download", reason_local, archive_id_local, str(pdf_url) + except Exception: + pass + + return "unavailable", reason_local, archive_id_local, "" + + availability_rows: List[Tuple[str, str, str, str]] = [("unknown", "", "", "") for _ in range(len(docs))] + if docs: + log(f"[openlibrary] Enriching availability for {len(docs)} result(s)...") + max_workers = min(8, max(1, len(docs))) + done = 0 + with futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_index = { + executor.submit(_compute_availability, doc_dict): i + for i, doc_dict in enumerate(docs) + if isinstance(doc_dict, dict) + } + for fut in futures.as_completed(list(future_to_index.keys())): + i = future_to_index[fut] + try: + availability_rows[i] = fut.result() + except Exception: + availability_rows[i] = ("unknown", "", "", "") + done += 1 + if done in {1, len(future_to_index)} or (done % 10 == 0): + log(f"[openlibrary] Availability: {done}/{len(future_to_index)}") + log("[openlibrary] Availability enrichment complete") + + for idx, doc in enumerate(docs): if not isinstance(doc, dict): continue @@ -172,6 +777,7 @@ class OpenLibrary(SearchProvider): year = str(year_val) if year_val is not None else "" edition_id = _resolve_edition_id(doc) + work_key = doc.get("key") if isinstance(doc.get("key"), str) else "" ia_val = doc.get("ia") or [] if isinstance(ia_val, str): @@ -193,9 +799,21 @@ class OpenLibrary(SearchProvider): ("Title", book_title), ("Author", ", ".join(authors_list)), ("Year", year), + ("Avail", ""), ("OLID", edition_id), ] + # Determine availability using the concurrently computed enrichment. + availability, availability_reason, archive_id, direct_url = ("unknown", "", "", "") + if 0 <= idx < len(availability_rows): + availability, availability_reason, archive_id, direct_url = availability_rows[idx] + + # Patch the display column. + for idx, (name, _val) in enumerate(columns): + if name == "Avail": + columns[idx] = ("Avail", availability) + break + annotations: List[str] = [] if isbn_13: annotations.append(f"isbn_13:{isbn_13}") @@ -203,12 +821,18 @@ class OpenLibrary(SearchProvider): annotations.append(f"isbn_10:{isbn_10}") if ia_ids: annotations.append("archive") + if availability in {"download", "borrow"}: + annotations.append(availability) results.append( SearchResult( table="openlibrary", title=book_title, - path=(f"https://openlibrary.org/books/{edition_id}" if edition_id else "https://openlibrary.org"), + path=( + f"https://openlibrary.org/books/{edition_id}" if edition_id else ( + f"https://openlibrary.org{work_key}" if isinstance(work_key, str) and work_key.startswith("/") else "https://openlibrary.org" + ) + ), detail=( (f"By: {', '.join(authors_list)}" if authors_list else "") + (f" ({year})" if year else "") @@ -218,11 +842,16 @@ class OpenLibrary(SearchProvider): columns=columns, full_metadata={ "openlibrary_id": edition_id, + "openlibrary_key": work_key, "authors": authors_list, "year": year, "isbn_10": isbn_10, "isbn_13": isbn_13, "ia": ia_ids, + "availability": availability, + "availability_reason": availability_reason, + "archive_id": archive_id, + "direct_url": direct_url, "raw": doc, }, ) @@ -256,9 +885,7 @@ class OpenLibrary(SearchProvider): # 1) Direct download if available. 
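The values surfaced in the new Avail column come from _compute_availability above; an illustrative legend (the mapping name is not part of the diff):

AVAILABILITY_LEGEND = {
    "download":    "an original PDF on Archive.org is directly downloadable",
    "borrow":      "the edition is lendable through the Archive.org borrow flow",
    "unavailable": "an Archive.org item exists but is neither lendable nor downloadable",
    "no-archive":  "no Archive.org identifier could be resolved for the edition",
    "no-olid":     "the search doc has no resolvable edition OLID",
    "unknown":     "availability enrichment failed or was skipped for this row",
}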
try: - from API.archive_client import check_direct_download - - can_direct, pdf_url = check_direct_download(archive_id) + can_direct, pdf_url = self._archive_check_direct_download(archive_id) except Exception: can_direct, pdf_url = False, "" @@ -272,10 +899,7 @@ class OpenLibrary(SearchProvider): # 2) Borrow flow (credentials required). try: - from API.archive_client import BookNotAvailableError, credential_openlibrary, download as archive_download - from API.archive_client import get_book_infos, loan, login - - email, password = credential_openlibrary(self.config or {}) + email, password = self._credential_archive(self.config or {}) if not email or not password: log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr) return None @@ -285,13 +909,13 @@ class OpenLibrary(SearchProvider): log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr) return None - session = login(email, password) + session = self._archive_login(email, password) try: - session = loan(session, archive_id, verbose=False) - except BookNotAvailableError: + session = self._archive_loan(session, archive_id, verbose=False) + except self.BookNotAvailableError: log("[openlibrary] Book not available to borrow", file=sys.stderr) return None - except SystemExit: + except Exception: log("[openlibrary] Borrow failed", file=sys.stderr) return None @@ -301,7 +925,7 @@ class OpenLibrary(SearchProvider): last_exc: Optional[Exception] = None for u in urls: try: - title_raw, links, _metadata = get_book_infos(session, u) + title_raw, links, _metadata = self._archive_get_book_infos(session, u) if title_raw: title = sanitize_filename(title_raw) break @@ -315,7 +939,7 @@ class OpenLibrary(SearchProvider): temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir)) try: - images = archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id) + images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id) try: import img2pdf # type: ignore diff --git a/SYS/download.py b/SYS/download.py index d827347..9ad8bb0 100644 --- a/SYS/download.py +++ b/SYS/download.py @@ -642,7 +642,7 @@ def _download_direct_file( return DownloadMediaResult( path=file_path, info=info, - tags=tags, + tag=tags, source_url=url, hash_value=hash_value, ) diff --git a/SYS/utils_constant.py b/SYS/utils_constant.py index f319f28..8f60fe1 100644 --- a/SYS/utils_constant.py +++ b/SYS/utils_constant.py @@ -36,6 +36,7 @@ mime_maps = { "mp3": { "ext": ".mp3", "mimes": ["audio/mpeg", "audio/mp3"] }, "m4a": { "ext": ".m4a", "mimes": ["audio/mp4", "audio/x-m4a"] }, "ogg": { "ext": ".ogg", "mimes": ["audio/ogg"] }, + "opus": { "ext": ".opus", "mimes": ["audio/opus"] }, "flac": { "ext": ".flac", "mimes": ["audio/flac"] }, "wav": { "ext": ".wav", "mimes": ["audio/wav", "audio/x-wav", "audio/vnd.wave"] }, "wma": { "ext": ".wma", "mimes": ["audio/x-ms-wma"] }, @@ -98,3 +99,13 @@ def get_type_from_ext(ext: str) -> str: return type_name return 'other' + + +# Canonical supported extension set for all stores/cmdlets. +# Derived from mime_maps so there is a single source of truth. 
+ALL_SUPPORTED_EXTENSIONS: set[str] = { + spec["ext"].lower() + for group in mime_maps.values() + for spec in group.values() + if isinstance(spec, dict) and isinstance(spec.get("ext"), str) and spec.get("ext") +} diff --git a/Store/Folder.py b/Store/Folder.py index 57df4b4..7667a01 100644 --- a/Store/Folder.py +++ b/Store/Folder.py @@ -30,6 +30,8 @@ def _resolve_file_hash(db_hash: Optional[str], file_path: Path) -> Optional[str] return _normalize_hash(file_path.stem) + + class Folder(Store): """""" # Track which locations have already been migrated to avoid repeated migrations @@ -359,6 +361,17 @@ class Folder(Store): else: shutil.copy2(str(file_path), str(save_file)) debug(f"Local copy: {save_file}", file=sys.stderr) + + # Best-effort: capture duration for media + duration_value: float | None = None + try: + from SYS.utils import ffprobe + probe = ffprobe(str(save_file)) + duration = probe.get("duration") + if isinstance(duration, (int, float)) and duration > 0: + duration_value = float(duration) + except Exception: + duration_value = None # Save to database with API_folder_store(Path(self._location)) as db: @@ -368,7 +381,8 @@ class Folder(Store): db.save_metadata(save_file, { 'hash': file_hash, 'ext': ext_clean, - 'size': file_path.stat().st_size + 'size': file_path.stat().st_size, + 'duration': duration_value, }) # Add tags if provided @@ -405,6 +419,21 @@ class Folder(Store): results = [] search_dir = Path(self._location).expanduser() + def _url_like_pattern(value: str) -> str: + # Interpret user patterns as substring matches (with optional glob wildcards). + v = (value or "").strip().lower() + if not v or v == "*": + return "%" + v = v.replace("%", "\\%").replace("_", "\\_") + v = v.replace("*", "%").replace("?", "_") + if "%" not in v and "_" not in v: + return f"%{v}%" + if not v.startswith("%"): + v = "%" + v + if not v.endswith("%"): + v = v + "%" + return v + tokens = [t.strip() for t in query.split(',') if t.strip()] if not match_all and len(tokens) == 1 and _normalize_hash(query): @@ -453,6 +482,8 @@ class Folder(Store): try: with DatabaseAPI(search_dir) as api: if tokens and len(tokens) > 1: + url_fetch_limit = (limit or 45) * 50 + def _like_pattern(term: str) -> str: return term.replace('*', '%').replace('?', '_') @@ -473,6 +504,11 @@ class Folder(Store): h = api.get_file_hash_by_hash(normalized_hash) return {h} if h else set() + if namespace == 'url': + if not pattern or pattern == '*': + return api.get_file_hashes_with_any_url(limit=url_fetch_limit) + return api.get_file_hashes_by_url_like(_url_like_pattern(pattern), limit=url_fetch_limit) + if namespace == 'store': if pattern not in {'local', 'file', 'filesystem'}: return set() @@ -562,6 +598,29 @@ class Folder(Store): if limit is not None and len(results) >= limit: return results return results + + if namespace == "url": + if not pattern or pattern == "*": + rows = api.get_files_with_any_url(limit) + else: + rows = api.get_files_by_url_like(_url_like_pattern(pattern), limit) + for file_hash, file_path_str, size_bytes, ext in rows: + if not file_path_str: + continue + file_path = Path(file_path_str) + if not file_path.exists(): + continue + if size_bytes is None: + try: + size_bytes = file_path.stat().st_size + except OSError: + size_bytes = None + tags = api.get_tags_for_file(file_hash) + entry = _create_entry(file_path, tags, size_bytes, file_hash) + results.append(entry) + if limit is not None and len(results) >= limit: + return results + return results query_pattern = f"{namespace}:%" rows = 
api.get_files_by_namespace_pattern(query_pattern, limit) @@ -592,126 +651,59 @@ class Folder(Store): if limit is not None and len(results) >= limit: return results elif not match_all: + # Strict tag-based search only (no filename/path searching). terms = [t.strip() for t in query_lower.replace(',', ' ').split() if t.strip()] if not terms: terms = [query_lower] - - debug(f"Performing filename/tag search for terms: {terms}") - + fetch_limit = (limit or 45) * 50 - - conditions = ["LOWER(f.file_path) LIKE ?" for _ in terms] - params = [f"%{t}%" for t in terms] - - rows = api.get_files_by_multiple_path_conditions(conditions, params, fetch_limit) - debug(f"Found {len(rows)} filename matches in DB (before whole-word filter)") - - word_regex = None - if len(terms) == 1: - term = terms[0] - has_wildcard = '*' in term or '?' in term - - if has_wildcard: - try: - from fnmatch import translate - word_regex = re.compile(translate(term), re.IGNORECASE) - except Exception: - word_regex = None - else: - try: - pattern = r'(?= limit: - return results + entry = hits.get(file_hash) + if entry: + entry["count"] += 1 + if size_bytes is not None: + entry["size"] = size_bytes + else: + hits[file_hash] = { + "path": file_path_str, + "size": size_bytes, + "hash": file_hash, + "count": 1, + } - if terms: - title_hits: dict[str, dict[str, Any]] = {} - for term in terms: - title_pattern = f"title:%{term}%" - title_rows = api.get_files_by_title_tag_pattern(title_pattern, fetch_limit) - for file_hash, file_path_str, size_bytes, ext in title_rows: - if not file_path_str: - continue - entry = title_hits.get(file_hash) - if entry: - entry["count"] += 1 - if size_bytes is not None: - entry["size"] = size_bytes - else: - title_hits[file_hash] = { - "path": file_path_str, - "size": size_bytes, - "hash": file_hash, - "count": 1, - } - - if title_hits: - required = len(terms) - for file_hash, info in title_hits.items(): - if info.get("count") != required: - continue - file_path_str = info.get("path") - if not file_path_str or file_path_str in seen_files: - continue - file_path = Path(file_path_str) - if not file_path.exists(): - continue - seen_files.add(file_path_str) - - size_bytes = info.get("size") - if size_bytes is None: - try: - size_bytes = file_path.stat().st_size - except OSError: - size_bytes = None - - tags = api.get_tags_for_file(file_hash) - entry = _create_entry(file_path, tags, size_bytes, info.get("hash")) - results.append(entry) - if limit is not None and len(results) >= limit: - return results - - query_pattern = f"%{query_lower}%" - tag_rows = api.get_files_by_simple_tag_pattern(query_pattern, limit) - - for file_hash, file_path_str, size_bytes, ext in tag_rows: + required = len(terms) + seen_files: set[str] = set() + for file_hash, info in hits.items(): + if info.get("count") != required: + continue + file_path_str = info.get("path") if not file_path_str or file_path_str in seen_files: continue - seen_files.add(file_path_str) - file_path = Path(file_path_str) - if file_path.exists(): - if size_bytes is None: + if not file_path.exists(): + continue + seen_files.add(file_path_str) + + size_bytes = info.get("size") + if size_bytes is None: + try: size_bytes = file_path.stat().st_size - - tags = api.get_tags_for_file(file_hash) - entry = _create_entry(file_path, tags, size_bytes, file_hash) - results.append(entry) - - if limit is not None and len(results) >= limit: - return results + except OSError: + size_bytes = None + + tags = api.get_tags_for_file(file_hash) + entry_obj = _create_entry(file_path, tags, 
size_bytes, info.get("hash")) + results.append(entry_obj) + if limit is not None and len(results) >= limit: + break else: rows = api.get_all_files(limit) @@ -726,10 +718,8 @@ class Folder(Store): entry = _create_entry(file_path, tags, size_bytes, file_hash) results.append(entry) - if results: - debug(f"Returning {len(results)} results from DB") - else: - debug("No results found in DB") + backend_label = str(getattr(self, "_name", "") or getattr(self, "NAME", "") or "folder") + debug(f"[folder:{backend_label}] {len(results)} result(s)") return results except Exception as e: @@ -938,9 +928,11 @@ class Folder(Store): file_hash = file_identifier if self._location: try: + from metadata import normalize_urls with API_folder_store(Path(self._location)) as db: meta = db.get_metadata(file_hash) or {} - return list(meta.get("url") or []) + urls = normalize_urls(meta.get("url")) + return urls except Exception as exc: debug(f"Local DB get_metadata failed: {exc}") return [] @@ -955,11 +947,13 @@ class Folder(Store): file_hash = file_identifier if self._location: try: + from metadata import normalize_urls with API_folder_store(Path(self._location)) as db: meta = db.get_metadata(file_hash) or {} - existing_urls = list(meta.get("url") or []) + existing_urls = normalize_urls(meta.get("url")) + incoming_urls = normalize_urls(url) changed = False - for u in list(url or []): + for u in list(incoming_urls or []): if not u: continue if u not in existing_urls: @@ -982,10 +976,11 @@ class Folder(Store): file_hash = file_identifier if self._location: try: + from metadata import normalize_urls with API_folder_store(Path(self._location)) as db: meta = db.get_metadata(file_hash) or {} - existing_urls = list(meta.get("url") or []) - remove_set = {u for u in (url or []) if u} + existing_urls = normalize_urls(meta.get("url")) + remove_set = {u for u in normalize_urls(url) if u} if not remove_set: return False new_urls = [u for u in existing_urls if u not in remove_set] diff --git a/Store/HydrusNetwork.py b/Store/HydrusNetwork.py index b9095cc..58ae0c7 100644 --- a/Store/HydrusNetwork.py +++ b/Store/HydrusNetwork.py @@ -264,6 +264,170 @@ class HydrusNetwork(Store): debug(f"Searching Hydrus for: {query}") + def _extract_urls(meta_obj: Any) -> list[str]: + if not isinstance(meta_obj, dict): + return [] + raw = meta_obj.get("url") + if raw is None: + raw = meta_obj.get("urls") + if isinstance(raw, str): + val = raw.strip() + return [val] if val else [] + if isinstance(raw, list): + out: list[str] = [] + for item in raw: + if not isinstance(item, str): + continue + s = item.strip() + if s: + out.append(s) + return out + return [] + + def _iter_url_filtered_metadata(url_value: str | None, want_any: bool, fetch_limit: int) -> list[dict[str, Any]]: + """Best-effort URL search by scanning Hydrus metadata with include_file_url=True.""" + + # First try a fast system predicate if Hydrus supports it. + candidate_file_ids: list[int] = [] + try: + if want_any: + predicate = "system:has url" + url_search = client.search_files( + tags=[predicate], + return_hashes=False, + return_file_ids=True, + return_file_count=False, + ) + ids = url_search.get("file_ids", []) if isinstance(url_search, dict) else [] + if isinstance(ids, list): + candidate_file_ids = [int(x) for x in ids if isinstance(x, (int, float, str)) and str(x).strip().isdigit()] + except Exception: + candidate_file_ids = [] + + if not candidate_file_ids: + # Fallback: scan from system:everything and filter by URL substring. 
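Before the fallback scan itself, here is a standalone sketch of the pattern it relies on: fetch candidate file ids in fixed-size chunks, keep only entries whose URLs match, and stop early once a cap is reached. fetch_chunk and the in-memory fake_db below are stand-ins introduced for illustration; they are not part of the patch or of the Hydrus client wrapper.

```python
from typing import Any, Callable

def scan_for_url(
    file_ids: list[int],
    fetch_chunk: Callable[[list[int]], list[dict[str, Any]]],
    needle: str,
    fetch_limit: int,
    chunk_size: int = 200,
) -> list[dict[str, Any]]:
    """Chunked metadata scan: keep entries whose URLs contain `needle` (empty needle = any URL)."""
    needle = needle.strip().lower()
    out: list[dict[str, Any]] = []
    for start in range(0, len(file_ids), chunk_size):
        if len(out) >= fetch_limit:
            break
        for meta in fetch_chunk(file_ids[start:start + chunk_size]):
            urls = [u for u in meta.get("url", []) if isinstance(u, str)]
            if not urls:
                continue
            if not needle or any(needle in u.lower() for u in urls):
                out.append(meta)
                if len(out) >= fetch_limit:
                    break
    return out

# Toy usage against an in-memory stand-in for the metadata store.
fake_db = {1: ["https://example.com/a"], 2: [], 3: ["https://example.org/b"]}

def fetch(ids: list[int]) -> list[dict[str, Any]]:
    return [{"file_id": i, "url": fake_db.get(i, [])} for i in ids]

print(scan_for_url([1, 2, 3], fetch, "example.org", fetch_limit=10))
# [{'file_id': 3, 'url': ['https://example.org/b']}]
```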
+ everything = client.search_files( + tags=["system:everything"], + return_hashes=False, + return_file_ids=True, + return_file_count=False, + ) + ids = everything.get("file_ids", []) if isinstance(everything, dict) else [] + if isinstance(ids, list): + candidate_file_ids = [int(x) for x in ids if isinstance(x, (int, float))] + + if not candidate_file_ids: + return [] + + needle = (url_value or "").strip().lower() + chunk_size = 200 + out: list[dict[str, Any]] = [] + + for start in range(0, len(candidate_file_ids), chunk_size): + if len(out) >= fetch_limit: + break + chunk = candidate_file_ids[start : start + chunk_size] + try: + payload = client.fetch_file_metadata( + file_ids=chunk, + include_file_url=True, + include_service_keys_to_tags=True, + include_duration=True, + include_size=True, + include_mime=True, + ) + except Exception: + continue + + metas = payload.get("metadata", []) if isinstance(payload, dict) else [] + if not isinstance(metas, list): + continue + + for meta in metas: + if not isinstance(meta, dict): + continue + urls = _extract_urls(meta) + if not urls: + continue + if want_any: + out.append(meta) + if len(out) >= fetch_limit: + break + continue + + if not needle: + continue + if any(needle in u.lower() for u in urls): + out.append(meta) + if len(out) >= fetch_limit: + break + + return out + + query_lower = query.lower().strip() + + # Special case: url:* and url: + metadata_list: list[dict[str, Any]] | None = None + if ":" in query_lower and not query_lower.startswith(":"): + namespace, pattern = query_lower.split(":", 1) + namespace = namespace.strip().lower() + pattern = pattern.strip() + if namespace == "url": + if not pattern or pattern == "*": + metadata_list = _iter_url_filtered_metadata(None, want_any=True, fetch_limit=int(limit) if limit else 100) + else: + # Fast-path: exact URL via /add_url/get_url_files when a full URL is provided. 
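For reference, the exact-URL fast path mentioned above maps onto the Hydrus Client API's GET /add_url/get_url_files endpoint. A plain-requests sketch is shown here; the default port, the access-key header, and the url_file_statuses response field are assumptions taken from the public Client API documentation (they may differ by version) rather than from this repository's wrapper.

```python
import requests

HYDRUS_API = "http://127.0.0.1:45869"   # assumption: default Client API address/port
ACCESS_KEY = "replace-with-your-key"    # assumption: a key with URL permissions

def hashes_for_exact_url(url: str) -> list[str]:
    """Ask Hydrus which files it already knows for this exact (normalised) URL."""
    response = requests.get(
        f"{HYDRUS_API}/add_url/get_url_files",
        params={"url": url},
        headers={"Hydrus-Client-API-Access-Key": ACCESS_KEY},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()
    # Recent Client API versions report per-file statuses for the normalised URL.
    statuses = payload.get("url_file_statuses") or []
    return [s["hash"] for s in statuses if isinstance(s, dict) and s.get("hash")]

# Example (not executed here):
# hashes = hashes_for_exact_url("https://example.com/post/123")
```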
+ try: + if pattern.startswith("http://") or pattern.startswith("https://"): + from API.HydrusNetwork import HydrusRequestSpec + + spec = HydrusRequestSpec(method="GET", endpoint="/add_url/get_url_files", query={"url": pattern}) + response = client._perform_request(spec) # type: ignore[attr-defined] + hashes: list[str] = [] + file_ids: list[int] = [] + if isinstance(response, dict): + raw_hashes = response.get("hashes") or response.get("file_hashes") + if isinstance(raw_hashes, list): + hashes = [str(h).strip() for h in raw_hashes if isinstance(h, str) and str(h).strip()] + raw_ids = response.get("file_ids") + if isinstance(raw_ids, list): + for item in raw_ids: + try: + file_ids.append(int(item)) + except (TypeError, ValueError): + continue + + if file_ids: + payload = client.fetch_file_metadata( + file_ids=file_ids, + include_file_url=True, + include_service_keys_to_tags=True, + include_duration=True, + include_size=True, + include_mime=True, + ) + metas = payload.get("metadata", []) if isinstance(payload, dict) else [] + if isinstance(metas, list): + metadata_list = [m for m in metas if isinstance(m, dict)] + elif hashes: + payload = client.fetch_file_metadata( + hashes=hashes, + include_file_url=True, + include_service_keys_to_tags=True, + include_duration=True, + include_size=True, + include_mime=True, + ) + metas = payload.get("metadata", []) if isinstance(payload, dict) else [] + if isinstance(metas, list): + metadata_list = [m for m in metas if isinstance(m, dict)] + except Exception: + metadata_list = None + + # Fallback: substring scan + if metadata_list is None: + metadata_list = _iter_url_filtered_metadata(pattern, want_any=False, fetch_limit=int(limit) if limit else 100) + # Parse the query into tags # Handle both simple tags and complex queries # "*" means "match all" - use system:everything tag in Hydrus @@ -271,7 +435,6 @@ class HydrusNetwork(Store): # Use system:everything to match all files in Hydrus tags = ["system:everything"] else: - query_lower = query.lower().strip() # If query doesn't have a namespace (no ':'), search all files and filter by title/tags # If query has explicit namespace, use it as a tag search if ':' not in query_lower: @@ -286,30 +449,36 @@ class HydrusNetwork(Store): debug(f"Found 0 result(s)") return [] - # Search files with the tags - search_result = client.search_files( - tags=tags, - return_hashes=True, - return_file_ids=True - ) - - # Extract file IDs from search result - file_ids = search_result.get("file_ids", []) - hashes = search_result.get("hashes", []) - - if not file_ids and not hashes: - debug(f"Found 0 result(s)") - return [] - - # Fetch metadata for the found files + # Search files with the tags (unless url: search already produced metadata) results = [] - query_lower = query.lower().strip() # Split by comma or space for AND logic search_terms = set(query_lower.replace(',', ' ').split()) # For substring matching - - if file_ids: - metadata = client.fetch_file_metadata(file_ids=file_ids) - metadata_list = metadata.get("metadata", []) + + if metadata_list is None: + search_result = client.search_files( + tags=tags, + return_hashes=True, + return_file_ids=True + ) + + file_ids = search_result.get("file_ids", []) if isinstance(search_result, dict) else [] + hashes = search_result.get("hashes", []) if isinstance(search_result, dict) else [] + + if not file_ids and not hashes: + debug(f"Found 0 result(s)") + return [] + + if file_ids: + metadata = client.fetch_file_metadata(file_ids=file_ids) + metadata_list = metadata.get("metadata", []) 
+ elif hashes: + metadata = client.fetch_file_metadata(hashes=hashes) + metadata_list = metadata.get("metadata", []) + else: + metadata_list = [] + + if not isinstance(metadata_list, list): + metadata_list = [] for meta in metadata_list: if len(results) >= limit: diff --git a/Store/registry.py b/Store/registry.py index 900b978..e377a99 100644 --- a/Store/registry.py +++ b/Store/registry.py @@ -119,6 +119,37 @@ class Store: self._backend_errors: Dict[str, str] = {} self._load_backends() + def _maybe_register_temp_alias(self, store_type: str, backend_name: str, kwargs: Dict[str, Any], backend: BaseStore) -> None: + """If a folder backend points at config['temp'], also expose it as the 'temp' backend. + + This keeps config compatibility (e.g. existing 'default') while presenting the temp + directory under a clearer name. + """ + try: + if _normalize_store_type(store_type) != "folder": + return + temp_value = self._config.get("temp") + if not temp_value: + return + path_value = kwargs.get("PATH") or kwargs.get("path") + if not path_value: + return + + temp_path = Path(str(temp_value)).expanduser().resolve() + backend_path = Path(str(path_value)).expanduser().resolve() + if backend_path != temp_path: + return + + # If the user already has a dedicated temp backend, do nothing. + if "temp" in self._backends: + return + + # Keep original name working, but add an alias. + if backend_name != "temp": + self._backends["temp"] = backend + except Exception: + return + def _load_backends(self) -> None: store_cfg = self._config.get("store") if not isinstance(store_cfg, dict): @@ -161,6 +192,9 @@ class Store: backend_name = str(kwargs.get("NAME") or instance_name) self._backends[backend_name] = backend + + # If this is the configured temp directory, also alias it as 'temp'. + self._maybe_register_temp_alias(store_type, backend_name, kwargs, backend) except Exception as exc: err_text = str(exc) self._backend_errors[str(instance_name)] = err_text @@ -177,11 +211,24 @@ class Store: return sorted(self._backends.keys()) def list_searchable_backends(self) -> list[str]: - searchable: list[str] = [] + # De-duplicate backends by instance (aliases can point at the same object). 
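A small, runnable illustration of the de-duplication idea used here: two registry names can point at the same backend object, and the searchable listing should keep only the preferred name. DummyBackend and the hard-coded rank table below are toy stand-ins, not the real Store classes.

```python
class DummyBackend:
    def search(self, query: str, limit: int = 10) -> list:
        return []

shared = DummyBackend()
# "default" and "temp" are aliases for the same instance; "hydrus" is separate.
backends = {"default": shared, "temp": shared, "hydrus": DummyBackend()}

def rank(name: str) -> int:
    # Lower rank wins: prefer "temp", then anything else, then "default".
    return {"temp": 0, "default": 2}.get(name.lower(), 1)

chosen: dict[int, str] = {}
for name, backend in backends.items():
    key = id(backend)               # collapse aliases by object identity
    prev = chosen.get(key)
    if prev is None or rank(name) < rank(prev):
        chosen[key] = name

print(sorted(chosen.values()))      # ['hydrus', 'temp'] - 'default' collapsed into 'temp'
```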
+ def _rank(name: str) -> int: + n = str(name or "").strip().lower() + if n == "temp": + return 0 + if n == "default": + return 2 + return 1 + + chosen: Dict[int, str] = {} for name, backend in self._backends.items(): - if type(backend).search is not BaseStore.search: - searchable.append(name) - return sorted(searchable) + if type(backend).search is BaseStore.search: + continue + key = id(backend) + prev = chosen.get(key) + if prev is None or _rank(name) < _rank(prev): + chosen[key] = name + return sorted(chosen.values()) def __getitem__(self, backend_name: str) -> BaseStore: if backend_name not in self._backends: diff --git a/cmdlet/_shared.py b/cmdlet/_shared.py index f633560..6abc0a5 100644 --- a/cmdlet/_shared.py +++ b/cmdlet/_shared.py @@ -5,10 +5,9 @@ from __future__ import annotations import json import sys -import inspect from collections.abc import Iterable as IterableABC -from SYS.logger import log, debug +from SYS.logger import log from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set from dataclasses import dataclass, field @@ -690,7 +689,9 @@ def get_field(obj: Any, field: str, default: Optional[Any] = None) -> Any: get_field(result, "table", "unknown") # With default """ # Handle lists by accessing the first element - if isinstance(obj, list) and obj: + if isinstance(obj, list): + if not obj: + return default obj = obj[0] if isinstance(obj, dict): @@ -702,8 +703,9 @@ def get_field(obj: Any, field: str, default: Optional[Any] = None) -> Any: return value # For PipeObjects, also check the extra field - if hasattr(obj, 'extra') and isinstance(obj.extra, dict): - return obj.extra.get(field, default) + extra_val = getattr(obj, 'extra', None) + if isinstance(extra_val, dict): + return extra_val.get(field, default) return default @@ -1118,7 +1120,7 @@ def create_pipe_object_result( Returns: Dict with all PipeObject fields for emission """ - result = { + result: Dict[str, Any] = { 'source': source, 'id': identifier, 'path': file_path, @@ -1546,14 +1548,11 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod extra = {k: v for k, v in value.items() if k not in known_keys} # Extract URL: prefer direct url field, then url list - url_val = value.get("url") - if not url_val: - url = value.get("url") or value.get("url") or [] - if url and isinstance(url, list) and len(url) > 0: - url_val = url[0] - # Preserve url in extra if multiple url exist - if url and len(url) > 1: - extra["url"] = url + from metadata import normalize_urls + url_list = normalize_urls(value.get("url")) + url_val = url_list[0] if url_list else None + if len(url_list) > 1: + extra["url"] = url_list # Extract relationships rels = value.get("relationships") or {} diff --git a/cmdlet/add_file.py b/cmdlet/add_file.py index be489f6..9aae609 100644 --- a/cmdlet/add_file.py +++ b/cmdlet/add_file.py @@ -1,14 +1,16 @@ from __future__ import annotations -from typing import Any, Dict, Optional, Sequence, Tuple, List, Union +from typing import Any, Dict, Optional, Sequence, Tuple, List from pathlib import Path import sys import shutil +import tempfile import models import pipeline as ctx from API import HydrusNetwork as hydrus_wrapper from SYS.logger import log, debug +from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS from Store import Store from ._shared import ( Cmdlet, CmdletArg, parse_cmdlet_args, SharedArgs, @@ -20,8 +22,8 @@ from API.folder import read_sidecar, find_sidecar, write_sidecar, API_folder_sto from SYS.utils import sha256_file, 
unique_path from metadata import write_metadata -# Use official Hydrus supported filetypes from hydrus_wrapper -SUPPORTED_MEDIA_EXTENSIONS = hydrus_wrapper.ALL_SUPPORTED_EXTENSIONS +# Canonical supported filetypes for all stores/cmdlets +SUPPORTED_MEDIA_EXTENSIONS = ALL_SUPPORTED_EXTENSIONS class Add_File(Cmdlet): """Add file into the DB""" @@ -53,93 +55,210 @@ class Add_File(Cmdlet): def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Main execution entry point.""" - # Parse arguments parsed = parse_cmdlet_args(args, self) - # Initialize state path_arg = parsed.get("path") - location = parsed.get("store") # Fixed: was "storage", should be "store" + location = parsed.get("store") provider_name = parsed.get("provider") delete_after = parsed.get("delete", False) - # Coerce result to PipeObject; if result is a list, prefer the first element - effective_result = result - if isinstance(result, list) and result: - first_item = result[0] - # Prefer first item if it's a dict or PipeObject - if isinstance(first_item, (dict, )): - effective_result = first_item - pipe_obj = coerce_to_pipe_object(effective_result, path_arg) + stage_ctx = ctx.get_stage_context() + is_last_stage = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False)) + + # Decide which items to process. + # - If user provided -path, treat this invocation as single-item. + # - Otherwise, if piped input is a list, ingest each item. + if path_arg: + items_to_process: List[Any] = [result] + elif isinstance(result, list) and result: + items_to_process = list(result) + else: + items_to_process = [result] - # Debug: Log input result details debug(f"[add-file] INPUT result type={type(result).__name__}") if isinstance(result, list): debug(f"[add-file] INPUT result is list with {len(result)} items") - if result and isinstance(result[0], dict): - first = result[0] - hash_val = first.get('hash') - hash_str = hash_val[:12] + "..." if hash_val else "N/A" - debug(f"[add-file] First item details: title={first.get('title')}, hash={hash_str}, store={first.get('store', 'N/A')}") - elif isinstance(result, dict): - hash_val = result.get('hash') - hash_str = hash_val[:12] + "..." 
if hash_val else "N/A" - debug(f"[add-file] INPUT result is dict: title={result.get('title')}, hash={hash_str}, store={result.get('store', 'N/A')}") - - # Debug: Log parsed arguments debug(f"[add-file] PARSED args: location={location}, provider={provider_name}, delete={delete_after}") - - # Resolve source - returns (media_path_or_url, file_hash) - media_path_or_url, file_hash = self._resolve_source(result, path_arg, pipe_obj, config) - debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...") - if not media_path_or_url: - debug(f"[add-file] ERROR: Could not resolve source file/URL") - return 1 - - # Update pipe_obj with resolved path - pipe_obj.path = str(media_path_or_url) if isinstance(media_path_or_url, (str, Path)) else str(media_path_or_url) - # Check if it's a URL before validating as file - if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(("http://", "https://", "magnet:", "torrent:")): - debug(f"Detected URL target, delegating to download-data: {media_path_or_url}") - return self._delegate_to_download_data(result, media_path_or_url, location, provider_name, args, config) + collected_payloads: List[Dict[str, Any]] = [] + successes = 0 + failures = 0 - # Convert to Path and validate - media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url - - # Validate source - if not self._validate_source(media_path): - debug(f"[add-file] ERROR: Source validation failed for {media_path}") - return 1 + # Only run the search-store refresh when add-file is the last stage. + # In the middle of a pipeline, downstream cmdlets should receive the emitted + # storage payload directly (no need to re-search and risk duplicate emits). + auto_search_store_after_add = bool(is_last_stage) and len(items_to_process) == 1 - # Debug: Log execution path decision - debug(f"[add-file] DECISION POINT: provider={provider_name}, location={location}") - debug(f" media_path={media_path}, exists={media_path.exists()}") + for item in items_to_process: + pipe_obj = coerce_to_pipe_object(item, path_arg) - # Execute transfer based on destination (using Store registry) - if provider_name: - debug(f"[add-file] ROUTE: file provider upload") - return self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after) - elif location: - # Check if location is a registered backend name + temp_dir_to_cleanup: Optional[Path] = None + delete_after_item = delete_after try: - store = Store(config) - backends = store.list_backends() - - if location in backends: - debug(f"[add-file] ROUTE: storage backend '{location}'") - return self._handle_storage_backend(media_path, location, pipe_obj, config, delete_after) - else: - # Treat as local export path - debug(f"[add-file] ROUTE: local export to path '{location}'") - return self._handle_local_export(media_path, location, pipe_obj, config, delete_after) - except Exception as exc: - debug(f"[add-file] ERROR: Failed to resolve location: {exc}") - log(f"Invalid location: {location}", file=sys.stderr) - return 1 - else: - debug(f"[add-file] ERROR: No location or provider specified") - log(f"No storage location or provider specified", file=sys.stderr) - return 1 + media_path_or_url, file_hash = self._resolve_source(item, path_arg, pipe_obj, config) + debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...") + if not media_path_or_url: + failures += 1 + continue + + # Update pipe_obj with resolved path + 
pipe_obj.path = str(media_path_or_url) + + # URL targets: prefer provider-aware download for OpenLibrary selections. + if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith( + ("http://", "https://", "magnet:", "torrent:") + ): + table = None + full_metadata = None + if isinstance(pipe_obj.extra, dict): + table = pipe_obj.extra.get("table") + full_metadata = pipe_obj.extra.get("full_metadata") + + is_openlibrary = (str(table or "").lower() == "openlibrary") or ("openlibrary.org/books/" in media_path_or_url.lower()) + if is_openlibrary: + # Enrich tags from OpenLibrary metadata so the stored file has book tags (author/pages/etc). + try: + from Provider.openlibrary import OpenLibrary as _OpenLibrary + + olid = None + archive_id = None + if isinstance(full_metadata, dict): + olid = full_metadata.get("openlibrary_id") or full_metadata.get("openlibrary") + archive_id = full_metadata.get("archive_id") + + if not olid: + import re + m = re.search(r"/books/(OL\d+M)", str(media_path_or_url), flags=re.IGNORECASE) + if m: + olid = m.group(1) + + scraped_tags: List[str] = [] + if olid: + scraped_tags.extend(_OpenLibrary.scrape_openlibrary_metadata(str(olid)) or []) + if archive_id: + scraped_tags.append(f"internet_archive:{archive_id}") + + if scraped_tags: + existing = list(pipe_obj.tag or []) + pipe_obj.tag = merge_sequences(existing, scraped_tags, case_sensitive=False) + except Exception: + pass + + from ProviderCore.registry import get_search_provider + from ProviderCore.base import SearchResult + + provider = get_search_provider("openlibrary", config) + if provider is None: + log("[add-file] OpenLibrary provider not available", file=sys.stderr) + failures += 1 + continue + + temp_dir_to_cleanup = Path(tempfile.mkdtemp(prefix="medios_openlibrary_")) + sr = SearchResult( + table="openlibrary", + title=str(getattr(pipe_obj, "title", None) or "Unknown"), + path=str(media_path_or_url), + full_metadata=full_metadata if isinstance(full_metadata, dict) else {}, + ) + downloaded = provider.download(sr, temp_dir_to_cleanup) + if downloaded is None: + log("[add-file] OpenLibrary download failed", file=sys.stderr) + failures += 1 + continue + + downloaded_path = Path(downloaded) + if downloaded_path.exists() and downloaded_path.is_dir(): + log( + "[add-file] OpenLibrary download produced a directory (missing img2pdf?). Cannot ingest.", + file=sys.stderr, + ) + failures += 1 + continue + + media_path_or_url = str(downloaded_path) + pipe_obj.path = str(downloaded_path) + delete_after_item = True + + # For non-provider URLs, or if still a URL after provider attempt, delegate to download-media. 
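For clarity, the OpenLibrary detection above boils down to a substring check plus an OLID regex. A self-contained sketch, using made-up URLs and mirroring the pattern used in the hunk:

```python
import re
from typing import Optional

def extract_olid(url: str) -> Optional[str]:
    """Return the OL...M edition id from an openlibrary.org/books/ URL, if present."""
    m = re.search(r"/books/(OL\d+M)", url, flags=re.IGNORECASE)
    return m.group(1) if m else None

def is_openlibrary(url: str, table: Optional[str] = None) -> bool:
    return (str(table or "").lower() == "openlibrary") or ("openlibrary.org/books/" in url.lower())

url = "https://openlibrary.org/books/OL7353617M/Fantastic_Mr_Fox"
assert is_openlibrary(url)
assert extract_olid(url) == "OL7353617M"
assert extract_olid("https://example.com/watch?v=xyz") is None
```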
+ if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith( + ("http://", "https://", "magnet:", "torrent:") + ): + code = self._delegate_to_download_data(item, media_path_or_url, location, provider_name, args, config) + if code == 0: + successes += 1 + else: + failures += 1 + continue + + media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url + + if not self._validate_source(media_path): + failures += 1 + continue + + if provider_name: + code = self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after_item) + if code == 0: + successes += 1 + else: + failures += 1 + continue + + if location: + try: + store = Store(config) + backends = store.list_backends() + if location in backends: + code = self._handle_storage_backend( + item, + media_path, + location, + pipe_obj, + config, + delete_after_item, + collect_payloads=collected_payloads, + suppress_last_stage_overlay=is_last_stage and len(items_to_process) > 1, + auto_search_store=auto_search_store_after_add, + ) + else: + code = self._handle_local_export(media_path, location, pipe_obj, config, delete_after_item) + except Exception as exc: + debug(f"[add-file] ERROR: Failed to resolve location: {exc}") + log(f"Invalid location: {location}", file=sys.stderr) + failures += 1 + continue + + if code == 0: + successes += 1 + else: + failures += 1 + continue + + log("No destination specified", file=sys.stderr) + failures += 1 + finally: + if temp_dir_to_cleanup is not None: + try: + shutil.rmtree(temp_dir_to_cleanup, ignore_errors=True) + except Exception: + pass + + # If we processed multiple storage ingests, present a single consolidated overlay table. + if is_last_stage and len(items_to_process) > 1 and collected_payloads: + try: + from result_table import ResultTable + + table = ResultTable("Result") + for payload in collected_payloads: + table.add_result(payload) + # Make this the active selectable table so @.. returns here (and playlist table is kept in history). + ctx.set_last_result_table(table, collected_payloads, subject=collected_payloads) + except Exception: + pass + + if successes > 0: + return 0 + return 1 @staticmethod def _resolve_source( @@ -149,10 +268,7 @@ class Add_File(Cmdlet): config: Dict[str, Any], ) -> Tuple[Optional[Path | str], Optional[str]]: """Resolve the source file path from args or pipeline result. - - PRIORITY: hash+store pattern is preferred over path-based resolution. - This ensures consistency when @N selections pass hash+store identifiers. - + Returns (media_path_or_url, file_hash) where media_path_or_url can be a Path object or a URL string. 
""" @@ -161,8 +277,9 @@ class Add_File(Cmdlet): result_hash = result.get("hash") result_store = result.get("store") if result_hash and result_store: - debug(f"[add-file] Using hash+store from result: hash={result_hash[:12]}..., store={result_store}") - # Use get_file to retrieve from the specific store + debug( + f"[add-file] Using hash+store from result: hash={str(result_hash)[:12]}..., store={result_store}" + ) try: store = Store(config) if result_store in store.list_backends(): @@ -170,16 +287,15 @@ class Add_File(Cmdlet): media_path = backend.get_file(result_hash) if isinstance(media_path, Path) and media_path.exists(): pipe_obj.path = str(media_path) - debug(f"[add-file] Retrieved file from {result_store}: {media_path}") - return media_path, result_hash - - if isinstance(media_path, str) and media_path.lower().startswith(("http://", "https://")): + return media_path, str(result_hash) + if isinstance(media_path, str) and media_path.lower().startswith( + ("http://", "https://", "magnet:", "torrent:") + ): pipe_obj.path = media_path - debug(f"[add-file] Retrieved URL from {result_store}: {media_path}") - return media_path, result_hash + return media_path, str(result_hash) except Exception as exc: debug(f"[add-file] Failed to retrieve via hash+store: {exc}") - + # PRIORITY 2: Try explicit path argument if path_arg: media_path = Path(path_arg) @@ -196,10 +312,9 @@ class Add_File(Cmdlet): file_hash = pipe_path_str.split(":", 1)[1] media_path, success = Add_File._fetch_hydrus_path(file_hash, config) return media_path, file_hash if success else None - # Check if pipe_path is a URL - skip to URL handling below - if not pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")): - media_path = Path(pipe_path_str) - return media_path, None + if pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")): + return pipe_path_str, None + return Path(pipe_path_str), None # PRIORITY 4: Try from pipe_obj.url (for streaming url without downloaded file) pipe_url = getattr(pipe_obj, "url", None) @@ -248,8 +363,9 @@ class Add_File(Cmdlet): # Look for path or path-like keys path_candidate = first_item.get("path") or first_item.get("filepath") or first_item.get("file") # If the dict includes a 'paths' list (multi-part/section download), prefer the first file - if not path_candidate and isinstance(first_item.get("paths"), (list, tuple)) and first_item.get("paths"): - path_candidate = first_item.get("paths")[0] + paths_val = first_item.get("paths") + if not path_candidate and isinstance(paths_val, (list, tuple)) and paths_val: + path_candidate = paths_val[0] if path_candidate: debug(f"Resolved path from result dict: {path_candidate}") try: @@ -361,10 +477,12 @@ class Add_File(Cmdlet): selection_args = result["_selection_args"] if selection_args: dl_args.extend(selection_args) - elif hasattr(result, 'extra') and isinstance(result.extra, dict) and "_selection_args" in result.extra: - selection_args = result.extra["_selection_args"] - if selection_args: - dl_args.extend(selection_args) + else: + extra_val = getattr(result, "extra", None) + if isinstance(extra_val, dict) and "_selection_args" in extra_val: + selection_args = extra_val["_selection_args"] + if selection_args: + dl_args.extend(selection_args) # download-media doesn't support -storage flag # It downloads to the configured directory, then add-file will handle storage @@ -375,18 +493,32 @@ class Add_File(Cmdlet): @staticmethod def _get_url(result: Any, pipe_obj: models.PipeObject) -> List[str]: - url: 
List[str] = [] - try: - if isinstance(pipe_obj.extra, dict): - url = list(pipe_obj.extra.get("url") or pipe_obj.extra.get("url") or []) - except Exception: - pass + from metadata import normalize_urls - if not url and isinstance(result, dict): - url = list(result.get("url") or result.get("url") or []) - if not url: - url = list(extract_url_from_result(result) or []) - return url + # Prefer explicit PipeObject.url if present + urls: List[str] = [] + try: + urls = normalize_urls(getattr(pipe_obj, "url", None)) + except Exception: + urls = [] + + # Then check extra.url + if not urls: + try: + if isinstance(pipe_obj.extra, dict): + urls = normalize_urls(pipe_obj.extra.get("url")) + except Exception: + pass + + # Then check result dict + if not urls and isinstance(result, dict): + urls = normalize_urls(result.get("url")) + + # Finally, try extractor helper + if not urls: + urls = normalize_urls(extract_url_from_result(result)) + + return urls @staticmethod def _get_relationships(result: Any, pipe_obj: models.PipeObject) -> Optional[Dict[str, Any]]: @@ -405,10 +537,36 @@ class Add_File(Cmdlet): @staticmethod def _get_duration(result: Any, pipe_obj: models.PipeObject) -> Optional[float]: - if getattr(pipe_obj, "duration", None) is not None: - return pipe_obj.duration + def _parse_duration(value: Any) -> Optional[float]: + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) if value > 0 else None + if isinstance(value, str): + s = value.strip() + if not s: + return None + try: + candidate = float(s) + return candidate if candidate > 0 else None + except ValueError: + pass + if ":" in s: + parts = [p.strip() for p in s.split(":") if p.strip()] + if len(parts) in {2, 3} and all(p.isdigit() for p in parts): + nums = [int(p) for p in parts] + if len(nums) == 2: + minutes, seconds = nums + return float(minutes * 60 + seconds) + hours, minutes, seconds = nums + return float(hours * 3600 + minutes * 60 + seconds) + return None + + parsed = _parse_duration(getattr(pipe_obj, "duration", None)) + if parsed is not None: + return parsed try: - return extract_duration(result) + return _parse_duration(extract_duration(result)) except Exception: return None @@ -442,19 +600,20 @@ class Add_File(Cmdlet): ctx.set_current_stage_table(None) @staticmethod - def _emit_storage_result(payload: Dict[str, Any]) -> None: + def _emit_storage_result(payload: Dict[str, Any], *, overlay: bool = True, emit: bool = True) -> None: """Emit a storage-style result payload. - Always emits the dict downstream (when in a pipeline). - If this is the last stage (or not in a pipeline), prints a search-store-like table and sets an overlay table/items for @N selection. """ - # Always emit for downstream commands (no-op if not in a pipeline) - ctx.emit(payload) + # Emit for downstream commands (no-op if not in a pipeline) + if emit: + ctx.emit(payload) stage_ctx = ctx.get_stage_context() is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False)) - if not is_last: + if not is_last or not overlay: return try: @@ -470,6 +629,53 @@ class Add_File(Cmdlet): except Exception: pass + @staticmethod + def _try_emit_search_store_by_hash(*, store: str, hash_value: str, config: Dict[str, Any]) -> bool: + """Run search-store for a single hash so the final table/payload is consistent. + + Important: `add-file` is treated as an action command by the CLI, so the CLI only + prints tables for it when a display overlay exists. 
After running search-store, + this copies the resulting table into the display overlay (when this is the last + stage) so the canonical store table is what the user sees and can select from. + + Returns True if search-store ran successfully, else False. + """ + try: + from cmdlet.search_store import CMDLET as search_store_cmdlet + + args = ["-store", str(store), f"hash:{str(hash_value)}"] + log(f"[add-file] Refresh: search-store -store {store} \"hash:{hash_value}\"", file=sys.stderr) + + # Run search-store under a temporary stage context so its ctx.emit() calls + # don't interfere with the outer add-file pipeline stage. + prev_ctx = ctx.get_stage_context() + temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, worker_id=getattr(prev_ctx, "worker_id", None)) + ctx.set_stage_context(temp_ctx) + try: + code = search_store_cmdlet.run(None, args, config) + finally: + ctx.set_stage_context(prev_ctx) + if code != 0: + return False + + # Promote the search-store result to a display overlay so the CLI prints it + # for action commands like add-file. + stage_ctx = ctx.get_stage_context() + is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False)) + if is_last: + try: + table = ctx.get_last_result_table() + items = ctx.get_last_result_items() + if table is not None and items: + ctx.set_last_result_table_overlay(table, items, subject={"store": store, "hash": hash_value}) + except Exception: + pass + + return True + except Exception as exc: + debug(f"[add-file] Failed to run search-store after add-file: {type(exc).__name__}: {exc}") + return False + @staticmethod def _prepare_metadata( result: Any, @@ -664,8 +870,9 @@ class Add_File(Cmdlet): if not username or not filename: debug(f"[add-file] ERROR: Could not extract soulseek metadata from result (type={type(result).__name__})") - if hasattr(result, "extra"): - debug(f"[add-file] Result extra keys: {list(result.extra.keys())}") + extra_val = getattr(result, "extra", None) + if isinstance(extra_val, dict): + debug(f"[add-file] Result extra keys: {list(extra_val.keys())}") return None if not username or not filename: @@ -769,28 +976,55 @@ class Add_File(Cmdlet): @staticmethod def _handle_storage_backend( + result: Any, media_path: Path, backend_name: str, pipe_obj: models.PipeObject, config: Dict[str, Any], delete_after: bool, + *, + collect_payloads: Optional[List[Dict[str, Any]]] = None, + suppress_last_stage_overlay: bool = False, + auto_search_store: bool = True, ) -> int: """Handle uploading to a registered storage backend (e.g., 'test' folder store, 'hydrus', etc.).""" log(f"Adding file to storage backend '{backend_name}': {media_path.name}", file=sys.stderr) + + delete_after_effective = bool(delete_after) + if not delete_after_effective: + # When download-media is piped into add-file, the downloaded artifact is a temp file. + # After it is persisted to a storage backend, delete the temp copy to avoid duplicates. 
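The auto-delete decision that follows hinges on one question: does the just-ingested file live under the configured temp directory? Below is a minimal sketch of that containment test with a fallback for interpreters older than Python 3.9, where Path.is_relative_to does not exist; the paths are made up.

```python
from pathlib import Path

def is_under(path: Path, root: Path) -> bool:
    """True if `path` resolves to somewhere inside `root`."""
    path = path.expanduser().resolve()
    root = root.expanduser().resolve()
    try:
        return path.is_relative_to(root)        # Python 3.9+
    except AttributeError:
        try:
            path.relative_to(root)              # older interpreters
            return True
        except ValueError:
            return False

temp_dir = Path("/tmp/medios")
print(is_under(Path("/tmp/medios/clip.mp4"), temp_dir))   # True
print(is_under(Path("/home/user/clip.mp4"), temp_dir))    # False
```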
+ try: + if ( + str(backend_name or "").strip().lower() != "temp" + and getattr(pipe_obj, "is_temp", False) + and getattr(pipe_obj, "action", None) == "cmdlet:download-media" + ): + from config import resolve_output_dir + temp_dir = resolve_output_dir(config) + try: + if media_path.resolve().is_relative_to(temp_dir.expanduser().resolve()): + delete_after_effective = True + debug(f"[add-file] Auto-delete temp source after ingest: {media_path}") + except Exception: + # If path resolution fails, fall back to non-destructive behavior + pass + except Exception: + pass try: store = Store(config) backend = store[backend_name] # Prepare metadata from pipe_obj and sidecars - tags, url, title, f_hash = Add_File._prepare_metadata(None, media_path, pipe_obj, config) + tags, url, title, f_hash = Add_File._prepare_metadata(result, media_path, pipe_obj, config) # Call backend's add_file with full metadata # Backend returns hash as identifier file_identifier = backend.add_file( media_path, title=title, - tags=tags, + tag=tags, url=url ) log(f"✓ File added to '{backend_name}': {file_identifier}", file=sys.stderr) @@ -822,6 +1056,14 @@ class Add_File(Cmdlet): # Keep hash/store for downstream commands (get-tag, get-file, etc.). resolved_hash = file_identifier if len(file_identifier) == 64 else (f_hash or file_identifier or "unknown") + # If we have url(s), ensure they get associated with the destination file. + # This mirrors `add-url` behavior but avoids emitting extra pipeline noise. + if url: + try: + backend.add_url(resolved_hash, list(url)) + except Exception: + pass + meta: Dict[str, Any] = {} try: meta = backend.get_metadata(resolved_hash) or {} @@ -865,9 +1107,30 @@ class Add_File(Cmdlet): "tag": list(tags or []), "url": list(url or []), } - Add_File._emit_storage_result(payload) + if collect_payloads is not None: + try: + collect_payloads.append(payload) + except Exception: + pass + + # Keep the add-file 1-row summary overlay (when last stage), then emit the + # canonical search-store payload/table for piping/selection consistency. + if auto_search_store and resolved_hash and resolved_hash != "unknown": + # Show the add-file summary (overlay only) but let search-store provide the downstream payload. + Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=False) + + ok = Add_File._try_emit_search_store_by_hash( + store=backend_name, + hash_value=resolved_hash, + config=config, + ) + if not ok: + # Fall back to emitting the add-file payload so downstream stages still receive an item. + ctx.emit(payload) + else: + Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=True) - Add_File._cleanup_after_success(media_path, delete_source=delete_after) + Add_File._cleanup_after_success(media_path, delete_source=delete_after_effective) return 0 except Exception as exc: diff --git a/cmdlet/add_url.py b/cmdlet/add_url.py index 2254d6e..afccabe 100644 --- a/cmdlet/add_url.py +++ b/cmdlet/add_url.py @@ -3,7 +3,6 @@ from __future__ import annotations from typing import Any, Dict, Sequence import sys -from . 
import register import pipeline as ctx from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash from SYS.logger import log @@ -12,19 +11,24 @@ from Store import Store class Add_Url(Cmdlet): """Add URL associations to files via hash+store.""" - - NAME = "add-url" - SUMMARY = "Associate a URL with a file" - USAGE = "@1 | add-url " - ARGS = [ - SharedArgs.HASH, - SharedArgs.STORE, - CmdletArg("url", required=True, description="URL to associate"), - ] - DETAIL = [ - "- Associates URL with file identified by hash+store", - "- Multiple url can be comma-separated", - ] + + def __init__(self) -> None: + super().__init__( + name="add-url", + summary="Associate a URL with a file", + usage="@1 | add-url ", + arg=[ + SharedArgs.HASH, + SharedArgs.STORE, + CmdletArg("url", required=True, description="URL to associate"), + ], + detail=[ + "- Associates URL with file identified by hash+store", + "- Multiple url can be comma-separated", + ], + exec=self.run, + ) + self.register() def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Add URL to file via hash+store backend.""" @@ -78,8 +82,7 @@ class Add_Url(Cmdlet): return 1 -# Register cmdlet -register(["add-url", "add_url"])(Add_Url) +CMDLET = Add_Url() diff --git a/cmdlet/delete_url.py b/cmdlet/delete_url.py index 479ac2b..41c1b22 100644 --- a/cmdlet/delete_url.py +++ b/cmdlet/delete_url.py @@ -3,7 +3,6 @@ from __future__ import annotations from typing import Any, Dict, Sequence import sys -from . import register import pipeline as ctx from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash from SYS.logger import log @@ -12,19 +11,24 @@ from Store import Store class Delete_Url(Cmdlet): """Delete URL associations from files via hash+store.""" - - NAME = "delete-url" - SUMMARY = "Remove a URL association from a file" - USAGE = "@1 | delete-url " - ARGS = [ - SharedArgs.HASH, - SharedArgs.STORE, - CmdletArg("url", required=True, description="URL to remove"), - ] - DETAIL = [ - "- Removes URL association from file identified by hash+store", - "- Multiple url can be comma-separated", - ] + + def __init__(self) -> None: + super().__init__( + name="delete-url", + summary="Remove a URL association from a file", + usage="@1 | delete-url ", + arg=[ + SharedArgs.HASH, + SharedArgs.STORE, + CmdletArg("url", required=True, description="URL to remove"), + ], + detail=[ + "- Removes URL association from file identified by hash+store", + "- Multiple url can be comma-separated", + ], + exec=self.run, + ) + self.register() def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Delete URL from file via hash+store backend.""" @@ -78,5 +82,4 @@ class Delete_Url(Cmdlet): return 1 -# Register cmdlet -register(["delete-url", "del-url", "delete_url"])(Delete_Url) +CMDLET = Delete_Url() diff --git a/cmdlet/download_file.py b/cmdlet/download_file.py index 1595be5..f0b09a3 100644 --- a/cmdlet/download_file.py +++ b/cmdlet/download_file.py @@ -190,9 +190,11 @@ class Download_File(Cmdlet): # If this looks like a provider item and providers are available, prefer provider.download() downloaded_path: Optional[Path] = None + attempted_provider_download = False if table and get_search_provider and SearchResult: provider = get_search_provider(str(table), config) if provider is not None: + attempted_provider_download = True sr = SearchResult( table=str(table), title=str(title or "Unknown"), @@ -202,6 +204,19 @@ class Download_File(Cmdlet): 
debug(f"[download-file] Downloading provider item via {table}: {sr.title}") downloaded_path = provider.download(sr, final_output_dir) + # OpenLibrary: if provider download failed, do NOT try to download the OpenLibrary page HTML. + if downloaded_path is None and attempted_provider_download and str(table or "").lower() == "openlibrary": + availability = None + reason = None + if isinstance(full_metadata, dict): + availability = full_metadata.get("availability") + reason = full_metadata.get("availability_reason") + msg = "[download-file] OpenLibrary item not downloadable" + if availability or reason: + msg += f" (availability={availability or ''} reason={reason or ''})" + log(msg, file=sys.stderr) + continue + # Fallback: if we have a direct HTTP URL, download it directly if downloaded_path is None and isinstance(target, str) and target.startswith("http"): debug(f"[download-file] Provider item looks like direct URL, downloading: {target}") diff --git a/cmdlet/download_media.py b/cmdlet/download_media.py index de95d33..7fa8ec9 100644 --- a/cmdlet/download_media.py +++ b/cmdlet/download_media.py @@ -693,6 +693,7 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> return # Extract relevant fields + webpage_url = info.get("webpage_url") or info.get("original_url") or info.get("url") result_container[0] = { "extractor": info.get("extractor", ""), "title": info.get("title", ""), @@ -700,7 +701,9 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> "duration": info.get("duration"), "uploader": info.get("uploader"), "description": info.get("description"), - "url": url, + # Keep both the requested and canonical URL forms; callers should prefer webpage_url. + "requested_url": url, + "webpage_url": webpage_url, } except Exception as exc: log(f"Probe error for {url}: {exc}") @@ -1220,9 +1223,359 @@ class Download_Media(Cmdlet): log(f"Invalid clip format: {clip_spec}", file=sys.stderr) return 1 + quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False + + storage = None + hydrus_available = True + try: + from Store import Store + storage = Store(config=config or {}, suppress_debug=True) + from API.HydrusNetwork import is_hydrus_available + hydrus_available = bool(is_hydrus_available(config or {})) + except Exception: + storage = None + + def _preflight_url_duplicate(candidate_url: str, extra_urls: Optional[Sequence[str]] = None) -> bool: + # NOTE: download-media sets _quiet_background_output=True when running in a pipeline to + # reduce background noise. URL de-dup is interactive and must still run in pipelines. 
+ if storage is None: + debug("Preflight URL check skipped: storage unavailable") + return True + + debug(f"Preflight URL check: candidate={candidate_url}") + + try: + from metadata import normalize_urls + except Exception: + normalize_urls = None # type: ignore[assignment] + + needles: List[str] = [] + if normalize_urls is not None: + for raw in [candidate_url, *(list(extra_urls) if extra_urls else [])]: + try: + needles.extend(normalize_urls(raw)) + except Exception: + continue + # Fallback: always have at least one needle + if not needles: + needles = [str(candidate_url)] + + # Deduplicate needles (preserve order) + seen_needles: List[str] = [] + for needle in needles: + if needle and needle not in seen_needles: + seen_needles.append(needle) + needles = seen_needles + + try: + debug(f"Preflight URL needles: {needles}") + except Exception: + pass + + url_matches: List[Dict[str, Any]] = [] + try: + from Store.HydrusNetwork import HydrusNetwork + + # Avoid searching the temp/download directory backend during dedup. + # We only want to warn about duplicates in real stores. + backend_names_all = storage.list_searchable_backends() + backend_names: List[str] = [] + skipped: List[str] = [] + for backend_name in backend_names_all: + try: + backend = storage[backend_name] + except Exception: + continue + + try: + if str(backend_name).strip().lower() == "temp": + skipped.append(backend_name) + continue + except Exception: + pass + + # Heuristic: if a Folder backend points at the configured temp output dir, skip it. + try: + backend_location = getattr(backend, "_location", None) + if backend_location and final_output_dir: + backend_path = Path(str(backend_location)).expanduser().resolve() + temp_path = Path(str(final_output_dir)).expanduser().resolve() + if backend_path == temp_path: + skipped.append(backend_name) + continue + except Exception: + pass + + backend_names.append(backend_name) + + try: + if skipped: + debug(f"Preflight backends: {backend_names} (skipped temp: {skipped})") + else: + debug(f"Preflight backends: {backend_names}") + except Exception: + pass + + for backend_name in backend_names: + backend = storage[backend_name] + if isinstance(backend, HydrusNetwork) and not hydrus_available: + continue + + backend_hits: List[Dict[str, Any]] = [] + for needle in needles: + try: + backend_hits = backend.search(f"url:{needle}", limit=25) or [] + if backend_hits: + break + except Exception: + continue + if backend_hits: + url_matches.extend([dict(x) if isinstance(x, dict) else {"title": str(x)} for x in backend_hits]) + + if len(url_matches) >= 25: + url_matches = url_matches[:25] + break + except Exception: + url_matches = [] + + if not url_matches: + debug("Preflight URL check: no matches") + return True + + table = ResultTable(f"URL already exists ({len(url_matches)} match(es))") + results_list: List[Dict[str, Any]] = [] + for item in url_matches: + if "title" not in item: + item["title"] = item.get("name") or item.get("target") or item.get("path") or "Result" + table.add_result(item) + results_list.append(item) + + pipeline_context.set_current_stage_table(table) + pipeline_context.set_last_result_table(table, results_list) + + print(f"\n{table}") + response = input("Continue anyway? (y/n): ").strip().lower() + if response not in {"y", "yes"}: + return False + return True + + def _canonicalize_url_for_storage(requested_url: str) -> str: + # Prefer yt-dlp's canonical webpage URL (e.g. strips timestamps/redirects). + # Fall back to the requested URL if probing fails. 
+ # Important: when playlist item selection is used, avoid probing (can hang on large playlists). + if playlist_items: + return str(requested_url) + try: + pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15) + if isinstance(pr, dict): + for key in ("webpage_url", "original_url", "url", "requested_url"): + value = pr.get(key) + if isinstance(value, str) and value.strip(): + return value.strip() + except Exception: + pass + return str(requested_url) + # Check if we need to show format selection playlist_items = str(parsed.get("item")) if parsed.get("item") else None ytdl_format = parsed.get("format") + playlist_selection_handled = False + + def _parse_at_selection(choice: str, *, max_index: int) -> Optional[List[int]]: + """Parse @ selection syntax (@2, @2-5, @{1,3,5}, @2,5,7) into 1-based indices.""" + raw = str(choice or "").strip() + if not raw: + return None + + if raw.lower() in {"q", "quit", "cancel"}: + return None + + if raw == "@*" or raw == "*": + return list(range(1, max_index + 1)) + + if raw.startswith("@"): + raw = raw[1:].strip() + + if raw.startswith("{") and raw.endswith("}"): + raw = raw[1:-1].strip() + + if not raw: + return None + + indices: set[int] = set() + for part in raw.split(","): + part = part.strip() + if not part: + continue + if "-" in part: + left, right = [p.strip() for p in part.split("-", 1)] + if not left or not right: + return None + try: + start = int(left) + end = int(right) + except ValueError: + return None + if start < 1 or end < 1: + return None + if end < start: + start, end = end, start + for i in range(start, end + 1): + if 1 <= i <= max_index: + indices.add(i) + else: + try: + i = int(part) + except ValueError: + return None + if 1 <= i <= max_index: + indices.add(i) + if not indices: + return None + return sorted(indices) + + def _maybe_prompt_playlist_items(url: str) -> Optional[Dict[str, Any]]: + """If URL appears to be a playlist/channel/collection, prompt user for @ selection. 
+ + Returns: + - None if URL is not a playlist-like multi-entry page (or probe fails) + - Dict with keys: + - cancel: bool + - playlist_items: Optional[str] (None means download all) + - selected_urls: Optional[List[str]] (expanded per-entry urls when available) + """ + try: + pr = probe_url(url, no_playlist=False, timeout_seconds=15) + except Exception: + pr = None + if not isinstance(pr, dict): + return None + entries = pr.get("entries") + if not isinstance(entries, list) or len(entries) <= 1: + return None + + # Display table (limit rows to keep output reasonable) + max_rows = 200 + display_entries = entries[:max_rows] + total = len(entries) + + def _entry_to_url(entry: Any) -> Optional[str]: + if not isinstance(entry, dict): + return None + # Prefer explicit absolute URLs when present + for key in ("webpage_url", "original_url", "url"): + v = entry.get(key) + if isinstance(v, str) and v.strip(): + s = v.strip() + try: + if urlparse(s).scheme in {"http", "https"}: + return s + except Exception: + return s + + # Best-effort YouTube fallback from id + entry_id = entry.get("id") + if isinstance(entry_id, str) and entry_id.strip(): + extractor_name = str(pr.get("extractor") or pr.get("extractor_key") or "").lower() + if "youtube" in extractor_name: + return f"https://www.youtube.com/watch?v={entry_id.strip()}" + return None + + table = ResultTable() + table.title = f"Playlist items ({total}{' shown ' + str(len(display_entries)) if total > max_rows else ''})" + table.set_source_command("download-media", [url]) + try: + table.set_preserve_order(True) + except Exception: + pass + + results_list: List[Dict[str, Any]] = [] + for idx, entry in enumerate(display_entries, 1): + title = None + uploader = None + duration = None + try: + if isinstance(entry, dict): + title = entry.get("title") + uploader = entry.get("uploader") or pr.get("uploader") + duration = entry.get("duration") + except Exception: + pass + + row: Dict[str, Any] = { + "table": "download-media", + "title": str(title or f"Item {idx}"), + "detail": str(uploader or ""), + "media_kind": "playlist-item", + "playlist_index": idx, + "columns": [ + ("#", str(idx)), + ("Title", str(title or "")), + ("Duration", str(duration or "")), + ("Uploader", str(uploader or "")), + ], + } + results_list.append(row) + table.add_result(row) + + pipeline_context.set_current_stage_table(table) + pipeline_context.set_last_result_table(table, results_list) + + print(f"\n{table}") + choice = input("Select items to download (@N, @2-5, @{1,3}, @*, or 'q' to cancel): ").strip() + if not choice or choice.lower() in {"q", "quit", "cancel"}: + return {"cancel": True, "playlist_items": None, "selected_urls": []} + if choice.strip() == "@*" or choice.strip() == "*": + # @* means all entries, not just displayed rows. + selected_urls: List[str] = [] + for entry in entries: + u = _entry_to_url(entry) + if u and u not in selected_urls: + selected_urls.append(u) + # Only expand when we can derive URLs for all entries; otherwise fall back to yt-dlp playlist handling. + if len(selected_urls) == len(entries): + return {"cancel": False, "playlist_items": None, "selected_urls": selected_urls} + return {"cancel": False, "playlist_items": None, "selected_urls": []} + + parsed_indices = _parse_at_selection(choice, max_index=len(display_entries)) + if not parsed_indices: + log("Invalid selection. 
Use @N, @2-5, @{1,3}, or @*", file=sys.stderr) + return {"cancel": True, "playlist_items": None, "selected_urls": []} + + selected_urls: List[str] = [] + for i in parsed_indices: + try: + entry = display_entries[i - 1] + except Exception: + continue + u = _entry_to_url(entry) + if u and u not in selected_urls: + selected_urls.append(u) + + # If we can expand per-entry URLs, return them. + if selected_urls and len(selected_urls) == len(parsed_indices): + return {"cancel": False, "playlist_items": None, "selected_urls": selected_urls} + + # yt-dlp accepts comma-separated 1-based indices for playlist_items + return {"cancel": False, "playlist_items": ",".join(str(i) for i in parsed_indices), "selected_urls": []} + + # Playlist/multi-entry detection: if the URL has multiple items and the user didn't + # specify -item, prompt for @ selection (supports @* for all). + if len(supported_url) == 1 and not playlist_items and not ytdl_format: + candidate_url = supported_url[0] + selection_info = _maybe_prompt_playlist_items(candidate_url) + if selection_info is not None: + playlist_selection_handled = True + if bool(selection_info.get("cancel")): + return 0 + selected_urls = selection_info.get("selected_urls") + if isinstance(selected_urls, list) and selected_urls: + # Expand playlist/channel URL into per-entry URLs so that de-dup preflight + # and downloads operate per file. + supported_url = selected_urls + playlist_items = None + else: + playlist_items = selection_info.get("playlist_items") # If no -item, no explicit -format specified, and single URL, show the format table. # Do NOT stop to show formats when -audio is used (auto-pick) or when -clip is used. @@ -1232,8 +1585,15 @@ class Download_Media(Cmdlet): and not playlist_items and not ytdl_format and len(supported_url) == 1 + and not playlist_selection_handled ): url = supported_url[0] + + canonical_url = _canonicalize_url_for_storage(url) + if not _preflight_url_duplicate(canonical_url, extra_urls=[url]): + log(f"Skipping download: {url}", file=sys.stderr) + return 0 + formats = list_formats(url, no_playlist=False) if formats and len(formats) > 1: @@ -1379,12 +1739,18 @@ class Download_Media(Cmdlet): # Download each URL downloaded_count = 0 clip_sections_spec = self._build_clip_sections_spec(clip_range) - quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False for url in supported_url: try: debug(f"Processing: {url}") + canonical_url = _canonicalize_url_for_storage(url) + + # Preflight: warn if URL already exists in storage backends. + if not _preflight_url_duplicate(canonical_url, extra_urls=[url]): + log(f"Skipping download: {url}", file=sys.stderr) + continue + # If playlist_items is specified but looks like a format ID (e.g. from table selection), # treat it as a format selector instead of playlist items. # This handles the case where @N selection passes -item @@ -1532,24 +1898,17 @@ class Download_Media(Cmdlet): if title and f"title:{title}" not in tag: tag.insert(0, f"title:{title}") - # Build a single canonical URL field; prefer yt-dlp provided webpage_url or info.url, - # but fall back to the original requested URL. If multiple unique urls are available, - # join them into a comma-separated string. - urls_to_consider: List[str] = [] + # Store the canonical URL for de-dup/search purposes. + # Prefer yt-dlp's webpage_url, and do not mix in the raw requested URL (which may contain timestamps). 
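# --- Illustrative aside (not part of the diff) --------------------------------
# A minimal standalone sketch of the @ selection grammar accepted by the nested
# _parse_at_selection helper defined earlier in this hunk (@N, @2-5, @{1,3,5},
# @*). The name parse_at_selection_sketch is hypothetical; error handling and
# the q/quit escape hatch of the real helper are omitted.
def parse_at_selection_sketch(choice: str, max_index: int) -> list[int]:
    raw = choice.strip().lstrip("@").strip("{}")
    if raw == "*":
        return list(range(1, max_index + 1))
    picked: set[int] = set()
    for part in raw.split(","):
        part = part.strip()
        if "-" in part:
            lo, hi = sorted(int(p) for p in part.split("-", 1))
            picked.update(i for i in range(lo, hi + 1) if 1 <= i <= max_index)
        elif part:
            i = int(part)
            if 1 <= i <= max_index:
                picked.add(i)
    return sorted(picked)

assert parse_at_selection_sketch("@2-5", max_index=10) == [2, 3, 4, 5]
assert parse_at_selection_sketch("@{1,3,5}", max_index=4) == [1, 3]  # 5 is out of range
assert parse_at_selection_sketch("@*", max_index=3) == [1, 2, 3]
# ------------------------------------------------------------------------------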
+ final_url = None try: - page_url = info.get("webpage_url") or info.get("url") + page_url = info.get("webpage_url") or info.get("original_url") or info.get("url") if page_url: - urls_to_consider.append(str(page_url)) + final_url = str(page_url) except Exception: - pass - if url: - urls_to_consider.append(str(url)) - - seen_urls: List[str] = [] - for u in urls_to_consider: - if u and u not in seen_urls: - seen_urls.append(u) - final_url = ",".join(seen_urls) if seen_urls else None + final_url = None + if not final_url and url: + final_url = str(url) # Construct canonical PipeObject dict: hash, store, path, url, title, tags # Prefer explicit backend names (storage_name/storage_location). If none, default to PATH @@ -1561,6 +1920,7 @@ class Download_Media(Cmdlet): "url": final_url, "tag": tag, "action": "cmdlet:download-media", + "is_temp": True, # download_mode removed (deprecated), keep media_kind "store": getattr(opts, "storage_name", None) or getattr(opts, "storage_location", None) or "PATH", "media_kind": "video" if opts.mode == "video" else "audio", diff --git a/cmdlet/get_metadata.py b/cmdlet/get_metadata.py index c550eeb..0681c1e 100644 --- a/cmdlet/get_metadata.py +++ b/cmdlet/get_metadata.py @@ -184,6 +184,32 @@ class Get_Metadata(Cmdlet): mime_type = metadata.get("mime") or metadata.get("ext", "") file_size = metadata.get("size") duration_seconds = metadata.get("duration") + if duration_seconds is None: + duration_seconds = metadata.get("duration_seconds") + if duration_seconds is None: + duration_seconds = metadata.get("length") + if duration_seconds is None and isinstance(metadata.get("duration_ms"), (int, float)): + try: + duration_seconds = float(metadata["duration_ms"]) / 1000.0 + except Exception: + duration_seconds = None + + if isinstance(duration_seconds, str): + s = duration_seconds.strip() + if s: + try: + duration_seconds = float(s) + except ValueError: + if ":" in s: + parts = [p.strip() for p in s.split(":") if p.strip()] + if len(parts) in {2, 3} and all(p.isdigit() for p in parts): + nums = [int(p) for p in parts] + if len(nums) == 2: + duration_seconds = float(nums[0] * 60 + nums[1]) + else: + duration_seconds = float(nums[0] * 3600 + nums[1] * 60 + nums[2]) + else: + duration_seconds = None pages = metadata.get("pages") url = metadata.get("url") or [] imported_ts = self._extract_imported_ts(metadata) diff --git a/cmdlet/get_tag.py b/cmdlet/get_tag.py index e50b7a1..c3cd013 100644 --- a/cmdlet/get_tag.py +++ b/cmdlet/get_tag.py @@ -12,7 +12,13 @@ from __future__ import annotations import sys -from SYS.logger import log, debug +try: + from Provider.openlibrary import OpenLibrary + _ol_scrape_isbn_metadata = OpenLibrary.scrape_isbn_metadata + _ol_scrape_openlibrary_metadata = OpenLibrary.scrape_openlibrary_metadata +except Exception: + _ol_scrape_isbn_metadata = None # type: ignore[assignment] + _ol_scrape_openlibrary_metadata = None # type: ignore[assignment] from Provider.metadata_provider import get_metadata_provider, list_metadata_providers import subprocess from pathlib import Path @@ -31,6 +37,10 @@ except ImportError: extract_title = None +_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment] +_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment] + + @@ -691,249 +701,22 @@ def _extract_url_formats(formats: list) -> List[Tuple[str, str]]: def _scrape_isbn_metadata(isbn: str) -> List[str]: - """Scrape metadata for an ISBN using Open Library API.""" - new_tags = [] + if _ol_scrape_isbn_metadata is None: + 
log("OpenLibrary scraper unavailable", file=sys.stderr) + return [] try: - from ..API.HTTP import HTTPClient - import json as json_module - - isbn_clean = isbn.replace('-', '').strip() - url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json" - - try: - with HTTPClient() as client: - response = client.get(url) - response.raise_for_status() - data = json_module.loads(response.content.decode('utf-8')) - except Exception as e: - log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr) - return [] - - if not data: - log(f"No ISBN metadata found for: {isbn}") - return [] - - book_data = next(iter(data.values()), None) - if not book_data: - return [] - - if 'title' in book_data: - new_tags.append(f"title:{book_data['title']}") - - if 'authors' in book_data and isinstance(book_data['authors'], list): - for author in book_data['authors'][:3]: - if 'name' in author: - new_tags.append(f"author:{author['name']}") - - if 'publish_date' in book_data: - new_tags.append(f"publish_date:{book_data['publish_date']}") - - if 'publishers' in book_data and isinstance(book_data['publishers'], list): - for pub in book_data['publishers'][:1]: - if 'name' in pub: - new_tags.append(f"publisher:{pub['name']}") - - if 'description' in book_data: - desc = book_data['description'] - if isinstance(desc, dict) and 'value' in desc: - desc = desc['value'] - if desc: - desc_str = str(desc).strip() - # Include description if available (limit to 200 chars to keep it manageable) - if len(desc_str) > 0: - new_tags.append(f"description:{desc_str[:200]}") - - if 'number_of_pages' in book_data: - page_count = book_data['number_of_pages'] - if page_count and isinstance(page_count, int) and page_count > 0: - new_tags.append(f"pages:{page_count}") - - if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict): - identifiers = book_data['identifiers'] - - if 'openlibrary' in identifiers: - ol_ids = identifiers['openlibrary'] - if isinstance(ol_ids, list) and ol_ids: - new_tags.append(f"openlibrary:{ol_ids[0]}") - elif isinstance(ol_ids, str): - new_tags.append(f"openlibrary:{ol_ids}") - - if 'lccn' in identifiers: - lccn_list = identifiers['lccn'] - if isinstance(lccn_list, list) and lccn_list: - new_tags.append(f"lccn:{lccn_list[0]}") - elif isinstance(lccn_list, str): - new_tags.append(f"lccn:{lccn_list}") - - if 'oclc' in identifiers: - oclc_list = identifiers['oclc'] - if isinstance(oclc_list, list) and oclc_list: - new_tags.append(f"oclc:{oclc_list[0]}") - elif isinstance(oclc_list, str): - new_tags.append(f"oclc:{oclc_list}") - - if 'goodreads' in identifiers: - goodreads_list = identifiers['goodreads'] - if isinstance(goodreads_list, list) and goodreads_list: - new_tags.append(f"goodreads:{goodreads_list[0]}") - elif isinstance(goodreads_list, str): - new_tags.append(f"goodreads:{goodreads_list}") - - if 'librarything' in identifiers: - lt_list = identifiers['librarything'] - if isinstance(lt_list, list) and lt_list: - new_tags.append(f"librarything:{lt_list[0]}") - elif isinstance(lt_list, str): - new_tags.append(f"librarything:{lt_list}") - - if 'doi' in identifiers: - doi_list = identifiers['doi'] - if isinstance(doi_list, list) and doi_list: - new_tags.append(f"doi:{doi_list[0]}") - elif isinstance(doi_list, str): - new_tags.append(f"doi:{doi_list}") - - if 'internet_archive' in identifiers: - ia_list = identifiers['internet_archive'] - if isinstance(ia_list, list) and ia_list: - new_tags.append(f"internet_archive:{ia_list[0]}") - elif isinstance(ia_list, str): - 
new_tags.append(f"internet_archive:{ia_list}") - - log(f"Found {len(new_tags)} tag(s) from ISBN lookup") - return new_tags + return list(_ol_scrape_isbn_metadata(isbn)) except Exception as e: log(f"ISBN scraping error: {e}", file=sys.stderr) return [] def _scrape_openlibrary_metadata(olid: str) -> List[str]: - """Scrape metadata for an OpenLibrary ID using the .json API endpoint. - - Fetches from https://openlibrary.org/books/{OLID}.json and extracts: - - Title, authors, publish date, publishers - - Description - - Subjects as freeform tags (without namespace prefix) - - Identifiers (ISBN, LCCN, OCLC, etc.) - """ - new_tags = [] + if _ol_scrape_openlibrary_metadata is None: + log("OpenLibrary scraper unavailable", file=sys.stderr) + return [] try: - from ..API.HTTP import HTTPClient - import json as json_module - - # Format: OL9674499M or just 9674499M - olid_clean = olid.replace('OL', '').replace('M', '') - if not olid_clean.isdigit(): - olid_clean = olid - - # Ensure we have the full OLID format for the URL - if not olid.startswith('OL'): - url = f"https://openlibrary.org/books/OL{olid_clean}M.json" - else: - url = f"https://openlibrary.org/books/{olid}.json" - - try: - with HTTPClient() as client: - response = client.get(url) - response.raise_for_status() - data = json_module.loads(response.content.decode('utf-8')) - except Exception as e: - log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr) - return [] - - if not data: - log(f"No OpenLibrary metadata found for: {olid}") - return [] - - # Add title - if 'title' in data: - new_tags.append(f"title:{data['title']}") - - # Add authors - if 'authors' in data and isinstance(data['authors'], list): - for author in data['authors'][:3]: - if isinstance(author, dict) and 'name' in author: - new_tags.append(f"author:{author['name']}") - elif isinstance(author, str): - new_tags.append(f"author:{author}") - - # Add publish date - if 'publish_date' in data: - new_tags.append(f"publish_date:{data['publish_date']}") - - # Add publishers - if 'publishers' in data and isinstance(data['publishers'], list): - for pub in data['publishers'][:1]: - if isinstance(pub, dict) and 'name' in pub: - new_tags.append(f"publisher:{pub['name']}") - elif isinstance(pub, str): - new_tags.append(f"publisher:{pub}") - - # Add description - if 'description' in data: - desc = data['description'] - if isinstance(desc, dict) and 'value' in desc: - desc = desc['value'] - if desc: - desc_str = str(desc).strip() - if len(desc_str) > 0: - new_tags.append(f"description:{desc_str[:200]}") - - # Add number of pages - if 'number_of_pages' in data: - page_count = data['number_of_pages'] - if page_count and isinstance(page_count, int) and page_count > 0: - new_tags.append(f"pages:{page_count}") - - # Add subjects as FREEFORM tags (no namespace prefix) - if 'subjects' in data and isinstance(data['subjects'], list): - for subject in data['subjects'][:10]: - if subject and isinstance(subject, str): - subject_clean = str(subject).strip() - if subject_clean and subject_clean not in new_tags: - new_tags.append(subject_clean) - - # Add identifiers - if 'identifiers' in data and isinstance(data['identifiers'], dict): - identifiers = data['identifiers'] - - if 'isbn_10' in identifiers: - isbn_10_list = identifiers['isbn_10'] - if isinstance(isbn_10_list, list) and isbn_10_list: - new_tags.append(f"isbn_10:{isbn_10_list[0]}") - elif isinstance(isbn_10_list, str): - new_tags.append(f"isbn_10:{isbn_10_list}") - - if 'isbn_13' in identifiers: - isbn_13_list = identifiers['isbn_13'] - 
if isinstance(isbn_13_list, list) and isbn_13_list: - new_tags.append(f"isbn_13:{isbn_13_list[0]}") - elif isinstance(isbn_13_list, str): - new_tags.append(f"isbn_13:{isbn_13_list}") - - if 'lccn' in identifiers: - lccn_list = identifiers['lccn'] - if isinstance(lccn_list, list) and lccn_list: - new_tags.append(f"lccn:{lccn_list[0]}") - elif isinstance(lccn_list, str): - new_tags.append(f"lccn:{lccn_list}") - - if 'oclc_numbers' in identifiers: - oclc_list = identifiers['oclc_numbers'] - if isinstance(oclc_list, list) and oclc_list: - new_tags.append(f"oclc:{oclc_list[0]}") - elif isinstance(oclc_list, str): - new_tags.append(f"oclc:{oclc_list}") - - if 'goodreads' in identifiers: - goodreads_list = identifiers['goodreads'] - if isinstance(goodreads_list, list) and goodreads_list: - new_tags.append(f"goodreads:{goodreads_list[0]}") - elif isinstance(goodreads_list, str): - new_tags.append(f"goodreads:{goodreads_list}") - - log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup") - return new_tags + return list(_ol_scrape_openlibrary_metadata(olid)) except Exception as e: log(f"OpenLibrary scraping error: {e}", file=sys.stderr) return [] diff --git a/cmdlet/get_url.py b/cmdlet/get_url.py index 2c3f473..d54e12a 100644 --- a/cmdlet/get_url.py +++ b/cmdlet/get_url.py @@ -1,28 +1,40 @@ from __future__ import annotations -from typing import Any, Dict, Sequence +from dataclasses import dataclass +from typing import Any, Dict, List, Sequence import sys -from . import register import pipeline as ctx -from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash +from ._shared import Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash from SYS.logger import log from Store import Store +@dataclass +class UrlItem: + url: str + hash: str + store: str + + class Get_Url(Cmdlet): """Get url associated with files via hash+store.""" - - NAME = "get-url" - SUMMARY = "List url associated with a file" - USAGE = "@1 | get-url" - ARGS = [ - SharedArgs.HASH, - SharedArgs.STORE, - ] - DETAIL = [ - "- Lists all url associated with file identified by hash+store", - ] + + def __init__(self) -> None: + super().__init__( + name="get-url", + summary="List url associated with a file", + usage="@1 | get-url", + arg=[ + SharedArgs.HASH, + SharedArgs.STORE, + ], + detail=[ + "- Lists all url associated with file identified by hash+store", + ], + exec=self.run, + ) + self.register() def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Get url for file via hash+store backend.""" @@ -53,18 +65,34 @@ class Get_Url(Cmdlet): urls = backend.get_url(file_hash) - if urls: - for u in urls: - # Emit rich object for pipeline compatibility - ctx.emit({ - "url": u, - "hash": file_hash, - "store": store_name, - }) - return 0 - else: - ctx.emit("No url found") - return 0 + from result_table import ResultTable + + title = str(get_field(result, "title") or "").strip() + table_title = "Title" + if title: + table_title = f"Title: {title}" + + table = ResultTable(table_title, max_columns=1).set_preserve_order(True) + table.set_source_command("get-url", []) + + items: List[UrlItem] = [] + for u in list(urls or []): + u = str(u or "").strip() + if not u: + continue + row = table.add_row() + row.add_column("Url", u) + item = UrlItem(url=u, hash=file_hash, store=str(store_name)) + items.append(item) + ctx.emit(item) + + # Make this a real result table so @.. 
/ @,, can navigate it + ctx.set_last_result_table(table if items else None, items, subject=result) + + if not items: + log("No url found", file=sys.stderr) + + return 0 except KeyError: log(f"Error: Storage backend '{store_name}' not configured") @@ -74,7 +102,6 @@ class Get_Url(Cmdlet): return 1 -# Register cmdlet -register(["get-url", "get_url"])(Get_Url) +CMDLET = Get_Url() diff --git a/cmdlet/search_store.py b/cmdlet/search_store.py index 185b45c..d3081f5 100644 --- a/cmdlet/search_store.py +++ b/cmdlet/search_store.py @@ -3,7 +3,6 @@ from __future__ import annotations from typing import Any, Dict, Sequence, List, Optional, Tuple from pathlib import Path -from dataclasses import dataclass, field from collections import OrderedDict import re import json @@ -11,57 +10,9 @@ import sys from SYS.logger import log, debug -from ._shared import Cmdlet, CmdletArg, get_field, should_show_help +from ._shared import Cmdlet, CmdletArg, get_field, should_show_help, normalize_hash, first_title_tag import pipeline as ctx -# Optional dependencies -try: - import mutagen # type: ignore -except ImportError: # pragma: no cover - mutagen = None # type: ignore - -try: - from config import get_hydrus_url, resolve_output_dir -except Exception: # pragma: no cover - get_hydrus_url = None # type: ignore - resolve_output_dir = None # type: ignore - -try: - from API.HydrusNetwork import HydrusNetwork, HydrusRequestError -except ImportError: # pragma: no cover - HydrusNetwork = None # type: ignore - HydrusRequestError = RuntimeError # type: ignore - -try: - from SYS.utils import sha256_file -except ImportError: # pragma: no cover - sha256_file = None # type: ignore - -try: - from SYS.utils_constant import mime_maps -except ImportError: # pragma: no cover - mime_maps = {} # type: ignore - -@dataclass(slots=True) -class SearchRecord: - path: str - size_bytes: int | None = None - duration_seconds: str | None = None - tag: str | None = None - hash: str | None = None - - def as_dict(self) -> dict[str, str]: - payload: dict[str, str] = {"path": self.path} - if self.size_bytes is not None: - payload["size"] = str(self.size_bytes) - if self.duration_seconds: - payload["duration"] = self.duration_seconds - if self.tag: - payload["tag"] = self.tag - if self.hash: - payload["hash"] = self.hash - return payload - STORAGE_ORIGINS = {"local", "hydrus", "folder"} @@ -86,12 +37,15 @@ class Search_Store(Cmdlet): detail=[ "Search across storage backends: Folder stores and Hydrus instances", "Use -store to search a specific backend by name", + "URL search: url:* (any URL) or url: (URL substring)", "Filter results by: tag, size, type, duration", "Results include hash for downstream commands (get-file, add-tag, etc.)", "Examples:", "search-store foo # Search all storage backends", "search-store -store home '*' # Search 'home' Hydrus instance", "search-store -store test 'video' # Search 'test' folder store", + "search-store 'url:*' # Files that have any URL", + "search-store 'url:youtube.com' # Files whose URL contains substring", "search-store song -type audio # Search for audio files", "search-store movie -tag action # Search with tag filter", ], @@ -100,6 +54,40 @@ class Search_Store(Cmdlet): self.register() # --- Helper methods ------------------------------------------------- + @staticmethod + def _parse_hash_query(query: str) -> List[str]: + """Parse a `hash:` query into a list of normalized 64-hex SHA256 hashes. + + Supported examples: + - hash:

<sha256-hex>,<sha256-hex>,<sha256-hex>
+        - Hash:<sha256-hex>
+        - hash:{<sha256-hex>,<sha256-hex>
} + """ + q = str(query or "").strip() + if not q: + return [] + + m = re.match(r"^hash(?:es)?\s*:\s*(.+)$", q, flags=re.IGNORECASE) + if not m: + return [] + + rest = (m.group(1) or "").strip() + if rest.startswith("{") and rest.endswith("}"): + rest = rest[1:-1].strip() + if rest.startswith("[") and rest.endswith("]"): + rest = rest[1:-1].strip() + + # Split on commas and whitespace. + raw_parts = [p.strip() for p in re.split(r"[\s,]+", rest) if p.strip()] + out: List[str] = [] + for part in raw_parts: + h = normalize_hash(part) + if not h: + continue + if h not in out: + out.append(h) + return out + @staticmethod def _normalize_extension(ext_value: Any) -> str: """Sanitize extension strings to alphanumerics and cap at 5 chars.""" @@ -150,10 +138,10 @@ class Search_Store(Cmdlet): # Parse arguments query = "" - tag_filters: List[str] = [] - size_filter: Optional[Tuple[str, int]] = None - duration_filter: Optional[Tuple[str, float]] = None - type_filter: Optional[str] = None + _tag_filters: List[str] = [] + _size_filter: Optional[Tuple[str, int]] = None + _duration_filter: Optional[Tuple[str, float]] = None + _type_filter: Optional[str] = None storage_backend: Optional[str] = None limit = 100 searched_backends: List[str] = [] @@ -166,7 +154,7 @@ class Search_Store(Cmdlet): storage_backend = args_list[i + 1] i += 2 elif low in {"-tag", "--tag"} and i + 1 < len(args_list): - tag_filters.append(args_list[i + 1]) + _tag_filters.append(args_list[i + 1]) i += 2 elif low in {"-limit", "--limit"} and i + 1 < len(args_list): try: @@ -175,7 +163,7 @@ class Search_Store(Cmdlet): limit = 100 i += 2 elif low in {"-type", "--type"} and i + 1 < len(args_list): - type_filter = args_list[i + 1].lower() + _type_filter = args_list[i + 1].lower() i += 2 elif not arg.startswith("-"): query = f"{query} {arg}".strip() if query else arg @@ -195,6 +183,8 @@ class Search_Store(Cmdlet): if store_filter and not storage_backend: storage_backend = store_filter + hash_query = self._parse_hash_query(query) + if not query: log("Provide a search query", file=sys.stderr) return 1 @@ -230,12 +220,136 @@ class Search_Store(Cmdlet): table_title += f" [{storage_backend}]" table = ResultTable(table_title) + if hash_query: + try: + table.set_preserve_order(True) + except Exception: + pass from Store import Store storage = Store(config=config or {}) from Store._base import Store as BaseStore backend_to_search = storage_backend or None + if hash_query: + # Explicit hash list search: build rows from backend metadata. + backends_to_try: List[str] = [] + if backend_to_search: + backends_to_try = [backend_to_search] + else: + backends_to_try = list(storage.list_backends()) + + found_any = False + for h in hash_query: + resolved_backend_name: Optional[str] = None + resolved_backend = None + + for backend_name in backends_to_try: + try: + backend = storage[backend_name] + except Exception: + continue + try: + # If get_metadata works, consider it a hit; get_file can be optional (e.g. remote URL). 
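# --- Illustrative usage (not part of the diff) --------------------------------
# What the _parse_hash_query helper above is expected to accept and return
# (values inferred from its regex/normalize_hash logic, not captured output;
# <64-hex-*> stands in for a real SHA256 digest):
#
#   Search_Store._parse_hash_query("hash:{<64-hex-a>, <64-hex-b>}")
#   # -> ["<64-hex-a>", "<64-hex-b>"]   (normalized, deduplicated, order kept)
#   Search_Store._parse_hash_query("not a hash query")
#   # -> []                             (falls through to normal text search)
# ------------------------------------------------------------------------------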
+ meta = backend.get_metadata(h) + if meta is None: + continue + resolved_backend_name = backend_name + resolved_backend = backend + break + except Exception: + continue + + if resolved_backend_name is None or resolved_backend is None: + continue + + found_any = True + searched_backends.append(resolved_backend_name) + + # Resolve a path/URL string if possible + path_str: Optional[str] = None + try: + maybe_path = resolved_backend.get_file(h) + if isinstance(maybe_path, Path): + path_str = str(maybe_path) + elif isinstance(maybe_path, str) and maybe_path: + path_str = maybe_path + except Exception: + path_str = None + + meta_obj: Dict[str, Any] = {} + try: + meta_obj = resolved_backend.get_metadata(h) or {} + except Exception: + meta_obj = {} + + tags_list: List[str] = [] + try: + tag_result = resolved_backend.get_tag(h) + if isinstance(tag_result, tuple) and tag_result: + maybe_tags = tag_result[0] + else: + maybe_tags = tag_result + if isinstance(maybe_tags, list): + tags_list = [str(t).strip() for t in maybe_tags if isinstance(t, str) and str(t).strip()] + except Exception: + tags_list = [] + + title_from_tag: Optional[str] = None + try: + title_tag = first_title_tag(tags_list) + if title_tag and ":" in title_tag: + title_from_tag = title_tag.split(":", 1)[1].strip() + except Exception: + title_from_tag = None + + title = title_from_tag or meta_obj.get("title") or meta_obj.get("name") + if not title and path_str: + try: + title = Path(path_str).stem + except Exception: + title = path_str + + ext_val = meta_obj.get("ext") or meta_obj.get("extension") + if not ext_val and path_str: + try: + ext_val = Path(path_str).suffix + except Exception: + ext_val = None + + size_bytes = meta_obj.get("size") + if size_bytes is None: + size_bytes = meta_obj.get("size_bytes") + try: + size_bytes_int: Optional[int] = int(size_bytes) if size_bytes is not None else None + except Exception: + size_bytes_int = None + + payload: Dict[str, Any] = { + "title": str(title or h), + "hash": h, + "store": resolved_backend_name, + "path": path_str, + "ext": self._normalize_extension(ext_val), + "size_bytes": size_bytes_int, + "tag": tags_list, + } + + table.add_result(payload) + results_list.append(payload) + ctx.emit(payload) + + if found_any: + ctx.set_last_result_table(table, results_list) + db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2)) + db.update_worker_status(worker_id, 'completed') + return 0 + + log("No results found", file=sys.stderr) + db.append_worker_stdout(worker_id, json.dumps([], indent=2)) + db.update_worker_status(worker_id, 'completed') + return 0 + if backend_to_search: searched_backends.append(backend_to_search) target_backend = storage[backend_to_search] @@ -243,7 +357,9 @@ class Search_Store(Cmdlet): log(f"Backend '{backend_to_search}' does not support searching", file=sys.stderr) db.update_worker_status(worker_id, 'error') return 1 + debug(f"[search-store] Searching '{backend_to_search}'") results = target_backend.search(query, limit=limit) + debug(f"[search-store] '{backend_to_search}' -> {len(results or [])} result(s)") else: from API.HydrusNetwork import is_hydrus_available hydrus_available = is_hydrus_available(config or {}) @@ -257,7 +373,9 @@ class Search_Store(Cmdlet): continue searched_backends.append(backend_name) + debug(f"[search-store] Searching '{backend_name}'") backend_results = backend.search(query, limit=limit - len(all_results)) + debug(f"[search-store] '{backend_name}' -> {len(backend_results or [])} result(s)") if backend_results: 
all_results.extend(backend_results) if len(all_results) >= limit: @@ -317,11 +435,6 @@ class Search_Store(Cmdlet): results_list.append(normalized) ctx.emit(normalized) - # Debug: Verify table rows match items list - debug(f"[search-store] Added {len(table.rows)} rows to table, {len(results_list)} items to results_list") - if len(table.rows) != len(results_list): - debug(f"[search-store] WARNING: Table/items mismatch! rows={len(table.rows)} items={len(results_list)}", file=sys.stderr) - ctx.set_last_result_table(table, results_list) db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2)) else: diff --git a/metadata.py b/metadata.py index b8fa907..847c45a 100644 --- a/metadata.py +++ b/metadata.py @@ -3,14 +3,12 @@ import re import subprocess import sys import shutil -import sqlite3 -import requests from SYS.logger import log, debug from urllib.parse import urlsplit, urlunsplit, unquote from collections import deque from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple -from models import PipeObject, FileRelationshipTracker, _get_file_hash +from models import FileRelationshipTracker try: import musicbrainzngs # type: ignore except ImportError: # pragma: no cover @@ -332,6 +330,112 @@ def _generate_hydrus_url_variants(url: str) -> List[str]: return variants +def normalize_urls(value: Any) -> List[str]: + """Normalize a URL field into a stable, deduplicated list. + + Accepts: + - None + - a single URL string (optionally containing multiple URLs) + - a list/tuple/set of URL strings + + This helper is used by cmdlets/stores/pipeline to keep `url` consistent. + """ + + def _iter_raw_urls(raw: Any) -> Iterable[str]: + if raw is None: + return + + if isinstance(raw, str): + text = raw.strip() + if not text: + return + # Support legacy prefixes like "url:https://...". + if text.lower().startswith("url:"): + text = text.split(":", 1)[1].strip() + + # Prefer extracting obvious URLs to avoid splitting inside query strings. + matches = re.findall(r"https?://[^\s,]+", text, flags=re.IGNORECASE) + if matches: + for m in matches: + yield m + return + + # Fallback: split on commas/whitespace. + for token in text.replace("\n", " ").replace("\r", " ").replace(",", " ").split(): + if token: + yield token + return + + if isinstance(raw, (list, tuple, set)): + for item in raw: + if item is None: + continue + if isinstance(item, str): + if item.strip(): + yield item + else: + text = str(item).strip() + if text: + yield text + return + + # Last resort: string-coerce. + text = str(raw).strip() + if text: + yield text + + def _canonicalize(url_text: str) -> Optional[str]: + u = str(url_text or "").strip() + if not u: + return None + + # Trim common wrappers and trailing punctuation. + u = u.strip("<>\"' ") + u = u.rstrip(")].,;\"") + if not u: + return None + + lower = u.lower() + if not (lower.startswith("http://") or lower.startswith("https://")): + return u + + try: + parsed = urlsplit(u) + except Exception: + return u + + scheme = (parsed.scheme or "").lower() + netloc = (parsed.netloc or "").lower() + path = unquote(parsed.path or "") + query = parsed.query or "" + + # Normalize default ports. + if scheme == "http" and netloc.endswith(":80"): + netloc = netloc[:-3] + elif scheme == "https" and netloc.endswith(":443"): + netloc = netloc[:-4] + + # Prefer no trailing slash except root. + if path and path != "/": + path = path.rstrip("/") + + # Fragments are not part of the resource. 
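# --- Illustrative usage (not part of the diff) --------------------------------
# The return just below assembles the canonical form. End to end, and assuming
# this module is importable as `metadata`, normalize_urls is expected to behave
# like this (expected value inferred from the logic in this hunk, not verified
# output):
#
#   from metadata import normalize_urls
#   normalize_urls("url:HTTPS://Example.com:443/watch/?v=1#t=30,"
#                  " https://example.com/watch?v=1")
#   # -> ["https://example.com/watch?v=1"]   (url: prefix stripped, default
#   #     port and fragment dropped, trailing slash removed, duplicates
#   #     collapsed)
# ------------------------------------------------------------------------------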
+ return urlunsplit((scheme, netloc, path, query, "")) + + seen: Set[str] = set() + out: List[str] = [] + for raw_url in _iter_raw_urls(value): + canonical = _canonicalize(raw_url) + if not canonical: + continue + if canonical in seen: + continue + seen.add(canonical) + out.append(canonical) + + return out + + def value_normalize(value: str) -> str: """Normalize whitespace: collapse internal spaces, strip, remove newlines.""" value = value.replace("\n", " ").replace("\r", " ") @@ -358,6 +462,7 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None: continue # Ensure file entry exists + file_id: Optional[int] = None try: cursor = db.connection.cursor() if db.connection else None if cursor: @@ -394,10 +499,16 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None: try: cursor = db.connection.cursor() if db.connection else None if cursor: + file_hash_value: Optional[str] = None + if hasattr(db, 'get_file_hash'): + try: + file_hash_value = db.get_file_hash(file_id) + except Exception: + file_hash_value = None for tag in tags: cursor.execute( 'INSERT OR IGNORE INTO tags (hash, tag) VALUES (?, ?)', - (file_hash_value, tag) if hasattr(db, 'get_file_hash') else (None, tag) + (file_hash_value, tag) ) db.connection.commit() except Exception: @@ -663,128 +774,6 @@ def fetch_musicbrainz_tags(mbid: str, entity: str) -> Dict[str, object]: return {"source": "musicbrainz", "id": mbid, "tag": tags, "entity": entity} -def fetch_openlibrary_tags(ol_id: str) -> Dict[str, object]: - """Fetch metadata tags from OpenLibrary. - - Args: - ol_id: OpenLibrary ID (e.g., 'OL123456M' for a book) - - Returns: - Dictionary with 'tag' key containing list of extracted tags - """ - import urllib.request - - # Normalize OL ID - ol_id = ol_id.strip().upper() - if not ol_id.startswith('OL'): - ol_id = f'OL{ol_id}' - - # Fetch from OpenLibrary API - url = f"https://openlibrary.org/books/{ol_id}.json" - tags: List[str] = [] - - try: - with urllib.request.urlopen(url, timeout=10) as response: - data = json.loads(response.read().decode('utf-8')) - except Exception as e: - raise ValueError(f"Failed to fetch OpenLibrary data for {ol_id}: {e}") - - # Add OpenLibrary ID tag - _add_tag(tags, "openlibrary", ol_id) - - # Extract title - _add_tag(tags, "title", data.get("title")) - - # Extract subtitle if present - if data.get("subtitle"): - _add_tag(tags, "subtitle", data["subtitle"]) - - # Extract authors - authors = data.get("authors", []) - author_names: List[str] = [] - for author in authors: - if isinstance(author, dict): - name = author.get("name") - else: - name = str(author) - if name: - author_names.append(name) - if author_names: - _extend_tags(tags, "author", author_names) - - # Extract publication details - if data.get("publish_date"): - _add_tag(tags, "publish_date", data["publish_date"]) - # Extract year if present - year_match = re.search(r'\b(\d{4})\b', str(data.get("publish_date", ""))) - if year_match: - _add_tag(tags, "year", year_match.group(1)) - - # Extract publishers - publishers = data.get("publishers", []) - if publishers: - publisher_names = [] - for pub in publishers: - if isinstance(pub, dict): - name = pub.get("name") - else: - name = str(pub) - if name: - publisher_names.append(name) - if publisher_names: - _extend_tags(tags, "publisher", publisher_names) - - # Extract languages - languages = data.get("languages", []) - if languages: - lang_codes = [] - for lang in languages: - if isinstance(lang, dict): - code = lang.get("key", "").split("/")[-1] - else: - code = str(lang).split("/")[-1] - 
if code and code != "": - lang_codes.append(code) - if lang_codes: - _extend_tags(tags, "language", lang_codes) - - # Extract ISBN - isbns = data.get("isbn_10", []) + data.get("isbn_13", []) - if isbns: - for isbn in isbns[:1]: # Just take first one - if len(str(isbn)) == 10: - _add_tag(tags, "isbn_10", isbn) - elif len(str(isbn)) == 13: - _add_tag(tags, "isbn_13", isbn) - - # Extract page count - _add_tag(tags, "pages", data.get("number_of_pages")) - - # Extract genres/subjects (OpenLibrary calls them subjects) - # Subjects are added as plain freeform tags (no namespace prefix) - subjects = data.get("subjects", []) - if subjects: - for subject in subjects[:10]: # Limit to 10 subjects - if isinstance(subject, dict): - name = subject.get("name") - else: - name = str(subject) - if name: - # Add subject as plain tag without "subject:" prefix - normalized = value_normalize(str(name)) - if normalized: - tags.append(normalized) - - # Extract OpenLibrary description - description = data.get("description") - if description: - if isinstance(description, dict): - description = description.get("value") - _add_tag(tags, "summary", description) - - return {"source": "openlibrary", "id": ol_id, "tag": tags} - - def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> None: """Append a single value if not already in seen set (deduplication).""" if value is None: @@ -1545,7 +1534,7 @@ def _derive_sidecar_path(media_path: Path) -> Path: return preferred -def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]: +def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]: # pyright: ignore[reportUnusedFunction] """Read hash, tags, and url from sidecar file. Consolidated with read_tags_from_file - this extracts extra metadata (hash, url). 
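# --- Illustrative aside (not part of the diff) --------------------------------
# Sketch of the sidecar layout _read_sidecar_metadata handles, inferred from the
# parsing in the next hunk: one "hash:" line, "url:" lines with comma/space
# separated URLs, and every remaining non-empty line kept as a tag. For a
# sidecar containing
#
#   hash:<64-hex>
#   url:https://example.com/a, https://example.com/b
#   title:Example File
#
# the function is expected to return roughly
#   ("<64-hex>", ["title:Example File"],
#    ["https://example.com/a", "https://example.com/b"])
# ------------------------------------------------------------------------------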
@@ -1559,7 +1548,7 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str] hash_value: Optional[str] = None tags: List[str] = [] - url: List[str] = [] + urls: List[str] = [] for raw_line in raw.splitlines(): line = raw_line.strip() @@ -1574,15 +1563,15 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str] url_part = line.split(':', 1)[1].strip() if ':' in line else '' if url_part: for url_segment in url_part.split(','): - for url in url_segment.split(): - url_clean = url.strip() - if url_clean and url_clean not in url: - url.append(url_clean) + for url_token in url_segment.split(): + url_clean = url_token.strip() + if url_clean and url_clean not in urls: + urls.append(url_clean) else: # Everything else is a tag (including relationship: lines) tags.append(line) - return hash_value, tags, url + return hash_value, tags, urls @@ -1827,63 +1816,6 @@ def apply_title_to_path(media_path: Path, tags: Iterable[str]) -> Path: return destination -def _collect_search_roots(payload: Dict[str, Any]) -> List[Path]: - roots: List[Path] = [] - for key in ('paths', 'search_paths', 'roots', 'directories'): - raw = payload.get(key) - if not raw: - continue - entries = raw if isinstance(raw, (list, tuple, set)) else [raw] - for entry in entries: - if not entry: - continue - try: - candidate = Path(str(entry)).expanduser() - except Exception: - continue - roots.append(candidate) - if load_config is not None and resolve_output_dir is not None: - try: - config = load_config() - except Exception: - config = None - if isinstance(config, dict) and config: - try: - default_root = resolve_output_dir(config) - except Exception: - default_root = None - if default_root is not None: - roots.append(default_root) - return roots - - -def _locate_sidecar_by_hash(hash_value: str, roots: Iterable[Path]) -> Optional[Path]: - target = f'hash:{hash_value.strip().lower()}' - for root in roots: - try: - root_path = root.expanduser() - except Exception: - continue - if not root_path.exists() or not root_path.is_dir(): - continue - for pattern in ('*.tag',): - try: - iterator = root_path.rglob(pattern) - except OSError: - continue - for candidate in iterator: - if not candidate.is_file(): - continue - try: - with candidate.open('r', encoding='utf-8', errors='ignore') as handle: - for line in handle: - if line.strip().lower() == target: - return candidate - except OSError: - continue - return None - - def sync_sidecar(payload: Dict[str, Any]) -> Dict[str, Any]: path_value = payload.get('path') if not path_value: @@ -2506,8 +2438,8 @@ def write_tags_to_file( # Add known url if provided - each on separate line to prevent corruption if url: - for url in url: - content_lines.append(f"url:{url}") + for url_item in url: + content_lines.append(f"url:{url_item}") # Add tags if tags: @@ -2642,10 +2574,10 @@ def detect_metadata_request(tag: str) -> Optional[Dict[str, str]]: def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]: tag = payload.get('tag') if not isinstance(tag, str): - return {'tags': []} + return {'tag': []} trimmed = value_normalize(tag) if not trimmed: - return {'tags': []} + return {'tag': []} request = detect_metadata_request(trimmed) tags: List[str] = [] seen: Set[str] = set() @@ -2653,7 +2585,7 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]: _append_unique(tags, seen, request['base']) else: _append_unique(tags, seen, trimmed) - return {'tags': tags} + return {'tag': tags} try: if request['source'] == 'imdb': data = 
imdb_tag(request['id']) @@ -2662,8 +2594,15 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]: except Exception as exc: # pragma: no cover - network/service errors return {'tag': tags, 'error': str(exc)} # Add tags from fetched data (no namespace, just unique append) - for tag in (data.get('tag') or []): - _append_unique(tags, seen, tag) + raw_tags = data.get('tag') if isinstance(data, dict) else None + if isinstance(raw_tags, str): + tag_iter: Iterable[str] = [raw_tags] + elif isinstance(raw_tags, (list, tuple, set)): + tag_iter = [t for t in raw_tags if isinstance(t, str)] + else: + tag_iter = [] + for tag_value in tag_iter: + _append_unique(tags, seen, tag_value) result = { 'tag': tags, 'source': request['source'], @@ -3082,14 +3021,14 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]: # Load adjective.json from workspace root adjective_path = Path(__file__).parent / "adjective.json" if not adjective_path.exists(): - log.debug(f"adjective.json not found at {adjective_path}") + debug(f"adjective.json not found at {adjective_path}") return tags_set try: with open(adjective_path, 'r') as f: adjective_lists = json.load(f) except Exception as e: - log.error(f"Error loading adjective.json: {e}") + debug(f"Error loading adjective.json: {e}") return tags_set expanded_tags = set() @@ -3108,10 +3047,10 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]: if matched_list: # Add all tags from the list expanded_tags.update(matched_list) - log.info(f"Expanded {tag} to {len(matched_list)} tags") + debug(f"Expanded {tag} to {len(matched_list)} tags") else: # List not found, log warning but don't add the reference - log.warning(f"Tag list '{list_name}' not found in adjective.json") + debug(f"Tag list '{list_name}' not found in adjective.json") else: # Regular tag, keep as is expanded_tags.add(tag) @@ -3194,98 +3133,6 @@ def build_book_tags( return deduped -def fetch_openlibrary_metadata_tags(isbn: Optional[str] = None, olid: Optional[str] = None) -> List[str]: - """Fetch book metadata from OpenLibrary and return as tags. 
- - Args: - isbn: ISBN number (with or without isbn: prefix) - olid: OpenLibrary ID - - Returns: - List of tags extracted from OpenLibrary metadata - """ - metadata_tags = [] - - # Try OLID first (preferred), then ISBN - url = None - - if olid: - # Clean up OLID format - olid_clean = str(olid).replace('OL', '').replace('M', '').replace('W', '') - if olid_clean.isdigit(): - url = f"https://openlibrary.org/books/OL{olid_clean}M.json" - else: - url = f"https://openlibrary.org/books/{olid}.json" - elif isbn: - # Clean up ISBN - isbn_clean = str(isbn).replace('isbn:', '').strip() - url = f"https://openlibrary.org/isbn/{isbn_clean}.json" - - if not url: - return metadata_tags - - try: - response = requests.get(url, timeout=10) - if response.status_code != 200: - return metadata_tags - - data = response.json() - if not data: - return metadata_tags - - # Extract title - if 'title' in data: - metadata_tags.append(f"title:{data['title']}") - - # Extract authors - if 'authors' in data and isinstance(data['authors'], list): - for author in data['authors'][:3]: - if isinstance(author, dict) and 'name' in author: - metadata_tags.append(f"author:{author['name']}") - elif isinstance(author, str): - metadata_tags.append(f"author:{author}") - - # Extract publish date - if 'publish_date' in data: - metadata_tags.append(f"publish_date:{data['publish_date']}") - - # Extract publishers - if 'publishers' in data and isinstance(data['publishers'], list): - for pub in data['publishers'][:1]: - if isinstance(pub, dict) and 'name' in pub: - metadata_tags.append(f"publisher:{pub['name']}") - elif isinstance(pub, str): - metadata_tags.append(f"publisher:{pub}") - - # Extract number of pages - if 'number_of_pages' in data: - page_count = data['number_of_pages'] - if page_count and isinstance(page_count, int) and page_count > 0: - metadata_tags.append(f"pages:{page_count}") - - # Extract language - if 'languages' in data and isinstance(data['languages'], list) and data['languages']: - lang = data['languages'][0] - if isinstance(lang, dict) and 'key' in lang: - lang_code = lang['key'].split('/')[-1] - metadata_tags.append(f"language:{lang_code}") - elif isinstance(lang, str): - metadata_tags.append(f"language:{lang}") - - # Extract subjects as freeform tags (limit to 5) - if 'subjects' in data and isinstance(data['subjects'], list): - for subject in data['subjects'][:5]: - if subject and isinstance(subject, str): - subject_clean = str(subject).strip() - if subject_clean: - metadata_tags.append(subject_clean) - - except Exception as e: - debug(f"⚠ Failed to fetch OpenLibrary metadata: {e}") - - return metadata_tags - - def enrich_playlist_entries(entries: list, extractor: str) -> list: """Enrich playlist entries with full metadata by fetching individual entry info. 
@@ -3312,7 +3159,7 @@ def enrich_playlist_entries(entries: list, extractor: str) -> list: if entry_url and is_url_supported_by_ytdlp(entry_url): try: import yt_dlp - ydl_opts = { + ydl_opts: Any = { "quiet": True, "no_warnings": True, "skip_download": True, @@ -3690,294 +3537,3 @@ def extract_url_formats(formats: list) -> List[Tuple[str, str]]: return [] -def scrape_isbn_metadata(isbn: str) -> List[str]: - """Scrape metadata for an ISBN using Open Library API.""" - new_tags = [] - try: - from API.HTTP import HTTPClient - import json as json_module - - isbn_clean = isbn.replace('-', '').strip() - url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json" - - try: - with HTTPClient() as client: - response = client.get(url) - response.raise_for_status() - data = json_module.loads(response.content.decode('utf-8')) - except Exception as e: - log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr) - return [] - - if not data: - log(f"No ISBN metadata found for: {isbn}") - return [] - - book_data = next(iter(data.values()), None) - if not book_data: - return [] - - if 'title' in book_data: - new_tags.append(f"title:{book_data['title']}") - - if 'authors' in book_data and isinstance(book_data['authors'], list): - for author in book_data['authors'][:3]: - if 'name' in author: - new_tags.append(f"author:{author['name']}") - - if 'publish_date' in book_data: - new_tags.append(f"publish_date:{book_data['publish_date']}") - - if 'publishers' in book_data and isinstance(book_data['publishers'], list): - for pub in book_data['publishers'][:1]: - if 'name' in pub: - new_tags.append(f"publisher:{pub['name']}") - - if 'description' in book_data: - desc = book_data['description'] - if isinstance(desc, dict) and 'value' in desc: - desc = desc['value'] - if desc: - desc_str = str(desc).strip() - # Include description if available (limit to 200 chars to keep it manageable) - if len(desc_str) > 0: - new_tags.append(f"description:{desc_str[:200]}") - - if 'number_of_pages' in book_data: - page_count = book_data['number_of_pages'] - if page_count and isinstance(page_count, int) and page_count > 0: - new_tags.append(f"pages:{page_count}") - - if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict): - identifiers = book_data['identifiers'] - - if 'openlibrary' in identifiers: - ol_ids = identifiers['openlibrary'] - if isinstance(ol_ids, list) and ol_ids: - new_tags.append(f"openlibrary:{ol_ids[0]}") - elif isinstance(ol_ids, str): - new_tags.append(f"openlibrary:{ol_ids}") - - if 'lccn' in identifiers: - lccn_list = identifiers['lccn'] - if isinstance(lccn_list, list) and lccn_list: - new_tags.append(f"lccn:{lccn_list[0]}") - elif isinstance(lccn_list, str): - new_tags.append(f"lccn:{lccn_list}") - - if 'oclc' in identifiers: - oclc_list = identifiers['oclc'] - if isinstance(oclc_list, list) and oclc_list: - new_tags.append(f"oclc:{oclc_list[0]}") - elif isinstance(oclc_list, str): - new_tags.append(f"oclc:{oclc_list}") - - if 'goodreads' in identifiers: - goodreads_list = identifiers['goodreads'] - if isinstance(goodreads_list, list) and goodreads_list: - new_tags.append(f"goodreads:{goodreads_list[0]}") - elif isinstance(goodreads_list, str): - new_tags.append(f"goodreads:{goodreads_list}") - - if 'librarything' in identifiers: - lt_list = identifiers['librarything'] - if isinstance(lt_list, list) and lt_list: - new_tags.append(f"librarything:{lt_list[0]}") - elif isinstance(lt_list, str): - new_tags.append(f"librarything:{lt_list}") - - if 'doi' in 
identifiers: - doi_list = identifiers['doi'] - if isinstance(doi_list, list) and doi_list: - new_tags.append(f"doi:{doi_list[0]}") - elif isinstance(doi_list, str): - new_tags.append(f"doi:{doi_list}") - - if 'internet_archive' in identifiers: - ia_list = identifiers['internet_archive'] - if isinstance(ia_list, list) and ia_list: - new_tags.append(f"internet_archive:{ia_list[0]}") - elif isinstance(ia_list, str): - new_tags.append(f"internet_archive:{ia_list}") - - log(f"Found {len(new_tags)} tag(s) from ISBN lookup") - return new_tags - except Exception as e: - log(f"ISBN scraping error: {e}", file=sys.stderr) - return [] - - -def scrape_openlibrary_metadata(olid: str) -> List[str]: - """Scrape metadata for an OpenLibrary ID using the .json API endpoint. - - Fetches from https://openlibrary.org/books/{OLID}.json and extracts: - - Title, authors, publish date, publishers - - Description - - Subjects as freeform tags (without namespace prefix) - - Identifiers (ISBN, LCCN, OCLC, etc.) - """ - new_tags = [] - try: - from API.HTTP import HTTPClient - import json as json_module - - # Format: OL9674499M or just 9674499M - olid_clean = olid.replace('OL', '').replace('M', '') - if not olid_clean.isdigit(): - olid_clean = olid - - # Ensure we have the full OLID format for the URL - if not olid.startswith('OL'): - url = f"https://openlibrary.org/books/OL{olid_clean}M.json" - else: - url = f"https://openlibrary.org/books/{olid}.json" - - try: - with HTTPClient() as client: - response = client.get(url) - response.raise_for_status() - data = json_module.loads(response.content.decode('utf-8')) - except Exception as e: - log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr) - return [] - - if not data: - log(f"No OpenLibrary metadata found for: {olid}") - return [] - - # Add title - if 'title' in data: - new_tags.append(f"title:{data['title']}") - - # Add authors - if 'authors' in data and isinstance(data['authors'], list): - for author in data['authors'][:3]: - if isinstance(author, dict) and 'name' in author: - new_tags.append(f"author:{author['name']}") - elif isinstance(author, str): - new_tags.append(f"author:{author}") - - # Add publish date - if 'publish_date' in data: - new_tags.append(f"publish_date:{data['publish_date']}") - - # Add publishers - if 'publishers' in data and isinstance(data['publishers'], list): - for pub in data['publishers'][:1]: - if isinstance(pub, dict) and 'name' in pub: - new_tags.append(f"publisher:{pub['name']}") - elif isinstance(pub, str): - new_tags.append(f"publisher:{pub}") - - # Add description - if 'description' in data: - desc = data['description'] - if isinstance(desc, dict) and 'value' in desc: - desc = desc['value'] - if desc: - desc_str = str(desc).strip() - if len(desc_str) > 0: - new_tags.append(f"description:{desc_str[:200]}") - - # Add number of pages - if 'number_of_pages' in data: - page_count = data['number_of_pages'] - if page_count and isinstance(page_count, int) and page_count > 0: - new_tags.append(f"pages:{page_count}") - - # Add subjects as FREEFORM tags (no namespace prefix) - if 'subjects' in data and isinstance(data['subjects'], list): - for subject in data['subjects'][:10]: - if subject and isinstance(subject, str): - subject_clean = str(subject).strip() - if subject_clean and subject_clean not in new_tags: - new_tags.append(subject_clean) - - # Add identifiers - if 'identifiers' in data and isinstance(data['identifiers'], dict): - identifiers = data['identifiers'] - - if 'isbn_10' in identifiers: - isbn_10_list = 
identifiers['isbn_10'] - if isinstance(isbn_10_list, list) and isbn_10_list: - new_tags.append(f"isbn_10:{isbn_10_list[0]}") - elif isinstance(isbn_10_list, str): - new_tags.append(f"isbn_10:{isbn_10_list}") - - if 'isbn_13' in identifiers: - isbn_13_list = identifiers['isbn_13'] - if isinstance(isbn_13_list, list) and isbn_13_list: - new_tags.append(f"isbn_13:{isbn_13_list[0]}") - elif isinstance(isbn_13_list, str): - new_tags.append(f"isbn_13:{isbn_13_list}") - - if 'lccn' in identifiers: - lccn_list = identifiers['lccn'] - if isinstance(lccn_list, list) and lccn_list: - new_tags.append(f"lccn:{lccn_list[0]}") - elif isinstance(lccn_list, str): - new_tags.append(f"lccn:{lccn_list}") - - if 'oclc_numbers' in identifiers: - oclc_list = identifiers['oclc_numbers'] - if isinstance(oclc_list, list) and oclc_list: - new_tags.append(f"oclc:{oclc_list[0]}") - elif isinstance(oclc_list, str): - new_tags.append(f"oclc:{oclc_list}") - - if 'goodreads' in identifiers: - goodreads_list = identifiers['goodreads'] - if isinstance(goodreads_list, list) and goodreads_list: - new_tags.append(f"goodreads:{goodreads_list[0]}") - elif isinstance(goodreads_list, str): - new_tags.append(f"goodreads:{goodreads_list}") - - log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup") - return new_tags - except Exception as e: - log(f"OpenLibrary scraping error: {e}", file=sys.stderr) - return [] - - -def perform_metadata_scraping(tags_list: List[str]) -> List[str]: - """Perform scraping based on identifiers in tags. - - Priority order: - 1. openlibrary: (preferred - more complete metadata) - 2. isbn_10 or isbn (fallback) - """ - identifiers = extract_scrapable_identifiers(tags_list) - - if not identifiers: - log("No scrapable identifiers found (openlibrary, ISBN, musicbrainz, imdb)") - return [] - - log(f"Found scrapable identifiers: {', '.join(identifiers.keys())}") - - new_tags = [] - - # Prefer OpenLibrary over ISBN (more complete metadata) - if 'openlibrary' in identifiers: - olid = identifiers['openlibrary'] - if olid: - log(f"Scraping OpenLibrary: {olid}") - new_tags.extend(scrape_openlibrary_metadata(olid)) - elif 'isbn_13' in identifiers or 'isbn_10' in identifiers or 'isbn' in identifiers: - isbn = identifiers.get('isbn_13') or identifiers.get('isbn_10') or identifiers.get('isbn') - if isbn: - log(f"Scraping ISBN: {isbn}") - new_tags.extend(scrape_isbn_metadata(isbn)) - - existing_tags_lower = {tag.lower() for tag in tags_list} - scraped_unique = [] - seen = set() - for tag in new_tags: - tag_lower = tag.lower() - if tag_lower not in existing_tags_lower and tag_lower not in seen: - scraped_unique.append(tag) - seen.add(tag_lower) - - if scraped_unique: - log(f"Added {len(scraped_unique)} new tag(s) from scraping") - - return scraped_unique diff --git a/models.py b/models.py index c4c75c7..81a7b8c 100644 --- a/models.py +++ b/models.py @@ -150,6 +150,35 @@ class PipeObject: # Truncate key if needed key_display = key if len(key) <= 15 else key[:12] + "..." debug(f"│ {key_display:<15}: {val_display:<42}│") + + # If we have structured provider metadata, expand it for debugging. 
+ full_md = self.extra.get("full_metadata") + if isinstance(full_md, dict) and full_md: + debug("├─────────────────────────────────────────────────────────────┤") + debug("│ full_metadata: │") + for md_key in sorted(full_md.keys(), key=lambda x: str(x)): + md_val = full_md.get(md_key) + if isinstance(md_val, (str, int, float)) or md_val is None or isinstance(md_val, bool): + md_display = str(md_val) + elif isinstance(md_val, list): + if len(md_val) <= 6 and all(isinstance(x, (str, int, float, bool)) or x is None for x in md_val): + md_display = "[" + ", ".join(str(x) for x in md_val) + "]" + else: + md_display = f"list({len(md_val)})" + elif isinstance(md_val, dict): + # Avoid dumping huge nested dicts (like raw provider docs). + keys = list(md_val.keys()) + preview = ",".join(str(k) for k in keys[:6]) + md_display = f"dict({len(keys)})[{preview}{',...' if len(keys) > 6 else ''}]" + else: + md_str = str(md_val) + md_display = md_str if len(md_str) <= 40 else md_str[:37] + "..." + + md_key_display = str(md_key) + md_key_display = md_key_display if len(md_key_display) <= 15 else md_key_display[:12] + "..." + if len(md_display) > 42: + md_display = md_display[:39] + "..." + debug(f"│ {md_key_display:<15}: {md_display:<42}│") if self.action: debug("├─────────────────────────────────────────────────────────────┤") diff --git a/pipeline.py b/pipeline.py index 425c909..39395dc 100644 --- a/pipeline.py +++ b/pipeline.py @@ -575,7 +575,12 @@ def restore_previous_result_table() -> bool: _DISPLAY_ITEMS = [] _DISPLAY_TABLE = None _DISPLAY_SUBJECT = None - return True + # If an underlying table exists, we're done. + # Otherwise, fall through to history restore so @.. actually returns to the last table. + if _LAST_RESULT_TABLE is not None: + return True + if not _RESULT_TABLE_HISTORY: + return True if not _RESULT_TABLE_HISTORY: return False @@ -613,7 +618,12 @@ def restore_next_result_table() -> bool: _DISPLAY_ITEMS = [] _DISPLAY_TABLE = None _DISPLAY_SUBJECT = None - return True + # If an underlying table exists, we're done. + # Otherwise, fall through to forward restore when available. 
+ if _LAST_RESULT_TABLE is not None: + return True + if not _RESULT_TABLE_FORWARD: + return True if not _RESULT_TABLE_FORWARD: return False diff --git a/test-login.py b/test-login.py new file mode 100644 index 0000000..cb38c13 --- /dev/null +++ b/test-login.py @@ -0,0 +1,336 @@ +import requests +import random, string +from concurrent import futures +from tqdm import tqdm +import time +from datetime import datetime +import argparse +import os +import sys +import shutil +import json +import re +import base64 +import hashlib +from Crypto.Cipher import AES +from Crypto.Util import Counter + +def display_error(response, message): + print(message) + print(response) + print(response.text) + exit() + +def get_book_infos(session, url): + r = session.get(url).text + infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&") + response = session.get(infos_url) + data = response.json()['data'] + title = data['brOptions']['bookTitle'].strip().replace(" ", "_") + title = ''.join( c for c in title if c not in '<>:"/\\|?*' ) # Filter forbidden chars in directory names (Windows & Linux) + title = title[:150] # Trim the title to avoid long file names + metadata = data['metadata'] + links = [] + for item in data['brOptions']['data']: + for page in item: + links.append(page['uri']) + + if len(links) > 1: + print(f"[+] Found {len(links)} pages") + return title, links, metadata + else: + print(f"[-] Error while getting image links") + exit() + +def login(email, password): + session = requests.Session() + response = session.get("https://archive.org/services/account/login/") + login_data = response.json() + if not login_data['success']: + display_error(response, "[-] Error while getting login token:") + + login_token = login_data["value"]["token"] + + headers = {"Content-Type": "application/x-www-form-urlencoded"} + data = {"username":email, "password":password, "t": login_token} + + response = session.post("https://archive.org/services/account/login/", headers=headers, data=json.dumps(data)) + try: + response_json = response.json() + except: + display_error(response, "[-] Error while login:") + + if response_json["success"] == False: + if response_json["value"] == "bad_login": + print("[-] Invalid credentials!") + exit() + display_error(response, "[-] Error while login:") + else: + print("[+] Successful login") + return session + +def loan(session, book_id, verbose=True): + data = { + "action": "grant_access", + "identifier": book_id + } + response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data) + data['action'] = "browse_book" + response = session.post("https://archive.org/services/loans/loan/", data=data) + + if response.status_code == 400 : + try: + if response.json()["error"] == "This book is not available to borrow at this time. 
Please try again later.":
+                print("This book doesn't need to be borrowed")
+                return session
+            else:
+                display_error(response, "Something went wrong when trying to borrow the book.")
+        except Exception:  # The response is not in JSON format
+            display_error(response, "The book cannot be borrowed")
+
+    data['action'] = "create_token"
+    response = session.post("https://archive.org/services/loans/loan/", data=data)
+
+    if "token" in response.text:
+        if verbose:
+            print("[+] Successful loan")
+        return session
+    else:
+        display_error(response, "Something went wrong when trying to borrow the book; maybe you can't borrow this book.")
+
+def return_loan(session, book_id):
+    data = {
+        "action": "return_loan",
+        "identifier": book_id
+    }
+    response = session.post("https://archive.org/services/loans/loan/", data=data)
+    if response.status_code == 200 and response.json()["success"]:
+        print("[+] Book returned")
+    else:
+        display_error(response, "Something went wrong when trying to return the book")
+
+def image_name(pages, page, directory):
+    return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
+
+def deobfuscate_image(image_data, link, obf_header):
+    """
+    @Author: https://github.com/justimm
+    Decrypts the first 1024 bytes of image_data using AES-CTR.
+    The obfuscation header (obf_header) is expected in the form "1|<base64-encoded counter>",
+    where the base64-decoded counter is 16 bytes.
+    We derive the AES key by taking the SHA-1 digest of the image URL (with protocol/host removed)
+    and using the first 16 bytes.
+    For AES-CTR, we use a 16-byte counter block. The first 8 bytes are used as a fixed prefix,
+    and the remaining 8 bytes (interpreted as a big-endian integer) are used as the initial counter value.
+    """
+    try:
+        version, counter_b64 = obf_header.split('|')
+    except Exception as e:
+        raise ValueError("Invalid X-Obfuscate header format") from e
+
+    if version != '1':
+        raise ValueError("Unsupported obfuscation version: " + version)
+
+    # Derive AES key: replace protocol/host in link with '/'
+    aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link)
+    sha1_digest = hashlib.sha1(aesKey.encode('utf-8')).digest()
+    key = sha1_digest[:16]
+
+    # Decode the counter (should be 16 bytes)
+    counter_bytes = base64.b64decode(counter_b64)
+    if len(counter_bytes) != 16:
+        raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
+
+    prefix = counter_bytes[:8]
+    initial_value = int.from_bytes(counter_bytes[8:], byteorder='big')
+
+    # Create AES-CTR cipher with a 64-bit counter length.
+ ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) + cipher = AES.new(key, AES.MODE_CTR, counter=ctr) + + decrypted_part = cipher.decrypt(image_data[:1024]) + new_data = decrypted_part + image_data[1024:] + return new_data + +def download_one_image(session, link, i, directory, book_id, pages): + headers = { + "Referer": "https://archive.org/", + "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", + "Sec-Fetch-Site": "same-site", + "Sec-Fetch-Mode": "no-cors", + "Sec-Fetch-Dest": "image", + } + retry = True + response = None + while retry: + try: + response = session.get(link, headers=headers) + if response.status_code == 403: + session = loan(session, book_id, verbose=False) + raise Exception("Borrow again") + elif response.status_code == 200: + retry = False + except: + time.sleep(1) # Wait 1 second before retrying + + image = image_name(pages, i, directory) + + obf_header = response.headers.get("X-Obfuscate") + image_content = None + if obf_header: + try: + image_content = deobfuscate_image(response.content, link, obf_header) + except Exception as e: + print(f"[ERROR] Deobfuscation failed: {e}") + return + else: + image_content = response.content + + with open(image, "wb") as f: + f.write(image_content) + +def download(session, n_threads, directory, links, scale, book_id): + print("Downloading pages...") + links = [f"{link}&rotate=0&scale={scale}" for link in links] + pages = len(links) + + tasks = [] + with futures.ThreadPoolExecutor(max_workers=n_threads) as executor: + for link in links: + i = links.index(link) + tasks.append(executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages)) + for task in tqdm(futures.as_completed(tasks), total=len(tasks)): + pass + + images = [image_name(pages, i, directory) for i in range(len(links))] + return images + +def make_pdf(pdf, title, directory): + file = title+".pdf" + # Handle the case where multiple books with the same name are downloaded + i = 1 + while os.path.isfile(os.path.join(directory, file)): + file = f"{title}({i}).pdf" + i += 1 + + with open(os.path.join(directory, file),"wb") as f: + f.write(pdf) + print(f"[+] PDF saved as \"{file}\"") + +if __name__ == "__main__": + + my_parser = argparse.ArgumentParser() + my_parser.add_argument('-e', '--email', help='Your archive.org email', type=str, required=True) + my_parser.add_argument('-p', '--password', help='Your archive.org password', type=str, required=True) + my_parser.add_argument('-u', '--url', help='Link to the book (https://archive.org/details/XXXX). 
You can use this argument several times to download multiple books', action='append', type=str)
+    my_parser.add_argument('-d', '--dir', help='Output directory', type=str)
+    my_parser.add_argument('-f', '--file', help='File containing the URLs of the books to download', type=str)
+    my_parser.add_argument('-r', '--resolution', help='Image resolution (10 to 0, 0 is the highest), [default 3]', type=int, default=3)
+    my_parser.add_argument('-t', '--threads', help="Maximum number of threads, [default 50]", type=int, default=50)
+    my_parser.add_argument('-j', '--jpg', help="Output individual JPGs rather than a PDF", action='store_true')
+    my_parser.add_argument('-m', '--meta', help="Output the metadata of the book to a json file (-j option required)", action='store_true')
+
+    if len(sys.argv) == 1:
+        my_parser.print_help(sys.stderr)
+        sys.exit(1)
+    args = my_parser.parse_args()
+
+    if args.url is None and args.file is None:
+        my_parser.error("At least one of --url and --file required")
+
+    email = args.email
+    password = args.password
+    scale = args.resolution
+    n_threads = args.threads
+    d = args.dir
+
+    if d is None:
+        d = os.getcwd()
+    elif not os.path.isdir(d):
+        print("Output directory does not exist!")
+        exit()
+
+    if args.url is not None:
+        urls = args.url
+    else:
+        if os.path.exists(args.file):
+            with open(args.file) as f:
+                urls = f.read().strip().split("\n")
+        else:
+            print(f"{args.file} does not exist!")
+            exit()
+
+    # Check the URL format
+    for url in urls:
+        if not url.startswith("https://archive.org/details/"):
+            print(f"{url} --> Invalid URL. URLs must start with \"https://archive.org/details/\"")
+            exit()
+
+    print(f"{len(urls)} Book(s) to download")
+    session = login(email, password)
+
+    for url in urls:
+        book_id = list(filter(None, url.split("/")))[3]
+        print("="*40)
+        print(f"Current book: https://archive.org/details/{book_id}")
+        session = loan(session, book_id)
+        title, links, metadata = get_book_infos(session, url)
+
+        directory = os.path.join(d, title)
+        # Handle the case where multiple books with the same name are downloaded
+        i = 1
+        _directory = directory
+        while os.path.isdir(directory):
+            directory = f"{_directory}({i})"
+            i += 1
+        os.makedirs(directory)
+
+        if args.meta:
+            print("Writing metadata.json...")
+            with open(f"{directory}/metadata.json", 'w') as f:
+                json.dump(metadata, f)
+
+        images = download(session, n_threads, directory, links, scale, book_id)
+
+        if not args.jpg: # Create a PDF from the images and remove the images folder
+            import img2pdf
+
+            # prepare PDF metadata
+            # sometimes archive metadata is missing
+            pdfmeta = {}
+            # ensure metadata values are str
+            for key in ["title", "creator", "associated-names"]:
+                if key in metadata:
+                    if isinstance(metadata[key], str):
+                        pass
+                    elif isinstance(metadata[key], list):
+                        metadata[key] = "; ".join(metadata[key])
+                    else:
+                        raise Exception("unsupported metadata type")
+            # title
+            if 'title' in metadata:
+                pdfmeta['title'] = metadata['title']
+            # author
+            if 'creator' in metadata and 'associated-names' in metadata:
+                pdfmeta['author'] = metadata['creator'] + "; " + metadata['associated-names']
+            elif 'creator' in metadata:
+                pdfmeta['author'] = metadata['creator']
+            elif 'associated-names' in metadata:
+                pdfmeta['author'] = metadata['associated-names']
+            # date
+            if 'date' in metadata:
+                try:
+                    pdfmeta['creationdate'] = datetime.strptime(metadata['date'][0:4], '%Y')
+                except Exception:
+                    pass
+            # keywords
+            pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"]
+
+            pdf = img2pdf.convert(images, **pdfmeta)
+            make_pdf(pdf, title, args.dir if args.dir is not None else "")
+            try:
+                shutil.rmtree(directory)
+            except OSError as e:
+                print(f"Error: {e.filename} - {e.strerror}.")
+
+        return_loan(session, book_id)
\ No newline at end of file
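A quick way to sanity-check the deobfuscation scheme that test-login.py implements is a round trip: fabricate a payload obfuscated with the same key/counter derivation the script assumes (AES-CTR over the first 1024 bytes, key = first 16 bytes of the SHA-1 of the URL path, 16-byte counter split into an 8-byte prefix and a 64-bit big-endian initial value) and confirm that deobfuscate_image() recovers the original bytes. The sketch below is illustrative only and not part of the diff; the file name round_trip_check.py, the helper encrypt_like_archive and the sample URL are made up, and it assumes test-login.py and its dependencies (pycryptodome, requests, tqdm) are importable from the current directory. Because CTR-mode encryption and decryption are the same operation, this only verifies internal consistency, not that the derivation matches what Archive.org actually serves.

# round_trip_check.py -- hypothetical self-test, not part of the diff.
import base64
import hashlib
import importlib.util
import re

from Crypto.Cipher import AES
from Crypto.Util import Counter

# Load test-login.py as a module; its __main__ block is guarded, so nothing runs on import.
spec = importlib.util.spec_from_file_location("test_login", "test-login.py")
test_login = importlib.util.module_from_spec(spec)
spec.loader.exec_module(test_login)

def encrypt_like_archive(plain, link, counter_bytes):
    # Mirror the key/counter derivation used by deobfuscate_image().
    # CTR mode is symmetric, so "encrypting" here yields the obfuscated form.
    path = re.sub(r"^https?:\/\/.*?\/", "/", link)
    key = hashlib.sha1(path.encode("utf-8")).digest()[:16]
    ctr = Counter.new(64, prefix=counter_bytes[:8],
                      initial_value=int.from_bytes(counter_bytes[8:], "big"))
    cipher = AES.new(key, AES.MODE_CTR, counter=ctr)
    return cipher.encrypt(plain[:1024]) + plain[1024:]

link = "https://ia600000.us.archive.org/BookReaderImages.php?id=example"  # made-up URL
counter_bytes = bytes(range(16))                 # deterministic 16-byte counter
obf_header = "1|" + base64.b64encode(counter_bytes).decode("ascii")
plaintext = bytes(i % 251 for i in range(4096))  # arbitrary test payload

obfuscated = encrypt_like_archive(plaintext, link, counter_bytes)
assert test_login.deobfuscate_image(obfuscated, link, obf_header) == plaintext
print("deobfuscate_image round-trip OK")

For an end-to-end run against the live service, the script itself is invoked as, for example, python test-login.py -e <email> -p <password> -u https://archive.org/details/<book_id>, per the argparse options above.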