nose
2025-12-14 00:53:52 -08:00
parent 52a79b0086
commit a03eb0d1be
24 changed files with 2785 additions and 1868 deletions

View File

@@ -12,6 +12,7 @@ import sys
import time
from SYS.logger import log, debug
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS as GLOBAL_SUPPORTED_EXTENSIONS
import tempfile
import logging
from dataclasses import dataclass, field
@@ -1103,9 +1104,7 @@ SUPPORTED_FILETYPES = {
}
# Flatten to get all supported extensions
ALL_SUPPORTED_EXTENSIONS = set()
for category_extensions in SUPPORTED_FILETYPES.values():
ALL_SUPPORTED_EXTENSIONS.update(category_extensions.keys())
ALL_SUPPORTED_EXTENSIONS = set(GLOBAL_SUPPORTED_EXTENSIONS)
# Global Hydrus client cache to reuse session keys

View File

@@ -1,584 +0,0 @@
"""Archive.org API client for borrowing and downloading books.
This module provides low-level functions for interacting with Archive.org:
- Authentication (login, credential management)
- Borrowing (loan, return_loan)
- Book metadata extraction (get_book_infos, get_book_metadata)
- Image downloading and deobfuscation
- PDF creation with metadata
Used by Provider/openlibrary.py for the borrowing workflow.
"""
from __future__ import annotations
import base64
import hashlib
import logging
import os
import re
import sys
import time
from concurrent import futures
from typing import Any, Dict, List, Optional, Sequence, Tuple
import requests
from SYS.logger import log, debug
try:
from Crypto.Cipher import AES # type: ignore
from Crypto.Util import Counter # type: ignore
except ImportError:
AES = None # type: ignore
Counter = None # type: ignore
try:
from tqdm import tqdm # type: ignore
except ImportError:
tqdm = None # type: ignore
def credential_openlibrary(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
"""Get OpenLibrary/Archive.org email and password from config.
Supports both formats:
- New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}}
- Old: {"Archive": {"email": "...", "password": "..."}}
{"archive_org_email": "...", "archive_org_password": "..."}
Returns: (email, password) tuple; each may be None
"""
if not isinstance(config, dict):
return None, None
# Try new format first
provider_config = config.get("provider", {})
if isinstance(provider_config, dict):
openlibrary_config = provider_config.get("openlibrary", {})
if isinstance(openlibrary_config, dict):
email = openlibrary_config.get("email")
password = openlibrary_config.get("password")
if email or password:
return email, password
# Try old nested format
archive_config = config.get("Archive")
if isinstance(archive_config, dict):
email = archive_config.get("email")
password = archive_config.get("password")
if email or password:
return email, password
# Fall back to old flat format
email = config.get("archive_org_email")
password = config.get("archive_org_password")
return email, password
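# Illustrative usage (added for clarity, not part of the original file): both config
# shapes documented above resolve to the same credentials; values are made up.
_new_style = {"provider": {"openlibrary": {"email": "a@b.c", "password": "pw"}}}
_old_style = {"archive_org_email": "a@b.c", "archive_org_password": "pw"}
assert credential_openlibrary(_new_style) == credential_openlibrary(_old_style) == ("a@b.c", "pw")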
class BookNotAvailableError(Exception):
"""Raised when a book is not available for borrowing (waitlisted/in use)."""
pass
def display_error(response: requests.Response, message: str) -> None:
"""Display error and exit."""
log(message, file=sys.stderr)
log(response.text, file=sys.stderr)
sys.exit(1)
def login(email: str, password: str) -> requests.Session:
"""Login to archive.org.
Args:
email: Archive.org email
password: Archive.org password
Returns:
Authenticated requests.Session
Raises:
SystemExit on login failure
"""
session = requests.Session()
session.get("https://archive.org/account/login", timeout=30)
data = {"username": email, "password": password}
response = session.post("https://archive.org/account/login", data=data, timeout=30)
if "bad_login" in response.text:
log("Invalid credentials!", file=sys.stderr)
sys.exit(1)
if "Successful login" in response.text:
debug("Successful login")
return session
display_error(response, "[-] Error while login:")
sys.exit(1) # Unreachable but satisfies type checker
def loan(session: requests.Session, book_id: str, verbose: bool = True) -> requests.Session:
"""Borrow a book from archive.org (14-day loan).
Args:
session: Authenticated requests.Session from login()
book_id: Archive.org book identifier (e.g., 'ia_book_id')
verbose: Whether to log messages
Returns:
Session with active loan
Raises:
SystemExit on loan failure
"""
data = {"action": "grant_access", "identifier": book_id}
response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30)
data["action"] = "browse_book"
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
if response.status_code == 400:
try:
if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
debug("Book is not available for borrowing (waitlisted or in use)")
raise BookNotAvailableError("Book is waitlisted or in use")
display_error(response, "Something went wrong when trying to borrow the book.")
except BookNotAvailableError:
raise
except:
display_error(response, "The book cannot be borrowed")
data["action"] = "create_token"
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
if "token" in response.text:
if verbose:
debug("Successful loan")
return session
display_error(response, "Something went wrong when trying to borrow the book.")
sys.exit(1) # Unreachable but satisfies type checker
def return_loan(session: requests.Session, book_id: str) -> None:
"""Return a borrowed book.
Args:
session: Authenticated requests.Session with active loan
book_id: Archive.org book identifier
"""
data = {"action": "return_loan", "identifier": book_id}
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
if response.status_code == 200 and response.json()["success"]:
debug("Book returned")
else:
display_error(response, "Something went wrong when trying to return the book")
def get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
"""Extract book information and page links from archive.org viewer.
Args:
session: Authenticated requests.Session
url: Book URL (e.g., https://archive.org/borrow/book_id or /details/book_id)
Returns:
Tuple of (title, page_links, metadata)
Raises:
RuntimeError: If page data cannot be extracted
"""
r = session.get(url, timeout=30).text
# Try to extract the infos URL from the response
try:
# Look for the "url" field in the response using regex
# Matches "url":"//archive.org/..."
import re
match = re.search(r'"url"\s*:\s*"([^"]+)"', r)
if not match:
raise ValueError("No 'url' field found in response")
url_path = match.group(1)
if url_path.startswith("//"):
infos_url = "https:" + url_path
else:
infos_url = url_path
infos_url = infos_url.replace("\\u0026", "&")
except (IndexError, ValueError, AttributeError) as e:
# If URL extraction fails, raise with better error message
raise RuntimeError(f"Failed to extract book info URL from response: {e}")
response = session.get(infos_url, timeout=30)
data = response.json()["data"]
title = data["brOptions"]["bookTitle"].strip().replace(" ", "_")
title = "".join(c for c in title if c not in '<>:"/\\|?*') # Filter forbidden chars
title = title[:150] # Trim to avoid long file names
metadata = data["metadata"]
links = []
# Safely extract page links from brOptions data
try:
br_data = data.get("brOptions", {}).get("data", [])
for item in br_data:
if isinstance(item, list):
for page in item:
if isinstance(page, dict) and "uri" in page:
links.append(page["uri"])
elif isinstance(item, dict) and "uri" in item:
links.append(item["uri"])
except (KeyError, IndexError, TypeError) as e:
log(f"Warning: Error parsing page links: {e}", file=sys.stderr)
# Continue with whatever links we found
if len(links) > 1:
debug(f"Found {len(links)} pages")
return title, links, metadata
elif len(links) == 1:
debug(f"Found {len(links)} page")
return title, links, metadata
else:
log("Error while getting image links - no pages found", file=sys.stderr)
raise RuntimeError("No pages found in book data")
def image_name(pages: int, page: int, directory: str) -> str:
"""Generate image filename for page.
Args:
pages: Total number of pages
page: Current page number (0-indexed)
directory: Directory to save to
Returns:
Full path to image file
"""
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
def deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes:
"""Decrypt obfuscated image data using AES-CTR.
This handles Archive.org's image obfuscation for borrowed books.
Based on: https://github.com/justimm
Args:
image_data: Encrypted image bytes
link: Image URL (used to derive AES key)
obf_header: X-Obfuscate header value (format: "1|BASE64_COUNTER")
Returns:
Decrypted image bytes
"""
if not AES or not Counter:
raise RuntimeError("Crypto library not available")
try:
version, counter_b64 = obf_header.split("|")
except Exception as e:
raise ValueError("Invalid X-Obfuscate header format") from e
if version != "1":
raise ValueError("Unsupported obfuscation version: " + version)
# Derive AES key from URL
aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link)
sha1_digest = hashlib.sha1(aesKey.encode("utf-8")).digest()
key = sha1_digest[:16]
# Decode counter
counter_bytes = base64.b64decode(counter_b64)
if len(counter_bytes) != 16:
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
prefix = counter_bytes[:8]
initial_value = int.from_bytes(counter_bytes[8:], byteorder="big")
# Create AES-CTR cipher
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore
cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore
decrypted_part = cipher.decrypt(image_data[:1024])
new_data = decrypted_part + image_data[1024:]
return new_data
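# Round-trip sketch (added for illustration; assumes pycryptodome is installed and uses
# made-up values, not a real Archive.org response). Obfuscation only touches the first
# 1 KiB, so encrypting that prefix with the same derived key/counter and then calling
# deobfuscate_image() must give back the original bytes.
if AES and Counter:
    _link = "https://ia800100.us.archive.org/BookReader/BookReaderImages.php?id=demo"
    _counter_bytes = bytes(range(16))
    _header = "1|" + base64.b64encode(_counter_bytes).decode("ascii")
    _key = hashlib.sha1(re.sub(r"^https?:\/\/.*?\/", "/", _link).encode("utf-8")).digest()[:16]
    _ctr = Counter.new(64, prefix=_counter_bytes[:8], initial_value=int.from_bytes(_counter_bytes[8:], "big"))
    _plain = b"\xff\xd8\xff" + b"\x00" * 2000  # fake JPEG-ish payload
    _obfuscated = AES.new(_key, AES.MODE_CTR, counter=_ctr).encrypt(_plain[:1024]) + _plain[1024:]
    assert deobfuscate_image(_obfuscated, _link, _header) == _plain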
def download_one_image(
session: requests.Session,
link: str,
i: int,
directory: str,
book_id: str,
pages: int,
) -> None:
"""Download a single book page image.
Handles obfuscated images and re-borrowing on 403 errors.
Args:
session: Authenticated requests.Session
link: Direct image URL
i: Page index (0-based)
directory: Directory to save to
book_id: Archive.org book ID (for re-borrowing on 403)
pages: Total number of pages
"""
headers = {
"Referer": "https://archive.org/",
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
"Sec-Fetch-Site": "same-site",
"Sec-Fetch-Mode": "no-cors",
"Sec-Fetch-Dest": "image",
}
retry = True
response = None
while retry:
try:
response = session.get(link, headers=headers, timeout=30)
if response.status_code == 403:
session = loan(session, book_id, verbose=False)
raise Exception("Borrow again")
if response.status_code == 200:
retry = False
except:
time.sleep(1)
image = image_name(pages, i, directory)
if response is None:
log(f"Failed to download page {i}", file=sys.stderr)
return
obf_header = response.headers.get("X-Obfuscate")
image_content = None
if obf_header:
try:
image_content = deobfuscate_image(response.content, link, obf_header)
except Exception as e:
log(f"Deobfuscation failed: {e}", file=sys.stderr)
return
else:
image_content = response.content
with open(image, "wb") as f:
f.write(image_content)
def download(
session: requests.Session,
n_threads: int,
directory: str,
links: List[str],
scale: int,
book_id: str,
) -> List[str]:
"""Download all book pages as images.
Uses thread pool for parallel downloads.
Args:
session: Authenticated requests.Session
n_threads: Number of download threads
directory: Directory to save images to
links: List of image URLs
scale: Image resolution (0=highest, 10=lowest)
book_id: Archive.org book ID (for re-borrowing)
Returns:
List of downloaded image file paths
"""
debug("Downloading pages...")
links = [f"{link}&rotate=0&scale={scale}" for link in links]
pages = len(links)
tasks = []
with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
for link in links:
i = links.index(link)
tasks.append(
executor.submit(
download_one_image,
session=session,
link=link,
i=i,
directory=directory,
book_id=book_id,
pages=pages,
)
)
if tqdm:
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
pass
else:
for _ in futures.as_completed(tasks):
pass
images = [image_name(pages, i, directory) for i in range(len(links))]
return images
def check_direct_download(book_id: str) -> Tuple[bool, str]:
"""Check if a book can be downloaded directly without borrowing.
Searches Archive.org metadata for downloadable PDF files.
Args:
book_id: Archive.org book identifier
Returns:
Tuple of (can_download: bool, pdf_url: str)
"""
try:
# First, try to get the metadata to find the actual PDF filename
metadata_url = f"https://archive.org/metadata/{book_id}"
response = requests.get(metadata_url, timeout=10)
response.raise_for_status()
metadata = response.json()
# Find PDF file in files list
if "files" in metadata:
for file_info in metadata["files"]:
filename = file_info.get("name", "")
if filename.endswith(".pdf") and file_info.get("source") == "original":
# Found the original PDF
pdf_filename = filename
pdf_url = f"https://archive.org/download/{book_id}/{pdf_filename.replace(' ', '%20')}"
# Verify it's accessible
check_response = requests.head(pdf_url, timeout=5, allow_redirects=True)
if check_response.status_code == 200:
return True, pdf_url
return False, ""
except Exception as e:
log(f"Error checking direct download: {e}", file=sys.stderr)
return False, ""
def get_openlibrary_by_isbn(isbn: str) -> Dict[str, Any]:
"""Fetch book data from OpenLibrary using ISBN.
Args:
isbn: ISBN-10 or ISBN-13 to search for
Returns:
Dictionary with book metadata from OpenLibrary
"""
try:
# Try ISBN API first
api_url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json"
response = requests.get(api_url, timeout=10)
response.raise_for_status()
data = response.json()
if data:
# Get first result
key = list(data.keys())[0]
return data[key]
return {}
except Exception as e:
log(f"Error fetching OpenLibrary data by ISBN: {e}", file=sys.stderr)
return {}
def extract_isbn_from_metadata(metadata: Dict[str, Any]) -> str:
"""Extract ISBN from archive.org metadata.
Looks for ISBN in various metadata fields.
Args:
metadata: Archive.org metadata dictionary
Returns:
ISBN string (clean, no hyphens) or empty string if not found
"""
# Try various common metadata fields
isbn_fields = [
"isbn", "ISBN", "isbn_13", "isbn_10", "isbns",
"isbn-10", "isbn-13", "identifer_isbn"
]
for field in isbn_fields:
if field in metadata:
isbn_val = metadata[field]
if isinstance(isbn_val, list):
isbn_val = isbn_val[0] if isbn_val else None
if isbn_val and isinstance(isbn_val, str):
# Clean ISBN (remove hyphens, spaces)
isbn_clean = isbn_val.replace("-", "").replace(" ", "")
if len(isbn_clean) in [10, 13]:
return isbn_clean
return ""
def normalize_url(url: str) -> str:
"""Convert openlibrary.org URL to archive.org URL.
Looks up the actual Archive.org ID from OpenLibrary API.
Args:
url: Book URL (archive.org or openlibrary.org format)
Returns:
Normalized archive.org URL
"""
url = url.strip()
# Already archive.org format
if url.startswith("https://archive.org/details/"):
return url
# Convert openlibrary.org format by querying the OpenLibrary API
if "openlibrary.org/books/" in url:
try:
# Extract the book ID (e.g., OL6796852M)
parts = url.split("/books/")
if len(parts) > 1:
book_id = parts[1].split("/")[0]
# Query OpenLibrary API to get the book metadata
api_url = f"https://openlibrary.org/books/{book_id}.json"
response = requests.get(api_url, timeout=10)
response.raise_for_status()
data = response.json()
# Look for identifiers including internet_archive or ocaid
# First try ocaid (Open Content Alliance ID) - this is most common
if "ocaid" in data:
ocaid = data["ocaid"]
return f"https://archive.org/details/{ocaid}"
# Check for identifiers object
if "identifiers" in data:
identifiers = data["identifiers"]
# Look for internet_archive ID
if "internet_archive" in identifiers:
ia_ids = identifiers["internet_archive"]
if isinstance(ia_ids, list) and ia_ids:
ia_id = ia_ids[0]
else:
ia_id = ia_ids
return f"https://archive.org/details/{ia_id}"
# If no IA identifier found, use the book ID as fallback
log(f"No Internet Archive ID found for {book_id}. Attempting with OpenLibrary ID.", file=sys.stderr)
return f"https://archive.org/details/{book_id}"
except requests.RequestException as e:
log(f"Could not fetch OpenLibrary metadata: {e}", file=sys.stderr)
# Fallback to using the book ID directly
parts = url.split("/books/")
if len(parts) > 1:
book_id = parts[1].split("/")[0]
return f"https://archive.org/details/{book_id}"
except (KeyError, IndexError) as e:
log(f"Error parsing OpenLibrary response: {e}", file=sys.stderr)
# Fallback to using the book ID directly
parts = url.split("/books/")
if len(parts) > 1:
book_id = parts[1].split("/")[0]
return f"https://archive.org/details/{book_id}"
# Return original if can't parse
return url
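# Illustrative behaviour (values hypothetical; the openlibrary.org branch costs one API call):
#   normalize_url("https://archive.org/details/some_ia_id")
#       -> returned unchanged (already an archive.org details URL)
#   normalize_url("https://openlibrary.org/books/OL6796852M/Some_Title")
#       -> "https://archive.org/details/<ocaid>" when the edition JSON exposes an ocaid,
#          otherwise falls back to "https://archive.org/details/OL6796852M"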

View File

@@ -407,38 +407,53 @@ class API_folder_store:
logger.error(f"Error clearing worker log for {worker_id}: {exc}", exc_info=True)
def _migrate_metadata_schema(self, cursor) -> None:
"""Import legacy metadata from old schema if present. Existing hash-based schema is ready to use."""
"""Ensure metadata schema is up-to-date.
- If a legacy schema is detected, attempt to import/upgrade (best-effort).
- If the hash-based schema exists, add any missing columns expected by current code.
"""
try:
# Check if this is a fresh new database (hash-based schema)
cursor.execute('PRAGMA table_info(metadata)')
existing_columns = {row[1] for row in cursor.fetchall()}
# If hash column exists, we're already on the new schema
if 'hash' in existing_columns:
logger.info("Database is already using hash-based schema - no migration needed")
return
# Legacy migration: If old schema exists, try to import data
# Legacy migration: If old schema exists, try to import data.
# Old schema would have had: id (INTEGER PRIMARY KEY), file_hash (TEXT), etc.
if 'id' in existing_columns and 'file_hash' in existing_columns:
logger.info("Detected legacy metadata schema - importing to new hash-based schema")
# This would be complex legacy migration - for now just note it
logger.info("Legacy metadata table detected but import not yet implemented")
if 'hash' not in existing_columns:
if 'id' in existing_columns and 'file_hash' in existing_columns:
logger.info("Detected legacy metadata schema - importing to new hash-based schema")
# This would be complex legacy migration - for now just note it.
logger.info("Legacy metadata table detected but import not yet implemented")
return
# Unknown/unsupported schema; nothing we can safely do here.
return
# Add any missing columns to the new schema
for col_name, col_def in [('size', 'INTEGER'), ('ext', 'TEXT'),
('type', 'TEXT'),
('time_imported', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP'),
('time_modified', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP')]:
# Hash-based schema exists: add any missing columns expected by current code.
# These are safe ALTER TABLE additions for older DBs.
column_specs = {
'size': 'INTEGER',
'ext': 'TEXT',
'type': 'TEXT',
'url': 'TEXT',
'relationships': 'TEXT',
'duration': 'REAL',
'time_imported': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
'time_modified': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
'created_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
'updated_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
}
for col_name, col_def in column_specs.items():
if col_name not in existing_columns:
try:
cursor.execute(f"ALTER TABLE metadata ADD COLUMN {col_name} {col_def}")
existing_columns.add(col_name)
logger.info(f"Added '{col_name}' column to metadata table")
except Exception as e:
logger.debug(f"Column '{col_name}' may already exist: {e}")
# Populate type column from ext if not already populated
# Populate type column from ext if not already populated.
if 'type' in existing_columns and 'ext' in existing_columns:
try:
from SYS.utils_constant import get_type_from_ext
@@ -451,7 +466,7 @@ class API_folder_store:
logger.info(f"Populated type column for {len(rows)} metadata entries")
except Exception as e:
logger.debug(f"Could not populate type column: {e}")
self.connection.commit()
except Exception as e:
logger.debug(f"Note: Schema import/migration completed with status: {e}")
@@ -929,6 +944,13 @@ class API_folder_store:
if not fields:
return
# Ensure a metadata row exists so updates don't silently no-op.
# This can happen for older DBs or entries created without explicit metadata.
cursor.execute(
"INSERT OR IGNORE INTO metadata (hash) VALUES (?)",
(file_hash,),
)
values.append(file_hash)
sql = f"UPDATE metadata SET {', '.join(fields)}, time_modified = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP WHERE hash = ?"
@@ -1681,6 +1703,84 @@ class DatabaseAPI:
)
return {row[0] for row in cursor.fetchall()}
def get_file_hashes_with_any_url(self, limit: Optional[int] = None) -> Set[str]:
"""Get hashes of files that have any non-empty URL metadata."""
cursor = self.get_cursor()
cursor.execute(
"""
SELECT DISTINCT f.hash
FROM files f
JOIN metadata m ON f.hash = m.hash
WHERE m.url IS NOT NULL
AND TRIM(m.url) != ''
AND TRIM(m.url) != '[]'
LIMIT ?
""",
(limit or 10000,),
)
return {row[0] for row in cursor.fetchall()}
def get_file_hashes_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> Set[str]:
"""Get hashes of files whose URL metadata contains a substring (case-insensitive)."""
cursor = self.get_cursor()
cursor.execute(
"""
SELECT DISTINCT f.hash
FROM files f
JOIN metadata m ON f.hash = m.hash
WHERE m.url IS NOT NULL
AND LOWER(m.url) LIKE ?
LIMIT ?
""",
(like_pattern.lower(), limit or 10000),
)
return {row[0] for row in cursor.fetchall()}
def get_files_with_any_url(self, limit: Optional[int] = None) -> List[tuple]:
"""Get files that have any non-empty URL metadata.
Returns (hash, file_path, size, ext) tuples.
"""
cursor = self.get_cursor()
cursor.execute(
"""
SELECT f.hash, f.file_path,
COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext
FROM files f
JOIN metadata m ON f.hash = m.hash
WHERE m.url IS NOT NULL
AND TRIM(m.url) != ''
AND TRIM(m.url) != '[]'
ORDER BY f.file_path
LIMIT ?
""",
(limit or 10000,),
)
return cursor.fetchall()
def get_files_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> List[tuple]:
"""Get files whose URL metadata contains a substring (case-insensitive).
Returns (hash, file_path, size, ext) tuples.
"""
cursor = self.get_cursor()
cursor.execute(
"""
SELECT f.hash, f.file_path,
COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext
FROM files f
JOIN metadata m ON f.hash = m.hash
WHERE m.url IS NOT NULL
AND LOWER(m.url) LIKE ?
ORDER BY f.file_path
LIMIT ?
""",
(like_pattern.lower(), limit or 10000),
)
return cursor.fetchall()
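# Illustrative usage (not part of the diff; `api` is a DatabaseAPI instance and the caller
# supplies a ready-made, lowercased SQL LIKE pattern):
#   hashes = api.get_file_hashes_by_url_like("%archive.org%")
#   rows = api.get_files_by_url_like("%archive.org%", limit=50)  # (hash, file_path, size, ext) tuples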
def get_file_metadata(self, file_hashes: Set[str], limit: Optional[int] = None) -> List[tuple]:
"""Get metadata for files given their hashes. Returns (hash, file_path, size, extension) tuples."""
if not file_hashes:

CLI.py
View File

@@ -1498,6 +1498,9 @@ def _execute_pipeline(tokens: list):
elif table_type == 'soulseek':
print(f"Auto-piping Soulseek selection to download-file")
stages.append(['download-file'])
elif table_type == 'openlibrary':
print(f"Auto-piping OpenLibrary selection to download-file")
stages.append(['download-file'])
elif source_cmd == 'search-file' and source_args and 'youtube' in source_args:
# Legacy check
print(f"Auto-piping YouTube selection to .pipe")
@@ -1667,6 +1670,35 @@ def _execute_pipeline(tokens: list):
filtered_pipe_objs = [coerce_to_pipe_object(item) for item in filtered]
piped_result = filtered_pipe_objs if len(filtered_pipe_objs) > 1 else filtered_pipe_objs[0]
print(f"Selected {len(filtered)} item(s) using {cmd_name}")
# If selection is the last stage and looks like a provider result,
# auto-initiate the borrow/download flow.
if stage_index + 1 >= len(stages):
try:
from ProviderCore.registry import get_search_provider as _get_search_provider
except Exception:
_get_search_provider = None
if _get_search_provider is not None:
selected_list = filtered_pipe_objs
provider_table: Optional[str] = None
try:
for obj in selected_list:
extra = getattr(obj, "extra", None)
if isinstance(extra, dict) and extra.get("table"):
provider_table = str(extra.get("table"))
break
except Exception:
provider_table = None
if provider_table:
try:
provider = _get_search_provider(provider_table, config)
except Exception:
provider = None
if provider is not None:
print("Auto-downloading selection via download-file")
stages.append(["download-file"])
continue
else:
print(f"No items matched selection {cmd_name}\n")
@@ -1736,13 +1768,14 @@ def _execute_pipeline(tokens: list):
}
# Display-only commands (just show data, don't modify or search)
display_only_commands = {
'get-url', 'get_url', 'get-note', 'get_note',
'get-note', 'get_note',
'get-relationship', 'get_relationship', 'get-file', 'get_file',
'check-file-status', 'check_file_status'
}
# Commands that manage their own table/history state (e.g. get-tag)
self_managing_commands = {
'get-tag', 'get_tag', 'tags',
'get-url', 'get_url',
'search-file', 'search_file'
}

View File

@@ -1,19 +1,38 @@
from __future__ import annotations
import base64
from concurrent import futures
import hashlib
import json as json_module
import re
import shutil
import sys
import tempfile
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import requests
from API.HTTP import HTTPClient
from ProviderCore.base import SearchProvider, SearchResult
from ProviderCore.download import download_file, sanitize_filename
from cli_syntax import get_field, get_free_text, parse_query
from SYS.logger import log
from SYS.utils import unique_path
try:
from Crypto.Cipher import AES # type: ignore
from Crypto.Util import Counter # type: ignore
except ImportError:
AES = None # type: ignore
Counter = None # type: ignore
try:
from tqdm import tqdm # type: ignore
except ImportError:
tqdm = None # type: ignore
def _looks_like_isbn(text: str) -> bool:
t = (text or "").replace("-", "").strip()
@@ -38,6 +57,13 @@ def _resolve_edition_id(doc: Dict[str, Any]) -> str:
edition_key = doc.get("edition_key")
if isinstance(edition_key, list) and edition_key:
return str(edition_key[0]).strip()
if isinstance(edition_key, str) and edition_key.strip():
return edition_key.strip()
# Often present even when edition_key is missing.
cover_edition_key = doc.get("cover_edition_key")
if isinstance(cover_edition_key, str) and cover_edition_key.strip():
return cover_edition_key.strip()
# Fallback: sometimes key can be /books/OL...M
key = doc.get("key")
@@ -54,7 +80,7 @@ def _check_lendable(session: requests.Session, edition_id: str) -> Tuple[bool, s
return False, "not-an-edition"
url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{edition_id}"
resp = session.get(url, timeout=10)
resp = session.get(url, timeout=6)
resp.raise_for_status()
data = resp.json() or {}
wrapped = data.get(f"OLID:{edition_id}")
@@ -88,7 +114,7 @@ def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidate
# Otherwise query the edition JSON.
try:
resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=10)
resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=6)
resp.raise_for_status()
data = resp.json() or {}
@@ -116,6 +142,522 @@ class OpenLibrary(SearchProvider):
super().__init__(config)
self._session = requests.Session()
class BookNotAvailableError(Exception):
"""Raised when a book is not available for borrowing (waitlisted/in use)."""
@staticmethod
def _credential_archive(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
"""Get Archive.org email/password from config.
Supports:
- New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}}
- Old: {"Archive": {"email": "...", "password": "..."}}
{"archive_org_email": "...", "archive_org_password": "..."}
"""
if not isinstance(config, dict):
return None, None
provider_config = config.get("provider", {})
if isinstance(provider_config, dict):
openlibrary_config = provider_config.get("openlibrary", {})
if isinstance(openlibrary_config, dict):
email = openlibrary_config.get("email")
password = openlibrary_config.get("password")
if email or password:
return str(email) if email is not None else None, str(password) if password is not None else None
archive_config = config.get("Archive")
if isinstance(archive_config, dict):
email = archive_config.get("email")
password = archive_config.get("password")
if email or password:
return str(email) if email is not None else None, str(password) if password is not None else None
email = config.get("archive_org_email")
password = config.get("archive_org_password")
return str(email) if email is not None else None, str(password) if password is not None else None
@staticmethod
def _archive_error_body(response: requests.Response) -> str:
try:
body = response.text or ""
except Exception:
return ""
if len(body) > 2000:
return body[:1200] + "\n... (truncated) ...\n" + body[-400:]
return body
@classmethod
def _archive_login(cls, email: str, password: str) -> requests.Session:
"""Login to archive.org using the token-based services endpoint (matches test-login.py)."""
session = requests.Session()
token_resp = session.get("https://archive.org/services/account/login/", timeout=30)
try:
token_json = token_resp.json()
except Exception as exc:
raise RuntimeError(f"Archive login token parse failed: {exc}\n{cls._archive_error_body(token_resp)}")
if not token_json.get("success"):
raise RuntimeError(f"Archive login token fetch failed\n{cls._archive_error_body(token_resp)}")
token = (token_json.get("value") or {}).get("token")
if not token:
raise RuntimeError("Archive login token missing")
headers = {"Content-Type": "application/x-www-form-urlencoded"}
payload = {"username": email, "password": password, "t": token}
login_resp = session.post(
"https://archive.org/services/account/login/",
headers=headers,
data=json_module.dumps(payload),
timeout=30,
)
try:
login_json = login_resp.json()
except Exception as exc:
raise RuntimeError(f"Archive login parse failed: {exc}\n{cls._archive_error_body(login_resp)}")
if login_json.get("success") is False:
if login_json.get("value") == "bad_login":
raise RuntimeError("Invalid Archive.org credentials")
raise RuntimeError(f"Archive login failed: {login_json}")
return session
@classmethod
def _archive_loan(cls, session: requests.Session, book_id: str, *, verbose: bool = True) -> requests.Session:
data = {"action": "grant_access", "identifier": book_id}
session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30)
data["action"] = "browse_book"
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
if response.status_code == 400:
try:
err = (response.json() or {}).get("error")
if err == "This book is not available to borrow at this time. Please try again later.":
raise cls.BookNotAvailableError("Book is waitlisted or in use")
raise RuntimeError(f"Borrow failed: {err or response.text}")
except cls.BookNotAvailableError:
raise
except Exception:
raise RuntimeError("The book cannot be borrowed")
data["action"] = "create_token"
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
if "token" in (response.text or ""):
return session
raise RuntimeError("Something went wrong when trying to borrow the book")
@staticmethod
def _archive_return_loan(session: requests.Session, book_id: str) -> None:
data = {"action": "return_loan", "identifier": book_id}
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
if response.status_code == 200:
try:
if (response.json() or {}).get("success"):
return
except Exception:
pass
raise RuntimeError("Something went wrong when trying to return the book")
@staticmethod
def _archive_get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
"""Extract page links from Archive.org book reader."""
r = session.get(url, timeout=30).text
# Matches: "url":"//archive.org/..." (allow whitespace)
match = re.search(r'"url"\s*:\s*"([^"]+)"', r)
if not match:
raise RuntimeError("Failed to extract book info URL from response")
url_path = match.group(1)
infos_url = ("https:" + url_path) if url_path.startswith("//") else url_path
infos_url = infos_url.replace("\\u0026", "&")
response = session.get(infos_url, timeout=30)
payload = response.json()
data = payload["data"]
title = str(data["brOptions"]["bookTitle"]).strip().replace(" ", "_")
title = "".join(c for c in title if c not in '<>:"/\\|?*')
title = title[:150]
metadata = data.get("metadata") or {}
links: List[str] = []
br_data = (data.get("brOptions") or {}).get("data", [])
if isinstance(br_data, list):
for item in br_data:
if isinstance(item, list):
for page in item:
if isinstance(page, dict) and "uri" in page:
links.append(page["uri"])
elif isinstance(item, dict) and "uri" in item:
links.append(item["uri"])
if not links:
raise RuntimeError("No pages found in book data")
return title, links, metadata if isinstance(metadata, dict) else {}
@staticmethod
def _archive_image_name(pages: int, page: int, directory: str) -> str:
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
@staticmethod
def _archive_deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes:
if not AES or not Counter:
raise RuntimeError("Crypto library not available")
try:
version, counter_b64 = obf_header.split("|")
except Exception as exc:
raise ValueError("Invalid X-Obfuscate header format") from exc
if version != "1":
raise ValueError("Unsupported obfuscation version: " + version)
aes_key = re.sub(r"^https?:\/\/.*?\/", "/", link)
sha1_digest = hashlib.sha1(aes_key.encode("utf-8")).digest()
key = sha1_digest[:16]
counter_bytes = base64.b64decode(counter_b64)
if len(counter_bytes) != 16:
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
prefix = counter_bytes[:8]
initial_value = int.from_bytes(counter_bytes[8:], byteorder="big")
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore
cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore
decrypted_part = cipher.decrypt(image_data[:1024])
return decrypted_part + image_data[1024:]
@classmethod
def _archive_download_one_image(
cls,
session: requests.Session,
link: str,
i: int,
directory: str,
book_id: str,
pages: int,
) -> None:
headers = {
"Referer": "https://archive.org/",
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
"Sec-Fetch-Site": "same-site",
"Sec-Fetch-Mode": "no-cors",
"Sec-Fetch-Dest": "image",
}
while True:
try:
response = session.get(link, headers=headers, timeout=30)
if response.status_code == 403:
cls._archive_loan(session, book_id, verbose=False)
raise RuntimeError("Borrow again")
if response.status_code == 200:
break
except Exception:
time.sleep(1)
image = cls._archive_image_name(pages, i, directory)
obf_header = response.headers.get("X-Obfuscate")
if obf_header:
image_content = cls._archive_deobfuscate_image(response.content, link, obf_header)
else:
image_content = response.content
with open(image, "wb") as f:
f.write(image_content)
@classmethod
def _archive_download(
cls,
session: requests.Session,
n_threads: int,
directory: str,
links: List[str],
scale: int,
book_id: str,
) -> List[str]:
links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links]
pages = len(links_scaled)
tasks = []
with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
for i, link in enumerate(links_scaled):
tasks.append(
executor.submit(
cls._archive_download_one_image,
session=session,
link=link,
i=i,
directory=directory,
book_id=book_id,
pages=pages,
)
)
if tqdm:
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
pass
else:
for _ in futures.as_completed(tasks):
pass
return [cls._archive_image_name(pages, i, directory) for i in range(pages)]
@staticmethod
def _archive_check_direct_download(book_id: str) -> Tuple[bool, str]:
"""Check for a directly downloadable original PDF in Archive.org metadata."""
try:
metadata_url = f"https://archive.org/metadata/{book_id}"
response = requests.get(metadata_url, timeout=6)
response.raise_for_status()
metadata = response.json()
files = metadata.get("files") if isinstance(metadata, dict) else None
if isinstance(files, list):
for file_info in files:
if not isinstance(file_info, dict):
continue
filename = str(file_info.get("name", ""))
if filename.endswith(".pdf") and file_info.get("source") == "original":
pdf_url = f"https://archive.org/download/{book_id}/{filename.replace(' ', '%20')}"
check_response = requests.head(pdf_url, timeout=4, allow_redirects=True)
if check_response.status_code == 200:
return True, pdf_url
return False, ""
except Exception:
return False, ""
@staticmethod
def scrape_isbn_metadata(isbn: str) -> List[str]:
"""Scrape tags for an ISBN using Open Library API.
Returns tags such as:
- title:<...>, author:<...>, publish_date:<...>, publisher:<...>, description:<...>, pages:<...>
- identifiers: openlibrary:<...>, lccn:<...>, oclc:<...>, goodreads:<...>, librarything:<...>, doi:<...>, internet_archive:<...>
"""
new_tags: List[str] = []
isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip()
if not isbn_clean:
return []
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode("utf-8"))
except Exception as exc:
log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr)
return []
if not data:
log(f"No ISBN metadata found for: {isbn}")
return []
book_data = next(iter(data.values()), None)
if not isinstance(book_data, dict):
return []
if "title" in book_data:
new_tags.append(f"title:{book_data['title']}")
authors = book_data.get("authors")
if isinstance(authors, list):
for author in authors[:3]:
if isinstance(author, dict) and author.get("name"):
new_tags.append(f"author:{author['name']}")
if book_data.get("publish_date"):
new_tags.append(f"publish_date:{book_data['publish_date']}")
publishers = book_data.get("publishers")
if isinstance(publishers, list) and publishers:
pub = publishers[0]
if isinstance(pub, dict) and pub.get("name"):
new_tags.append(f"publisher:{pub['name']}")
if "description" in book_data:
desc = book_data.get("description")
if isinstance(desc, dict) and "value" in desc:
desc = desc.get("value")
if desc:
desc_str = str(desc).strip()
if desc_str:
new_tags.append(f"description:{desc_str[:200]}")
page_count = book_data.get("number_of_pages")
if isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
identifiers = book_data.get("identifiers")
if isinstance(identifiers, dict):
def _first(value: Any) -> Any:
if isinstance(value, list) and value:
return value[0]
return value
for key, ns in (
("openlibrary", "openlibrary"),
("lccn", "lccn"),
("oclc", "oclc"),
("goodreads", "goodreads"),
("librarything", "librarything"),
("doi", "doi"),
("internet_archive", "internet_archive"),
):
val = _first(identifiers.get(key))
if val:
new_tags.append(f"{ns}:{val}")
log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
return new_tags
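# Illustrative result shape (requires a network call; values are hypothetical):
#   OpenLibrary.scrape_isbn_metadata("9780306406157")
#   -> ["title:...", "author:...", "publish_date:...", "publisher:...", "pages:...",
#       "openlibrary:OL...M", "lccn:...", "oclc:..."]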
@staticmethod
def scrape_openlibrary_metadata(olid: str) -> List[str]:
"""Scrape tags for an OpenLibrary ID using the .json API endpoint."""
new_tags: List[str] = []
olid_text = str(olid or "").strip()
if not olid_text:
return []
# Normalize OLID to the common "OL<digits>M" form when possible.
olid_norm = olid_text
try:
if not olid_norm.startswith("OL"):
olid_norm = f"OL{olid_norm}"
if not olid_norm.endswith("M"):
olid_norm = f"{olid_norm}M"
except Exception:
olid_norm = olid_text
# Ensure we always include a scrapeable identifier tag.
new_tags.append(f"openlibrary:{olid_norm}")
# Accept OL9674499M, 9674499M, or just digits.
olid_clean = olid_text.replace("OL", "").replace("M", "")
if not olid_clean.isdigit():
olid_clean = olid_text
if not olid_text.startswith("OL"):
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
else:
url = f"https://openlibrary.org/books/{olid_text}.json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode("utf-8"))
except Exception as exc:
log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr)
return []
if not isinstance(data, dict) or not data:
log(f"No OpenLibrary metadata found for: {olid_text}")
return []
if "title" in data:
new_tags.append(f"title:{data['title']}")
authors = data.get("authors")
if isinstance(authors, list):
for author in authors[:3]:
if isinstance(author, dict) and author.get("name"):
new_tags.append(f"author:{author['name']}")
continue
# Common OL shape: {"key": "/authors/OL...A"} or {"author": {"key": ...}}
author_key = None
if isinstance(author, dict):
if isinstance(author.get("author"), dict):
author_key = author.get("author", {}).get("key")
if not author_key:
author_key = author.get("key")
if isinstance(author_key, str) and author_key.startswith("/"):
try:
author_url = f"https://openlibrary.org{author_key}.json"
with HTTPClient(timeout=10) as client:
author_resp = client.get(author_url)
author_resp.raise_for_status()
author_data = json_module.loads(author_resp.content.decode("utf-8"))
if isinstance(author_data, dict) and author_data.get("name"):
new_tags.append(f"author:{author_data['name']}")
continue
except Exception:
pass
if isinstance(author, str) and author:
new_tags.append(f"author:{author}")
if data.get("publish_date"):
new_tags.append(f"publish_date:{data['publish_date']}")
publishers = data.get("publishers")
if isinstance(publishers, list) and publishers:
pub = publishers[0]
if isinstance(pub, dict) and pub.get("name"):
new_tags.append(f"publisher:{pub['name']}")
elif isinstance(pub, str) and pub:
new_tags.append(f"publisher:{pub}")
if "description" in data:
desc = data.get("description")
if isinstance(desc, dict) and "value" in desc:
desc = desc.get("value")
if desc:
desc_str = str(desc).strip()
if desc_str:
new_tags.append(f"description:{desc_str[:200]}")
page_count = data.get("number_of_pages")
if isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
subjects = data.get("subjects")
if isinstance(subjects, list):
for subject in subjects[:10]:
if isinstance(subject, str):
subject_clean = subject.strip()
if subject_clean and subject_clean not in new_tags:
new_tags.append(subject_clean)
identifiers = data.get("identifiers")
if isinstance(identifiers, dict):
def _first(value: Any) -> Any:
if isinstance(value, list) and value:
return value[0]
return value
for key, ns in (
("isbn_10", "isbn_10"),
("isbn_13", "isbn_13"),
("lccn", "lccn"),
("oclc_numbers", "oclc"),
("goodreads", "goodreads"),
("internet_archive", "internet_archive"),
):
val = _first(identifiers.get(key))
if val:
new_tags.append(f"{ns}:{val}")
# Some editions expose a direct Archive.org identifier as "ocaid".
ocaid = data.get("ocaid")
if isinstance(ocaid, str) and ocaid.strip():
new_tags.append(f"internet_archive:{ocaid.strip()}")
log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
return new_tags
def search(
self,
query: str,
@@ -155,7 +697,70 @@ class OpenLibrary(SearchProvider):
if not isinstance(docs, list):
return []
for doc in docs[: int(limit)]:
# Availability enrichment can be slow if done sequentially (it may require multiple
# network calls per row). Do it concurrently to keep the pipeline responsive.
docs = docs[: int(limit)]
def _compute_availability(doc_dict: Dict[str, Any]) -> Tuple[str, str, str, str]:
edition_id_local = _resolve_edition_id(doc_dict)
if not edition_id_local:
return "no-olid", "", "", ""
ia_val_local = doc_dict.get("ia") or []
if isinstance(ia_val_local, str):
ia_val_local = [ia_val_local]
if not isinstance(ia_val_local, list):
ia_val_local = []
ia_ids_local = [str(x) for x in ia_val_local if x]
session_local = requests.Session()
try:
archive_id_local = _resolve_archive_id(session_local, edition_id_local, ia_ids_local)
except Exception:
archive_id_local = ""
if not archive_id_local:
return "no-archive", "", "", ""
# Prefer the fastest signal first: OpenLibrary lendable status.
lendable_local, reason_local = _check_lendable(session_local, edition_id_local)
if lendable_local:
return "borrow", reason_local, archive_id_local, ""
# Not lendable: check whether it's directly downloadable (public domain uploads, etc.).
try:
can_direct, pdf_url = self._archive_check_direct_download(archive_id_local)
if can_direct and pdf_url:
return "download", reason_local, archive_id_local, str(pdf_url)
except Exception:
pass
return "unavailable", reason_local, archive_id_local, ""
availability_rows: List[Tuple[str, str, str, str]] = [("unknown", "", "", "") for _ in range(len(docs))]
if docs:
log(f"[openlibrary] Enriching availability for {len(docs)} result(s)...")
max_workers = min(8, max(1, len(docs)))
done = 0
with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_index = {
executor.submit(_compute_availability, doc_dict): i
for i, doc_dict in enumerate(docs)
if isinstance(doc_dict, dict)
}
for fut in futures.as_completed(list(future_to_index.keys())):
i = future_to_index[fut]
try:
availability_rows[i] = fut.result()
except Exception:
availability_rows[i] = ("unknown", "", "", "")
done += 1
if done in {1, len(future_to_index)} or (done % 10 == 0):
log(f"[openlibrary] Availability: {done}/{len(future_to_index)}")
log("[openlibrary] Availability enrichment complete")
for idx, doc in enumerate(docs):
if not isinstance(doc, dict):
continue
@@ -172,6 +777,7 @@ class OpenLibrary(SearchProvider):
year = str(year_val) if year_val is not None else ""
edition_id = _resolve_edition_id(doc)
work_key = doc.get("key") if isinstance(doc.get("key"), str) else ""
ia_val = doc.get("ia") or []
if isinstance(ia_val, str):
@@ -193,9 +799,21 @@ class OpenLibrary(SearchProvider):
("Title", book_title),
("Author", ", ".join(authors_list)),
("Year", year),
("Avail", ""),
("OLID", edition_id),
]
# Determine availability using the concurrently computed enrichment.
availability, availability_reason, archive_id, direct_url = ("unknown", "", "", "")
if 0 <= idx < len(availability_rows):
availability, availability_reason, archive_id, direct_url = availability_rows[idx]
# Patch the display column.
for col_idx, (name, _val) in enumerate(columns):
if name == "Avail":
columns[col_idx] = ("Avail", availability)
break
annotations: List[str] = []
if isbn_13:
annotations.append(f"isbn_13:{isbn_13}")
@@ -203,12 +821,18 @@ class OpenLibrary(SearchProvider):
annotations.append(f"isbn_10:{isbn_10}")
if ia_ids:
annotations.append("archive")
if availability in {"download", "borrow"}:
annotations.append(availability)
results.append(
SearchResult(
table="openlibrary",
title=book_title,
path=(f"https://openlibrary.org/books/{edition_id}" if edition_id else "https://openlibrary.org"),
path=(
f"https://openlibrary.org/books/{edition_id}" if edition_id else (
f"https://openlibrary.org{work_key}" if isinstance(work_key, str) and work_key.startswith("/") else "https://openlibrary.org"
)
),
detail=(
(f"By: {', '.join(authors_list)}" if authors_list else "")
+ (f" ({year})" if year else "")
@@ -218,11 +842,16 @@ class OpenLibrary(SearchProvider):
columns=columns,
full_metadata={
"openlibrary_id": edition_id,
"openlibrary_key": work_key,
"authors": authors_list,
"year": year,
"isbn_10": isbn_10,
"isbn_13": isbn_13,
"ia": ia_ids,
"availability": availability,
"availability_reason": availability_reason,
"archive_id": archive_id,
"direct_url": direct_url,
"raw": doc,
},
)
@@ -256,9 +885,7 @@ class OpenLibrary(SearchProvider):
# 1) Direct download if available.
try:
from API.archive_client import check_direct_download
can_direct, pdf_url = check_direct_download(archive_id)
can_direct, pdf_url = self._archive_check_direct_download(archive_id)
except Exception:
can_direct, pdf_url = False, ""
@@ -272,10 +899,7 @@ class OpenLibrary(SearchProvider):
# 2) Borrow flow (credentials required).
try:
from API.archive_client import BookNotAvailableError, credential_openlibrary, download as archive_download
from API.archive_client import get_book_infos, loan, login
email, password = credential_openlibrary(self.config or {})
email, password = self._credential_archive(self.config or {})
if not email or not password:
log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
return None
@@ -285,13 +909,13 @@ class OpenLibrary(SearchProvider):
log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
return None
session = login(email, password)
session = self._archive_login(email, password)
try:
session = loan(session, archive_id, verbose=False)
except BookNotAvailableError:
session = self._archive_loan(session, archive_id, verbose=False)
except self.BookNotAvailableError:
log("[openlibrary] Book not available to borrow", file=sys.stderr)
return None
except SystemExit:
except Exception:
log("[openlibrary] Borrow failed", file=sys.stderr)
return None
@@ -301,7 +925,7 @@ class OpenLibrary(SearchProvider):
last_exc: Optional[Exception] = None
for u in urls:
try:
title_raw, links, _metadata = get_book_infos(session, u)
title_raw, links, _metadata = self._archive_get_book_infos(session, u)
if title_raw:
title = sanitize_filename(title_raw)
break
@@ -315,7 +939,7 @@ class OpenLibrary(SearchProvider):
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
try:
images = archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
try:
import img2pdf # type: ignore
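# Minimal sketch of the page-images-to-PDF step this import supports (img2pdf.convert
# accepts a list of image file paths and returns PDF bytes; the output path is illustrative):
pdf_bytes = img2pdf.convert(images)  # 'images' produced by _archive_download(...) above
Path(str(temp_dir) + ".pdf").write_bytes(pdf_bytes)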

View File

@@ -642,7 +642,7 @@ def _download_direct_file(
return DownloadMediaResult(
path=file_path,
info=info,
tags=tags,
tag=tags,
source_url=url,
hash_value=hash_value,
)

View File

@@ -36,6 +36,7 @@ mime_maps = {
"mp3": { "ext": ".mp3", "mimes": ["audio/mpeg", "audio/mp3"] },
"m4a": { "ext": ".m4a", "mimes": ["audio/mp4", "audio/x-m4a"] },
"ogg": { "ext": ".ogg", "mimes": ["audio/ogg"] },
"opus": { "ext": ".opus", "mimes": ["audio/opus"] },
"flac": { "ext": ".flac", "mimes": ["audio/flac"] },
"wav": { "ext": ".wav", "mimes": ["audio/wav", "audio/x-wav", "audio/vnd.wave"] },
"wma": { "ext": ".wma", "mimes": ["audio/x-ms-wma"] },
@@ -98,3 +99,13 @@ def get_type_from_ext(ext: str) -> str:
return type_name
return 'other'
# Canonical supported extension set for all stores/cmdlets.
# Derived from mime_maps so there is a single source of truth.
ALL_SUPPORTED_EXTENSIONS: set[str] = {
spec["ext"].lower()
for group in mime_maps.values()
for spec in group.values()
if isinstance(spec, dict) and isinstance(spec.get("ext"), str) and spec.get("ext")
}
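# Sketch of how a consumer module gates on this shared set (filename is hypothetical;
# a consumer would `from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS`):
from pathlib import Path

_candidate = Path("lecture.opus")
_is_supported = _candidate.suffix.lower() in ALL_SUPPORTED_EXTENSIONS  # True now that ".opus" is mapped above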

View File

@@ -30,6 +30,8 @@ def _resolve_file_hash(db_hash: Optional[str], file_path: Path) -> Optional[str]
return _normalize_hash(file_path.stem)
class Folder(Store):
""""""
# Track which locations have already been migrated to avoid repeated migrations
@@ -359,6 +361,17 @@ class Folder(Store):
else:
shutil.copy2(str(file_path), str(save_file))
debug(f"Local copy: {save_file}", file=sys.stderr)
# Best-effort: capture duration for media
duration_value: float | None = None
try:
from SYS.utils import ffprobe
probe = ffprobe(str(save_file))
duration = probe.get("duration")
if isinstance(duration, (int, float)) and duration > 0:
duration_value = float(duration)
except Exception:
duration_value = None
# Save to database
with API_folder_store(Path(self._location)) as db:
@@ -368,7 +381,8 @@ class Folder(Store):
db.save_metadata(save_file, {
'hash': file_hash,
'ext': ext_clean,
'size': file_path.stat().st_size
'size': file_path.stat().st_size,
'duration': duration_value,
})
# Add tags if provided
@@ -405,6 +419,21 @@ class Folder(Store):
results = []
search_dir = Path(self._location).expanduser()
def _url_like_pattern(value: str) -> str:
# Interpret user patterns as substring matches (with optional glob wildcards).
v = (value or "").strip().lower()
if not v or v == "*":
return "%"
v = v.replace("%", "\\%").replace("_", "\\_")
v = v.replace("*", "%").replace("?", "_")
if "%" not in v and "_" not in v:
return f"%{v}%"
if not v.startswith("%"):
v = "%" + v
if not v.endswith("%"):
v = v + "%"
return v
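# e.g. (illustrative): "" or "*" -> "%", "archive.org" -> "%archive.org%", "*.pdf" -> "%.pdf%"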
tokens = [t.strip() for t in query.split(',') if t.strip()]
if not match_all and len(tokens) == 1 and _normalize_hash(query):
@@ -453,6 +482,8 @@ class Folder(Store):
try:
with DatabaseAPI(search_dir) as api:
if tokens and len(tokens) > 1:
url_fetch_limit = (limit or 45) * 50
def _like_pattern(term: str) -> str:
return term.replace('*', '%').replace('?', '_')
@@ -473,6 +504,11 @@ class Folder(Store):
h = api.get_file_hash_by_hash(normalized_hash)
return {h} if h else set()
if namespace == 'url':
if not pattern or pattern == '*':
return api.get_file_hashes_with_any_url(limit=url_fetch_limit)
return api.get_file_hashes_by_url_like(_url_like_pattern(pattern), limit=url_fetch_limit)
if namespace == 'store':
if pattern not in {'local', 'file', 'filesystem'}:
return set()
@@ -562,6 +598,29 @@ class Folder(Store):
if limit is not None and len(results) >= limit:
return results
return results
if namespace == "url":
if not pattern or pattern == "*":
rows = api.get_files_with_any_url(limit)
else:
rows = api.get_files_by_url_like(_url_like_pattern(pattern), limit)
for file_hash, file_path_str, size_bytes, ext in rows:
if not file_path_str:
continue
file_path = Path(file_path_str)
if not file_path.exists():
continue
if size_bytes is None:
try:
size_bytes = file_path.stat().st_size
except OSError:
size_bytes = None
tags = api.get_tags_for_file(file_hash)
entry = _create_entry(file_path, tags, size_bytes, file_hash)
results.append(entry)
if limit is not None and len(results) >= limit:
return results
return results
query_pattern = f"{namespace}:%"
rows = api.get_files_by_namespace_pattern(query_pattern, limit)
@@ -592,126 +651,59 @@ class Folder(Store):
if limit is not None and len(results) >= limit:
return results
elif not match_all:
# Strict tag-based search only (no filename/path searching).
terms = [t.strip() for t in query_lower.replace(',', ' ').split() if t.strip()]
if not terms:
terms = [query_lower]
debug(f"Performing filename/tag search for terms: {terms}")
fetch_limit = (limit or 45) * 50
conditions = ["LOWER(f.file_path) LIKE ?" for _ in terms]
params = [f"%{t}%" for t in terms]
rows = api.get_files_by_multiple_path_conditions(conditions, params, fetch_limit)
debug(f"Found {len(rows)} filename matches in DB (before whole-word filter)")
word_regex = None
if len(terms) == 1:
term = terms[0]
has_wildcard = '*' in term or '?' in term
if has_wildcard:
try:
from fnmatch import translate
word_regex = re.compile(translate(term), re.IGNORECASE)
except Exception:
word_regex = None
else:
try:
pattern = r'(?<![a-zA-Z0-9])' + re.escape(term) + r'(?![a-zA-Z0-9])'
word_regex = re.compile(pattern, re.IGNORECASE)
except Exception:
word_regex = None
seen_files = set()
for file_id, file_path_str, size_bytes, file_hash in rows:
if not file_path_str or file_path_str in seen_files:
continue
if word_regex:
p = Path(file_path_str)
if not word_regex.search(p.name):
# AND semantics across terms: each term must match at least one tag.
hits: dict[str, dict[str, Any]] = {}
for term in terms:
tag_pattern = f"%{term}%"
term_rows = api.get_files_by_namespace_pattern(tag_pattern, fetch_limit)
for file_hash, file_path_str, size_bytes, ext in term_rows:
if not file_path_str:
continue
seen_files.add(file_path_str)
file_path = Path(file_path_str)
if file_path.exists():
if size_bytes is None:
size_bytes = file_path.stat().st_size
tags = api.get_tags_for_file(file_hash)
entry = _create_entry(file_path, tags, size_bytes, file_hash)
results.append(entry)
if limit is not None and len(results) >= limit:
return results
entry = hits.get(file_hash)
if entry:
entry["count"] += 1
if size_bytes is not None:
entry["size"] = size_bytes
else:
hits[file_hash] = {
"path": file_path_str,
"size": size_bytes,
"hash": file_hash,
"count": 1,
}
if terms:
title_hits: dict[str, dict[str, Any]] = {}
for term in terms:
title_pattern = f"title:%{term}%"
title_rows = api.get_files_by_title_tag_pattern(title_pattern, fetch_limit)
for file_hash, file_path_str, size_bytes, ext in title_rows:
if not file_path_str:
continue
entry = title_hits.get(file_hash)
if entry:
entry["count"] += 1
if size_bytes is not None:
entry["size"] = size_bytes
else:
title_hits[file_hash] = {
"path": file_path_str,
"size": size_bytes,
"hash": file_hash,
"count": 1,
}
if title_hits:
required = len(terms)
for file_hash, info in title_hits.items():
if info.get("count") != required:
continue
file_path_str = info.get("path")
if not file_path_str or file_path_str in seen_files:
continue
file_path = Path(file_path_str)
if not file_path.exists():
continue
seen_files.add(file_path_str)
size_bytes = info.get("size")
if size_bytes is None:
try:
size_bytes = file_path.stat().st_size
except OSError:
size_bytes = None
tags = api.get_tags_for_file(file_hash)
entry = _create_entry(file_path, tags, size_bytes, info.get("hash"))
results.append(entry)
if limit is not None and len(results) >= limit:
return results
query_pattern = f"%{query_lower}%"
tag_rows = api.get_files_by_simple_tag_pattern(query_pattern, limit)
for file_hash, file_path_str, size_bytes, ext in tag_rows:
    if not file_path_str or file_path_str in seen_files:
        continue
    seen_files.add(file_path_str)
    file_path = Path(file_path_str)
    if file_path.exists():
        if size_bytes is None:
            size_bytes = file_path.stat().st_size
        tags = api.get_tags_for_file(file_hash)
        entry = _create_entry(file_path, tags, size_bytes, file_hash)
        results.append(entry)
        if limit is not None and len(results) >= limit:
            return results
required = len(terms)
seen_files: set[str] = set()
for file_hash, info in hits.items():
    if info.get("count") != required:
        continue
    file_path_str = info.get("path")
    if not file_path_str or file_path_str in seen_files:
        continue
    seen_files.add(file_path_str)
    file_path = Path(file_path_str)
    if not file_path.exists():
        continue
    size_bytes = info.get("size")
    if size_bytes is None:
        try:
            size_bytes = file_path.stat().st_size
        except OSError:
            size_bytes = None
    tags = api.get_tags_for_file(file_hash)
    entry_obj = _create_entry(file_path, tags, size_bytes, info.get("hash"))
    results.append(entry_obj)
    if limit is not None and len(results) >= limit:
        break
else:
rows = api.get_all_files(limit)
@@ -726,10 +718,8 @@ class Folder(Store):
entry = _create_entry(file_path, tags, size_bytes, file_hash)
results.append(entry)
if results:
debug(f"Returning {len(results)} results from DB")
else:
debug("No results found in DB")
backend_label = str(getattr(self, "_name", "") or getattr(self, "NAME", "") or "folder")
debug(f"[folder:{backend_label}] {len(results)} result(s)")
return results
except Exception as e:
@@ -938,9 +928,11 @@ class Folder(Store):
file_hash = file_identifier
if self._location:
try:
from metadata import normalize_urls
with API_folder_store(Path(self._location)) as db:
meta = db.get_metadata(file_hash) or {}
return list(meta.get("url") or [])
urls = normalize_urls(meta.get("url"))
return urls
except Exception as exc:
debug(f"Local DB get_metadata failed: {exc}")
return []
@@ -955,11 +947,13 @@ class Folder(Store):
file_hash = file_identifier
if self._location:
try:
from metadata import normalize_urls
with API_folder_store(Path(self._location)) as db:
meta = db.get_metadata(file_hash) or {}
existing_urls = list(meta.get("url") or [])
existing_urls = normalize_urls(meta.get("url"))
incoming_urls = normalize_urls(url)
changed = False
for u in list(url or []):
for u in list(incoming_urls or []):
if not u:
continue
if u not in existing_urls:
@@ -982,10 +976,11 @@ class Folder(Store):
file_hash = file_identifier
if self._location:
try:
from metadata import normalize_urls
with API_folder_store(Path(self._location)) as db:
meta = db.get_metadata(file_hash) or {}
existing_urls = list(meta.get("url") or [])
remove_set = {u for u in (url or []) if u}
existing_urls = normalize_urls(meta.get("url"))
remove_set = {u for u in normalize_urls(url) if u}
if not remove_set:
return False
new_urls = [u for u in existing_urls if u not in remove_set]
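# Aside (illustration only, not part of this module): a minimal sketch of the
# normalize-then-merge/remove pattern the URL helpers above rely on. The helper
# below is a stand-in; the real normalize_urls lives in `metadata` and its exact
# behavior may differ.
#   def _normalize_urls_sketch(value):
#       items = [value] if isinstance(value, str) else list(value or [])
#       out: list[str] = []
#       for item in items:
#           s = str(item).strip()
#           if s and s not in out:
#               out.append(s)
#       return out
#
#   existing = _normalize_urls_sketch(["https://a.example/1", " https://a.example/1 "])
#   incoming = _normalize_urls_sketch("https://a.example/2")
#   merged = existing + [u for u in incoming if u not in existing]   # add-url path
#   remaining = [u for u in merged if u not in set(incoming)]        # delete-url path
#   merged == ["https://a.example/1", "https://a.example/2"]
#   remaining == ["https://a.example/1"]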

View File

@@ -264,6 +264,170 @@ class HydrusNetwork(Store):
debug(f"Searching Hydrus for: {query}")
def _extract_urls(meta_obj: Any) -> list[str]:
if not isinstance(meta_obj, dict):
return []
raw = meta_obj.get("url")
if raw is None:
raw = meta_obj.get("urls")
if isinstance(raw, str):
val = raw.strip()
return [val] if val else []
if isinstance(raw, list):
out: list[str] = []
for item in raw:
if not isinstance(item, str):
continue
s = item.strip()
if s:
out.append(s)
return out
return []
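# Illustrative expectations for _extract_urls (grounded in the branches above):
#   _extract_urls({"url": "https://x.example/a"})          -> ["https://x.example/a"]
#   _extract_urls({"urls": ["https://x.example/a", " "]})  -> ["https://x.example/a"]
#   _extract_urls({"url": 42})                              -> []
#   _extract_urls("not-a-dict")                             -> []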
def _iter_url_filtered_metadata(url_value: str | None, want_any: bool, fetch_limit: int) -> list[dict[str, Any]]:
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
# First try a fast system predicate if Hydrus supports it.
candidate_file_ids: list[int] = []
try:
if want_any:
predicate = "system:has url"
url_search = client.search_files(
tags=[predicate],
return_hashes=False,
return_file_ids=True,
return_file_count=False,
)
ids = url_search.get("file_ids", []) if isinstance(url_search, dict) else []
if isinstance(ids, list):
candidate_file_ids = [int(x) for x in ids if isinstance(x, (int, float, str)) and str(x).strip().isdigit()]
except Exception:
candidate_file_ids = []
if not candidate_file_ids:
# Fallback: scan from system:everything and filter by URL substring.
everything = client.search_files(
tags=["system:everything"],
return_hashes=False,
return_file_ids=True,
return_file_count=False,
)
ids = everything.get("file_ids", []) if isinstance(everything, dict) else []
if isinstance(ids, list):
candidate_file_ids = [int(x) for x in ids if isinstance(x, (int, float))]
if not candidate_file_ids:
return []
needle = (url_value or "").strip().lower()
chunk_size = 200
out: list[dict[str, Any]] = []
for start in range(0, len(candidate_file_ids), chunk_size):
if len(out) >= fetch_limit:
break
chunk = candidate_file_ids[start : start + chunk_size]
try:
payload = client.fetch_file_metadata(
file_ids=chunk,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
except Exception:
continue
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
if not isinstance(metas, list):
continue
for meta in metas:
if not isinstance(meta, dict):
continue
urls = _extract_urls(meta)
if not urls:
continue
if want_any:
out.append(meta)
if len(out) >= fetch_limit:
break
continue
if not needle:
continue
if any(needle in u.lower() for u in urls):
out.append(meta)
if len(out) >= fetch_limit:
break
return out
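# Aside (illustrative sketch, not part of this client): the chunked-scan idea used by
# _iter_url_filtered_metadata, shown standalone. `fetch_batch` stands in for
# client.fetch_file_metadata and is assumed to return a list of metadata dicts whose
# "url" value is a list of strings.
def _scan_in_chunks_sketch(ids, fetch_batch, needle, fetch_limit, chunk_size=200):
    found = []
    for start in range(0, len(ids), chunk_size):
        if len(found) >= fetch_limit:
            break
        for meta in fetch_batch(ids[start:start + chunk_size]):
            urls = meta.get("url") or []
            if any(needle in u.lower() for u in urls):
                found.append(meta)
                if len(found) >= fetch_limit:
                    break
    return found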
query_lower = query.lower().strip()
# Special case: url:* and url:<value>
metadata_list: list[dict[str, Any]] | None = None
if ":" in query_lower and not query_lower.startswith(":"):
namespace, pattern = query_lower.split(":", 1)
namespace = namespace.strip().lower()
pattern = pattern.strip()
if namespace == "url":
if not pattern or pattern == "*":
metadata_list = _iter_url_filtered_metadata(None, want_any=True, fetch_limit=int(limit) if limit else 100)
else:
# Fast-path: exact URL via /add_url/get_url_files when a full URL is provided.
try:
if pattern.startswith("http://") or pattern.startswith("https://"):
from API.HydrusNetwork import HydrusRequestSpec
spec = HydrusRequestSpec(method="GET", endpoint="/add_url/get_url_files", query={"url": pattern})
response = client._perform_request(spec) # type: ignore[attr-defined]
hashes: list[str] = []
file_ids: list[int] = []
if isinstance(response, dict):
raw_hashes = response.get("hashes") or response.get("file_hashes")
if isinstance(raw_hashes, list):
hashes = [str(h).strip() for h in raw_hashes if isinstance(h, str) and str(h).strip()]
raw_ids = response.get("file_ids")
if isinstance(raw_ids, list):
for item in raw_ids:
try:
file_ids.append(int(item))
except (TypeError, ValueError):
continue
if file_ids:
payload = client.fetch_file_metadata(
file_ids=file_ids,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
if isinstance(metas, list):
metadata_list = [m for m in metas if isinstance(m, dict)]
elif hashes:
payload = client.fetch_file_metadata(
hashes=hashes,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
if isinstance(metas, list):
metadata_list = [m for m in metas if isinstance(m, dict)]
except Exception:
metadata_list = None
# Fallback: substring scan
if metadata_list is None:
metadata_list = _iter_url_filtered_metadata(pattern, want_any=False, fetch_limit=int(limit) if limit else 100)
# Parse the query into tags
# Handle both simple tags and complex queries
# "*" means "match all" - use system:everything tag in Hydrus
@@ -271,7 +435,6 @@ class HydrusNetwork(Store):
# Use system:everything to match all files in Hydrus
tags = ["system:everything"]
else:
query_lower = query.lower().strip()
# If query doesn't have a namespace (no ':'), search all files and filter by title/tags
# If query has explicit namespace, use it as a tag search
if ':' not in query_lower:
@@ -286,30 +449,36 @@ class HydrusNetwork(Store):
debug(f"Found 0 result(s)")
return []
# Search files with the tags
search_result = client.search_files(
tags=tags,
return_hashes=True,
return_file_ids=True
)
# Extract file IDs from search result
file_ids = search_result.get("file_ids", [])
hashes = search_result.get("hashes", [])
if not file_ids and not hashes:
debug(f"Found 0 result(s)")
return []
# Fetch metadata for the found files
# Search files with the tags (unless url: search already produced metadata)
results = []
query_lower = query.lower().strip()
# Split by comma or space for AND logic
search_terms = set(query_lower.replace(',', ' ').split()) # For substring matching
if file_ids:
metadata = client.fetch_file_metadata(file_ids=file_ids)
metadata_list = metadata.get("metadata", [])
if metadata_list is None:
search_result = client.search_files(
tags=tags,
return_hashes=True,
return_file_ids=True
)
file_ids = search_result.get("file_ids", []) if isinstance(search_result, dict) else []
hashes = search_result.get("hashes", []) if isinstance(search_result, dict) else []
if not file_ids and not hashes:
debug(f"Found 0 result(s)")
return []
if file_ids:
metadata = client.fetch_file_metadata(file_ids=file_ids)
metadata_list = metadata.get("metadata", [])
elif hashes:
metadata = client.fetch_file_metadata(hashes=hashes)
metadata_list = metadata.get("metadata", [])
else:
metadata_list = []
if not isinstance(metadata_list, list):
metadata_list = []
for meta in metadata_list:
if len(results) >= limit:

View File

@@ -119,6 +119,37 @@ class Store:
self._backend_errors: Dict[str, str] = {}
self._load_backends()
def _maybe_register_temp_alias(self, store_type: str, backend_name: str, kwargs: Dict[str, Any], backend: BaseStore) -> None:
"""If a folder backend points at config['temp'], also expose it as the 'temp' backend.
This keeps config compatibility (e.g. existing 'default') while presenting the temp
directory under a clearer name.
"""
try:
if _normalize_store_type(store_type) != "folder":
return
temp_value = self._config.get("temp")
if not temp_value:
return
path_value = kwargs.get("PATH") or kwargs.get("path")
if not path_value:
return
temp_path = Path(str(temp_value)).expanduser().resolve()
backend_path = Path(str(path_value)).expanduser().resolve()
if backend_path != temp_path:
return
# If the user already has a dedicated temp backend, do nothing.
if "temp" in self._backends:
return
# Keep original name working, but add an alias.
if backend_name != "temp":
self._backends["temp"] = backend
except Exception:
return
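# Illustrative config shape (keys and paths hypothetical) under which the alias above
# registers: the 'default' folder backend points at config['temp'], so the same backend
# object also becomes reachable as store['temp'].
#   {"temp": "~/medios/tmp",
#    "store": {"folder": {"default": {"NAME": "default", "PATH": "~/medios/tmp"},
#                         "archive": {"NAME": "archive", "PATH": "~/medios/files"}}}}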
def _load_backends(self) -> None:
store_cfg = self._config.get("store")
if not isinstance(store_cfg, dict):
@@ -161,6 +192,9 @@ class Store:
backend_name = str(kwargs.get("NAME") or instance_name)
self._backends[backend_name] = backend
# If this is the configured temp directory, also alias it as 'temp'.
self._maybe_register_temp_alias(store_type, backend_name, kwargs, backend)
except Exception as exc:
err_text = str(exc)
self._backend_errors[str(instance_name)] = err_text
@@ -177,11 +211,24 @@ class Store:
return sorted(self._backends.keys())
def list_searchable_backends(self) -> list[str]:
searchable: list[str] = []
# De-duplicate backends by instance (aliases can point at the same object).
def _rank(name: str) -> int:
n = str(name or "").strip().lower()
if n == "temp":
return 0
if n == "default":
return 2
return 1
chosen: Dict[int, str] = {}
for name, backend in self._backends.items():
if type(backend).search is not BaseStore.search:
searchable.append(name)
return sorted(searchable)
if type(backend).search is BaseStore.search:
continue
key = id(backend)
prev = chosen.get(key)
if prev is None or _rank(name) < _rank(prev):
chosen[key] = name
return sorted(chosen.values())
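# Worked example of the alias de-dup above: when 'default' and 'temp' are the same
# backend object, _rank prefers 'temp' (0) over 'default' (2), so only 'temp' is
# listed for that instance; any other name ranks 1.
#   _rank("temp") -> 0,  _rank("archive") -> 1,  _rank("default") -> 2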
def __getitem__(self, backend_name: str) -> BaseStore:
if backend_name not in self._backends:

View File

@@ -5,10 +5,9 @@ from __future__ import annotations
import json
import sys
import inspect
from collections.abc import Iterable as IterableABC
from SYS.logger import log, debug
from SYS.logger import log
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set
from dataclasses import dataclass, field
@@ -690,7 +689,9 @@ def get_field(obj: Any, field: str, default: Optional[Any] = None) -> Any:
get_field(result, "table", "unknown") # With default
"""
# Handle lists by accessing the first element
if isinstance(obj, list) and obj:
if isinstance(obj, list):
if not obj:
return default
obj = obj[0]
if isinstance(obj, dict):
@@ -702,8 +703,9 @@ def get_field(obj: Any, field: str, default: Optional[Any] = None) -> Any:
return value
# For PipeObjects, also check the extra field
if hasattr(obj, 'extra') and isinstance(obj.extra, dict):
return obj.extra.get(field, default)
extra_val = getattr(obj, 'extra', None)
if isinstance(extra_val, dict):
return extra_val.get(field, default)
return default
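# Doctest-style expectations for get_field (assuming the dict branch returns
# obj.get(field, default), as the docstring example above implies):
#   get_field({"table": "hydrus"}, "table")      -> "hydrus"
#   get_field([{"table": "hydrus"}], "table")    -> "hydrus"    (first element of a list)
#   get_field([], "table", "unknown")            -> "unknown"   (empty list falls back to default)
#   get_field(pipe_obj, "title")                 -> pipe_obj.extra["title"] when the attribute
#                                                   lookup misses but extra carries the key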
@@ -1118,7 +1120,7 @@ def create_pipe_object_result(
Returns:
Dict with all PipeObject fields for emission
"""
result = {
result: Dict[str, Any] = {
'source': source,
'id': identifier,
'path': file_path,
@@ -1546,14 +1548,11 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
extra = {k: v for k, v in value.items() if k not in known_keys}
# Extract URL: prefer direct url field, then url list
url_val = value.get("url")
if not url_val:
url = value.get("url") or value.get("url") or []
if url and isinstance(url, list) and len(url) > 0:
url_val = url[0]
# Preserve url in extra if multiple url exist
if url and len(url) > 1:
extra["url"] = url
from metadata import normalize_urls
url_list = normalize_urls(value.get("url"))
url_val = url_list[0] if url_list else None
if len(url_list) > 1:
extra["url"] = url_list
# Extract relationships
rels = value.get("relationships") or {}

View File

@@ -1,14 +1,16 @@
from __future__ import annotations
from typing import Any, Dict, Optional, Sequence, Tuple, List, Union
from typing import Any, Dict, Optional, Sequence, Tuple, List
from pathlib import Path
import sys
import shutil
import tempfile
import models
import pipeline as ctx
from API import HydrusNetwork as hydrus_wrapper
from SYS.logger import log, debug
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
from Store import Store
from ._shared import (
Cmdlet, CmdletArg, parse_cmdlet_args, SharedArgs,
@@ -20,8 +22,8 @@ from API.folder import read_sidecar, find_sidecar, write_sidecar, API_folder_sto
from SYS.utils import sha256_file, unique_path
from metadata import write_metadata
# Use official Hydrus supported filetypes from hydrus_wrapper
SUPPORTED_MEDIA_EXTENSIONS = hydrus_wrapper.ALL_SUPPORTED_EXTENSIONS
# Canonical supported filetypes for all stores/cmdlets
SUPPORTED_MEDIA_EXTENSIONS = ALL_SUPPORTED_EXTENSIONS
class Add_File(Cmdlet):
"""Add file into the DB"""
@@ -53,93 +55,210 @@ class Add_File(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Main execution entry point."""
# Parse arguments
parsed = parse_cmdlet_args(args, self)
# Initialize state
path_arg = parsed.get("path")
location = parsed.get("store") # Fixed: was "storage", should be "store"
location = parsed.get("store")
provider_name = parsed.get("provider")
delete_after = parsed.get("delete", False)
# Coerce result to PipeObject; if result is a list, prefer the first element
effective_result = result
if isinstance(result, list) and result:
first_item = result[0]
# Prefer first item if it's a dict or PipeObject
if isinstance(first_item, (dict, )):
effective_result = first_item
pipe_obj = coerce_to_pipe_object(effective_result, path_arg)
stage_ctx = ctx.get_stage_context()
is_last_stage = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
# Decide which items to process.
# - If user provided -path, treat this invocation as single-item.
# - Otherwise, if piped input is a list, ingest each item.
if path_arg:
items_to_process: List[Any] = [result]
elif isinstance(result, list) and result:
items_to_process = list(result)
else:
items_to_process = [result]
# Debug: Log input result details
debug(f"[add-file] INPUT result type={type(result).__name__}")
if isinstance(result, list):
debug(f"[add-file] INPUT result is list with {len(result)} items")
if result and isinstance(result[0], dict):
first = result[0]
hash_val = first.get('hash')
hash_str = hash_val[:12] + "..." if hash_val else "N/A"
debug(f"[add-file] First item details: title={first.get('title')}, hash={hash_str}, store={first.get('store', 'N/A')}")
elif isinstance(result, dict):
hash_val = result.get('hash')
hash_str = hash_val[:12] + "..." if hash_val else "N/A"
debug(f"[add-file] INPUT result is dict: title={result.get('title')}, hash={hash_str}, store={result.get('store', 'N/A')}")
# Debug: Log parsed arguments
debug(f"[add-file] PARSED args: location={location}, provider={provider_name}, delete={delete_after}")
# Resolve source - returns (media_path_or_url, file_hash)
media_path_or_url, file_hash = self._resolve_source(result, path_arg, pipe_obj, config)
debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...")
if not media_path_or_url:
debug(f"[add-file] ERROR: Could not resolve source file/URL")
return 1
# Update pipe_obj with resolved path
pipe_obj.path = str(media_path_or_url) if isinstance(media_path_or_url, (str, Path)) else str(media_path_or_url)
# Check if it's a URL before validating as file
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
debug(f"Detected URL target, delegating to download-data: {media_path_or_url}")
return self._delegate_to_download_data(result, media_path_or_url, location, provider_name, args, config)
collected_payloads: List[Dict[str, Any]] = []
successes = 0
failures = 0
# Convert to Path and validate
media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url
# Validate source
if not self._validate_source(media_path):
debug(f"[add-file] ERROR: Source validation failed for {media_path}")
return 1
# Only run the search-store refresh when add-file is the last stage.
# In the middle of a pipeline, downstream cmdlets should receive the emitted
# storage payload directly (no need to re-search and risk duplicate emits).
auto_search_store_after_add = bool(is_last_stage) and len(items_to_process) == 1
# Debug: Log execution path decision
debug(f"[add-file] DECISION POINT: provider={provider_name}, location={location}")
debug(f" media_path={media_path}, exists={media_path.exists()}")
for item in items_to_process:
pipe_obj = coerce_to_pipe_object(item, path_arg)
# Execute transfer based on destination (using Store registry)
if provider_name:
debug(f"[add-file] ROUTE: file provider upload")
return self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after)
elif location:
# Check if location is a registered backend name
temp_dir_to_cleanup: Optional[Path] = None
delete_after_item = delete_after
try:
store = Store(config)
backends = store.list_backends()
if location in backends:
debug(f"[add-file] ROUTE: storage backend '{location}'")
return self._handle_storage_backend(media_path, location, pipe_obj, config, delete_after)
else:
# Treat as local export path
debug(f"[add-file] ROUTE: local export to path '{location}'")
return self._handle_local_export(media_path, location, pipe_obj, config, delete_after)
except Exception as exc:
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
log(f"Invalid location: {location}", file=sys.stderr)
return 1
else:
debug(f"[add-file] ERROR: No location or provider specified")
log(f"No storage location or provider specified", file=sys.stderr)
return 1
media_path_or_url, file_hash = self._resolve_source(item, path_arg, pipe_obj, config)
debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...")
if not media_path_or_url:
failures += 1
continue
# Update pipe_obj with resolved path
pipe_obj.path = str(media_path_or_url)
# URL targets: prefer provider-aware download for OpenLibrary selections.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
table = None
full_metadata = None
if isinstance(pipe_obj.extra, dict):
table = pipe_obj.extra.get("table")
full_metadata = pipe_obj.extra.get("full_metadata")
is_openlibrary = (str(table or "").lower() == "openlibrary") or ("openlibrary.org/books/" in media_path_or_url.lower())
if is_openlibrary:
# Enrich tags from OpenLibrary metadata so the stored file has book tags (author/pages/etc).
try:
from Provider.openlibrary import OpenLibrary as _OpenLibrary
olid = None
archive_id = None
if isinstance(full_metadata, dict):
olid = full_metadata.get("openlibrary_id") or full_metadata.get("openlibrary")
archive_id = full_metadata.get("archive_id")
if not olid:
import re
m = re.search(r"/books/(OL\d+M)", str(media_path_or_url), flags=re.IGNORECASE)
if m:
olid = m.group(1)
scraped_tags: List[str] = []
if olid:
scraped_tags.extend(_OpenLibrary.scrape_openlibrary_metadata(str(olid)) or [])
if archive_id:
scraped_tags.append(f"internet_archive:{archive_id}")
if scraped_tags:
existing = list(pipe_obj.tag or [])
pipe_obj.tag = merge_sequences(existing, scraped_tags, case_sensitive=False)
except Exception:
pass
from ProviderCore.registry import get_search_provider
from ProviderCore.base import SearchResult
provider = get_search_provider("openlibrary", config)
if provider is None:
log("[add-file] OpenLibrary provider not available", file=sys.stderr)
failures += 1
continue
temp_dir_to_cleanup = Path(tempfile.mkdtemp(prefix="medios_openlibrary_"))
sr = SearchResult(
table="openlibrary",
title=str(getattr(pipe_obj, "title", None) or "Unknown"),
path=str(media_path_or_url),
full_metadata=full_metadata if isinstance(full_metadata, dict) else {},
)
downloaded = provider.download(sr, temp_dir_to_cleanup)
if downloaded is None:
log("[add-file] OpenLibrary download failed", file=sys.stderr)
failures += 1
continue
downloaded_path = Path(downloaded)
if downloaded_path.exists() and downloaded_path.is_dir():
log(
"[add-file] OpenLibrary download produced a directory (missing img2pdf?). Cannot ingest.",
file=sys.stderr,
)
failures += 1
continue
media_path_or_url = str(downloaded_path)
pipe_obj.path = str(downloaded_path)
delete_after_item = True
# For non-provider URLs, or if still a URL after provider attempt, delegate to download-media.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
code = self._delegate_to_download_data(item, media_path_or_url, location, provider_name, args, config)
if code == 0:
successes += 1
else:
failures += 1
continue
media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url
if not self._validate_source(media_path):
failures += 1
continue
if provider_name:
code = self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after_item)
if code == 0:
successes += 1
else:
failures += 1
continue
if location:
try:
store = Store(config)
backends = store.list_backends()
if location in backends:
code = self._handle_storage_backend(
item,
media_path,
location,
pipe_obj,
config,
delete_after_item,
collect_payloads=collected_payloads,
suppress_last_stage_overlay=is_last_stage and len(items_to_process) > 1,
auto_search_store=auto_search_store_after_add,
)
else:
code = self._handle_local_export(media_path, location, pipe_obj, config, delete_after_item)
except Exception as exc:
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
log(f"Invalid location: {location}", file=sys.stderr)
failures += 1
continue
if code == 0:
successes += 1
else:
failures += 1
continue
log("No destination specified", file=sys.stderr)
failures += 1
finally:
if temp_dir_to_cleanup is not None:
try:
shutil.rmtree(temp_dir_to_cleanup, ignore_errors=True)
except Exception:
pass
# If we processed multiple storage ingests, present a single consolidated overlay table.
if is_last_stage and len(items_to_process) > 1 and collected_payloads:
try:
from result_table import ResultTable
table = ResultTable("Result")
for payload in collected_payloads:
table.add_result(payload)
# Make this the active selectable table so @.. returns here (and playlist table is kept in history).
ctx.set_last_result_table(table, collected_payloads, subject=collected_payloads)
except Exception:
pass
if successes > 0:
return 0
return 1
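# Aside (illustrative sketch, not a method of Add_File): the OLID extraction used in
# run() above, pulling an OL...M edition id out of an OpenLibrary /books/ URL.
# The sample URLs are hypothetical.
import re as _re_sketch

def _extract_olid_sketch(url: str):
    m = _re_sketch.search(r"/books/(OL\d+M)", url, flags=_re_sketch.IGNORECASE)
    return m.group(1) if m else None

assert _extract_olid_sketch("https://openlibrary.org/books/OL12345M/Example") == "OL12345M"
assert _extract_olid_sketch("https://openlibrary.org/works/OL45804W") is None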
@staticmethod
def _resolve_source(
@@ -149,10 +268,7 @@ class Add_File(Cmdlet):
config: Dict[str, Any],
) -> Tuple[Optional[Path | str], Optional[str]]:
"""Resolve the source file path from args or pipeline result.
PRIORITY: hash+store pattern is preferred over path-based resolution.
This ensures consistency when @N selections pass hash+store identifiers.
Returns (media_path_or_url, file_hash)
where media_path_or_url can be a Path object or a URL string.
"""
@@ -161,8 +277,9 @@ class Add_File(Cmdlet):
result_hash = result.get("hash")
result_store = result.get("store")
if result_hash and result_store:
debug(f"[add-file] Using hash+store from result: hash={result_hash[:12]}..., store={result_store}")
# Use get_file to retrieve from the specific store
debug(
f"[add-file] Using hash+store from result: hash={str(result_hash)[:12]}..., store={result_store}"
)
try:
store = Store(config)
if result_store in store.list_backends():
@@ -170,16 +287,15 @@ class Add_File(Cmdlet):
media_path = backend.get_file(result_hash)
if isinstance(media_path, Path) and media_path.exists():
pipe_obj.path = str(media_path)
debug(f"[add-file] Retrieved file from {result_store}: {media_path}")
return media_path, result_hash
if isinstance(media_path, str) and media_path.lower().startswith(("http://", "https://")):
return media_path, str(result_hash)
if isinstance(media_path, str) and media_path.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
pipe_obj.path = media_path
debug(f"[add-file] Retrieved URL from {result_store}: {media_path}")
return media_path, result_hash
return media_path, str(result_hash)
except Exception as exc:
debug(f"[add-file] Failed to retrieve via hash+store: {exc}")
# PRIORITY 2: Try explicit path argument
if path_arg:
media_path = Path(path_arg)
@@ -196,10 +312,9 @@ class Add_File(Cmdlet):
file_hash = pipe_path_str.split(":", 1)[1]
media_path, success = Add_File._fetch_hydrus_path(file_hash, config)
return media_path, file_hash if success else None
# Check if pipe_path is a URL - skip to URL handling below
if not pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
media_path = Path(pipe_path_str)
return media_path, None
if pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
return pipe_path_str, None
return Path(pipe_path_str), None
# PRIORITY 4: Try from pipe_obj.url (for streaming url without downloaded file)
pipe_url = getattr(pipe_obj, "url", None)
@@ -248,8 +363,9 @@ class Add_File(Cmdlet):
# Look for path or path-like keys
path_candidate = first_item.get("path") or first_item.get("filepath") or first_item.get("file")
# If the dict includes a 'paths' list (multi-part/section download), prefer the first file
if not path_candidate and isinstance(first_item.get("paths"), (list, tuple)) and first_item.get("paths"):
path_candidate = first_item.get("paths")[0]
paths_val = first_item.get("paths")
if not path_candidate and isinstance(paths_val, (list, tuple)) and paths_val:
path_candidate = paths_val[0]
if path_candidate:
debug(f"Resolved path from result dict: {path_candidate}")
try:
@@ -361,10 +477,12 @@ class Add_File(Cmdlet):
selection_args = result["_selection_args"]
if selection_args:
dl_args.extend(selection_args)
elif hasattr(result, 'extra') and isinstance(result.extra, dict) and "_selection_args" in result.extra:
selection_args = result.extra["_selection_args"]
if selection_args:
dl_args.extend(selection_args)
else:
extra_val = getattr(result, "extra", None)
if isinstance(extra_val, dict) and "_selection_args" in extra_val:
selection_args = extra_val["_selection_args"]
if selection_args:
dl_args.extend(selection_args)
# download-media doesn't support -storage flag
# It downloads to the configured directory, then add-file will handle storage
@@ -375,18 +493,32 @@ class Add_File(Cmdlet):
@staticmethod
def _get_url(result: Any, pipe_obj: models.PipeObject) -> List[str]:
url: List[str] = []
try:
if isinstance(pipe_obj.extra, dict):
url = list(pipe_obj.extra.get("url") or pipe_obj.extra.get("url") or [])
except Exception:
pass
from metadata import normalize_urls
if not url and isinstance(result, dict):
url = list(result.get("url") or result.get("url") or [])
if not url:
url = list(extract_url_from_result(result) or [])
return url
# Prefer explicit PipeObject.url if present
urls: List[str] = []
try:
urls = normalize_urls(getattr(pipe_obj, "url", None))
except Exception:
urls = []
# Then check extra.url
if not urls:
try:
if isinstance(pipe_obj.extra, dict):
urls = normalize_urls(pipe_obj.extra.get("url"))
except Exception:
pass
# Then check result dict
if not urls and isinstance(result, dict):
urls = normalize_urls(result.get("url"))
# Finally, try extractor helper
if not urls:
urls = normalize_urls(extract_url_from_result(result))
return urls
@staticmethod
def _get_relationships(result: Any, pipe_obj: models.PipeObject) -> Optional[Dict[str, Any]]:
@@ -405,10 +537,36 @@ class Add_File(Cmdlet):
@staticmethod
def _get_duration(result: Any, pipe_obj: models.PipeObject) -> Optional[float]:
if getattr(pipe_obj, "duration", None) is not None:
return pipe_obj.duration
def _parse_duration(value: Any) -> Optional[float]:
if value is None:
return None
if isinstance(value, (int, float)):
return float(value) if value > 0 else None
if isinstance(value, str):
s = value.strip()
if not s:
return None
try:
candidate = float(s)
return candidate if candidate > 0 else None
except ValueError:
pass
if ":" in s:
parts = [p.strip() for p in s.split(":") if p.strip()]
if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
nums = [int(p) for p in parts]
if len(nums) == 2:
minutes, seconds = nums
return float(minutes * 60 + seconds)
hours, minutes, seconds = nums
return float(hours * 3600 + minutes * 60 + seconds)
return None
parsed = _parse_duration(getattr(pipe_obj, "duration", None))
if parsed is not None:
return parsed
try:
return extract_duration(result)
return _parse_duration(extract_duration(result))
except Exception:
return None
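# Aside (illustrative, simplified sketch of _parse_duration above; the real helper also
# accepts plain numeric strings and rejects non-positive values):
def _parse_duration_sketch(value):
    if isinstance(value, (int, float)):
        return float(value) if value > 0 else None
    if isinstance(value, str) and ":" in value:
        parts = [p.strip() for p in value.split(":")]
        if all(p.isdigit() for p in parts) and len(parts) in {2, 3}:
            nums = [int(p) for p in parts]
            if len(nums) == 2:
                return float(nums[0] * 60 + nums[1])
            return float(nums[0] * 3600 + nums[1] * 60 + nums[2])
    return None

assert _parse_duration_sketch("4:05") == 245.0
assert _parse_duration_sketch("1:02:03") == 3723.0
assert _parse_duration_sketch(0) is None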
@@ -442,19 +600,20 @@ class Add_File(Cmdlet):
ctx.set_current_stage_table(None)
@staticmethod
def _emit_storage_result(payload: Dict[str, Any]) -> None:
def _emit_storage_result(payload: Dict[str, Any], *, overlay: bool = True, emit: bool = True) -> None:
"""Emit a storage-style result payload.
- Emits the dict downstream when `emit` is True (a no-op outside a pipeline).
- If this is the last stage (or not in a pipeline) and `overlay` is True, prints a
search-store-like table and sets an overlay table/items for @N selection.
"""
# Always emit for downstream commands (no-op if not in a pipeline)
ctx.emit(payload)
# Emit for downstream commands (no-op if not in a pipeline)
if emit:
ctx.emit(payload)
stage_ctx = ctx.get_stage_context()
is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
if not is_last:
if not is_last or not overlay:
return
try:
@@ -470,6 +629,53 @@ class Add_File(Cmdlet):
except Exception:
pass
@staticmethod
def _try_emit_search_store_by_hash(*, store: str, hash_value: str, config: Dict[str, Any]) -> bool:
"""Run search-store for a single hash so the final table/payload is consistent.
Important: `add-file` is treated as an action command by the CLI, so the CLI only
prints tables for it when a display overlay exists. After running search-store,
this copies the resulting table into the display overlay (when this is the last
stage) so the canonical store table is what the user sees and can select from.
Returns True if search-store ran successfully, else False.
"""
try:
from cmdlet.search_store import CMDLET as search_store_cmdlet
args = ["-store", str(store), f"hash:{str(hash_value)}"]
log(f"[add-file] Refresh: search-store -store {store} \"hash:{hash_value}\"", file=sys.stderr)
# Run search-store under a temporary stage context so its ctx.emit() calls
# don't interfere with the outer add-file pipeline stage.
prev_ctx = ctx.get_stage_context()
temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, worker_id=getattr(prev_ctx, "worker_id", None))
ctx.set_stage_context(temp_ctx)
try:
code = search_store_cmdlet.run(None, args, config)
finally:
ctx.set_stage_context(prev_ctx)
if code != 0:
return False
# Promote the search-store result to a display overlay so the CLI prints it
# for action commands like add-file.
stage_ctx = ctx.get_stage_context()
is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
if is_last:
try:
table = ctx.get_last_result_table()
items = ctx.get_last_result_items()
if table is not None and items:
ctx.set_last_result_table_overlay(table, items, subject={"store": store, "hash": hash_value})
except Exception:
pass
return True
except Exception as exc:
debug(f"[add-file] Failed to run search-store after add-file: {type(exc).__name__}: {exc}")
return False
@staticmethod
def _prepare_metadata(
result: Any,
@@ -664,8 +870,9 @@ class Add_File(Cmdlet):
if not username or not filename:
debug(f"[add-file] ERROR: Could not extract soulseek metadata from result (type={type(result).__name__})")
if hasattr(result, "extra"):
debug(f"[add-file] Result extra keys: {list(result.extra.keys())}")
extra_val = getattr(result, "extra", None)
if isinstance(extra_val, dict):
debug(f"[add-file] Result extra keys: {list(extra_val.keys())}")
return None
if not username or not filename:
@@ -769,28 +976,55 @@ class Add_File(Cmdlet):
@staticmethod
def _handle_storage_backend(
result: Any,
media_path: Path,
backend_name: str,
pipe_obj: models.PipeObject,
config: Dict[str, Any],
delete_after: bool,
*,
collect_payloads: Optional[List[Dict[str, Any]]] = None,
suppress_last_stage_overlay: bool = False,
auto_search_store: bool = True,
) -> int:
"""Handle uploading to a registered storage backend (e.g., 'test' folder store, 'hydrus', etc.)."""
log(f"Adding file to storage backend '{backend_name}': {media_path.name}", file=sys.stderr)
delete_after_effective = bool(delete_after)
if not delete_after_effective:
# When download-media is piped into add-file, the downloaded artifact is a temp file.
# After it is persisted to a storage backend, delete the temp copy to avoid duplicates.
try:
if (
str(backend_name or "").strip().lower() != "temp"
and getattr(pipe_obj, "is_temp", False)
and getattr(pipe_obj, "action", None) == "cmdlet:download-media"
):
from config import resolve_output_dir
temp_dir = resolve_output_dir(config)
try:
if media_path.resolve().is_relative_to(temp_dir.expanduser().resolve()):
delete_after_effective = True
debug(f"[add-file] Auto-delete temp source after ingest: {media_path}")
except Exception:
# If path resolution fails, fall back to non-destructive behavior
pass
except Exception:
pass
try:
store = Store(config)
backend = store[backend_name]
# Prepare metadata from pipe_obj and sidecars
tags, url, title, f_hash = Add_File._prepare_metadata(None, media_path, pipe_obj, config)
tags, url, title, f_hash = Add_File._prepare_metadata(result, media_path, pipe_obj, config)
# Call backend's add_file with full metadata
# Backend returns hash as identifier
file_identifier = backend.add_file(
media_path,
title=title,
tags=tags,
tag=tags,
url=url
)
log(f"✓ File added to '{backend_name}': {file_identifier}", file=sys.stderr)
@@ -822,6 +1056,14 @@ class Add_File(Cmdlet):
# Keep hash/store for downstream commands (get-tag, get-file, etc.).
resolved_hash = file_identifier if len(file_identifier) == 64 else (f_hash or file_identifier or "unknown")
# If we have url(s), ensure they get associated with the destination file.
# This mirrors `add-url` behavior but avoids emitting extra pipeline noise.
if url:
try:
backend.add_url(resolved_hash, list(url))
except Exception:
pass
meta: Dict[str, Any] = {}
try:
meta = backend.get_metadata(resolved_hash) or {}
@@ -865,9 +1107,30 @@ class Add_File(Cmdlet):
"tag": list(tags or []),
"url": list(url or []),
}
Add_File._emit_storage_result(payload)
if collect_payloads is not None:
try:
collect_payloads.append(payload)
except Exception:
pass
# Keep the add-file 1-row summary overlay (when last stage), then emit the
# canonical search-store payload/table for piping/selection consistency.
if auto_search_store and resolved_hash and resolved_hash != "unknown":
# Show the add-file summary (overlay only) but let search-store provide the downstream payload.
Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=False)
ok = Add_File._try_emit_search_store_by_hash(
store=backend_name,
hash_value=resolved_hash,
config=config,
)
if not ok:
# Fall back to emitting the add-file payload so downstream stages still receive an item.
ctx.emit(payload)
else:
Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=True)
Add_File._cleanup_after_success(media_path, delete_source=delete_after)
Add_File._cleanup_after_success(media_path, delete_source=delete_after_effective)
return 0
except Exception as exc:

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
from typing import Any, Dict, Sequence
import sys
from . import register
import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from SYS.logger import log
@@ -12,19 +11,24 @@ from Store import Store
class Add_Url(Cmdlet):
"""Add URL associations to files via hash+store."""
NAME = "add-url"
SUMMARY = "Associate a URL with a file"
USAGE = "@1 | add-url <url>"
ARGS = [
SharedArgs.HASH,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to associate"),
]
DETAIL = [
"- Associates URL with file identified by hash+store",
"- Multiple url can be comma-separated",
]
def __init__(self) -> None:
super().__init__(
name="add-url",
summary="Associate a URL with a file",
usage="@1 | add-url <url>",
arg=[
SharedArgs.HASH,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to associate"),
],
detail=[
"- Associates URL with file identified by hash+store",
"- Multiple url can be comma-separated",
],
exec=self.run,
)
self.register()
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Add URL to file via hash+store backend."""
@@ -78,8 +82,7 @@ class Add_Url(Cmdlet):
return 1
# Register cmdlet
register(["add-url", "add_url"])(Add_Url)
CMDLET = Add_Url()

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
from typing import Any, Dict, Sequence
import sys
from . import register
import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from SYS.logger import log
@@ -12,19 +11,24 @@ from Store import Store
class Delete_Url(Cmdlet):
"""Delete URL associations from files via hash+store."""
NAME = "delete-url"
SUMMARY = "Remove a URL association from a file"
USAGE = "@1 | delete-url <url>"
ARGS = [
SharedArgs.HASH,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to remove"),
]
DETAIL = [
"- Removes URL association from file identified by hash+store",
"- Multiple url can be comma-separated",
]
def __init__(self) -> None:
super().__init__(
name="delete-url",
summary="Remove a URL association from a file",
usage="@1 | delete-url <url>",
arg=[
SharedArgs.HASH,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to remove"),
],
detail=[
"- Removes URL association from file identified by hash+store",
"- Multiple url can be comma-separated",
],
exec=self.run,
)
self.register()
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Delete URL from file via hash+store backend."""
@@ -78,5 +82,4 @@ class Delete_Url(Cmdlet):
return 1
# Register cmdlet
register(["delete-url", "del-url", "delete_url"])(Delete_Url)
CMDLET = Delete_Url()

View File

@@ -190,9 +190,11 @@ class Download_File(Cmdlet):
# If this looks like a provider item and providers are available, prefer provider.download()
downloaded_path: Optional[Path] = None
attempted_provider_download = False
if table and get_search_provider and SearchResult:
provider = get_search_provider(str(table), config)
if provider is not None:
attempted_provider_download = True
sr = SearchResult(
table=str(table),
title=str(title or "Unknown"),
@@ -202,6 +204,19 @@ class Download_File(Cmdlet):
debug(f"[download-file] Downloading provider item via {table}: {sr.title}")
downloaded_path = provider.download(sr, final_output_dir)
# OpenLibrary: if provider download failed, do NOT try to download the OpenLibrary page HTML.
if downloaded_path is None and attempted_provider_download and str(table or "").lower() == "openlibrary":
availability = None
reason = None
if isinstance(full_metadata, dict):
availability = full_metadata.get("availability")
reason = full_metadata.get("availability_reason")
msg = "[download-file] OpenLibrary item not downloadable"
if availability or reason:
msg += f" (availability={availability or ''} reason={reason or ''})"
log(msg, file=sys.stderr)
continue
# Fallback: if we have a direct HTTP URL, download it directly
if downloaded_path is None and isinstance(target, str) and target.startswith("http"):
debug(f"[download-file] Provider item looks like direct URL, downloading: {target}")

View File

@@ -693,6 +693,7 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
return
# Extract relevant fields
webpage_url = info.get("webpage_url") or info.get("original_url") or info.get("url")
result_container[0] = {
"extractor": info.get("extractor", ""),
"title": info.get("title", ""),
@@ -700,7 +701,9 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
"duration": info.get("duration"),
"uploader": info.get("uploader"),
"description": info.get("description"),
"url": url,
# Keep both the requested and canonical URL forms; callers should prefer webpage_url.
"requested_url": url,
"webpage_url": webpage_url,
}
except Exception as exc:
log(f"Probe error for {url}: {exc}")
@@ -1220,9 +1223,359 @@ class Download_Media(Cmdlet):
log(f"Invalid clip format: {clip_spec}", file=sys.stderr)
return 1
quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False
storage = None
hydrus_available = True
try:
from Store import Store
storage = Store(config=config or {}, suppress_debug=True)
from API.HydrusNetwork import is_hydrus_available
hydrus_available = bool(is_hydrus_available(config or {}))
except Exception:
storage = None
def _preflight_url_duplicate(candidate_url: str, extra_urls: Optional[Sequence[str]] = None) -> bool:
# NOTE: download-media sets _quiet_background_output=True when running in a pipeline to
# reduce background noise. URL de-dup is interactive and must still run in pipelines.
if storage is None:
debug("Preflight URL check skipped: storage unavailable")
return True
debug(f"Preflight URL check: candidate={candidate_url}")
try:
from metadata import normalize_urls
except Exception:
normalize_urls = None # type: ignore[assignment]
needles: List[str] = []
if normalize_urls is not None:
for raw in [candidate_url, *(list(extra_urls) if extra_urls else [])]:
try:
needles.extend(normalize_urls(raw))
except Exception:
continue
# Fallback: always have at least one needle
if not needles:
needles = [str(candidate_url)]
# Deduplicate needles (preserve order)
seen_needles: List[str] = []
for needle in needles:
if needle and needle not in seen_needles:
seen_needles.append(needle)
needles = seen_needles
try:
debug(f"Preflight URL needles: {needles}")
except Exception:
pass
url_matches: List[Dict[str, Any]] = []
try:
from Store.HydrusNetwork import HydrusNetwork
# Avoid searching the temp/download directory backend during dedup.
# We only want to warn about duplicates in real stores.
backend_names_all = storage.list_searchable_backends()
backend_names: List[str] = []
skipped: List[str] = []
for backend_name in backend_names_all:
try:
backend = storage[backend_name]
except Exception:
continue
try:
if str(backend_name).strip().lower() == "temp":
skipped.append(backend_name)
continue
except Exception:
pass
# Heuristic: if a Folder backend points at the configured temp output dir, skip it.
try:
backend_location = getattr(backend, "_location", None)
if backend_location and final_output_dir:
backend_path = Path(str(backend_location)).expanduser().resolve()
temp_path = Path(str(final_output_dir)).expanduser().resolve()
if backend_path == temp_path:
skipped.append(backend_name)
continue
except Exception:
pass
backend_names.append(backend_name)
try:
if skipped:
debug(f"Preflight backends: {backend_names} (skipped temp: {skipped})")
else:
debug(f"Preflight backends: {backend_names}")
except Exception:
pass
for backend_name in backend_names:
backend = storage[backend_name]
if isinstance(backend, HydrusNetwork) and not hydrus_available:
continue
backend_hits: List[Dict[str, Any]] = []
for needle in needles:
try:
backend_hits = backend.search(f"url:{needle}", limit=25) or []
if backend_hits:
break
except Exception:
continue
if backend_hits:
url_matches.extend([dict(x) if isinstance(x, dict) else {"title": str(x)} for x in backend_hits])
if len(url_matches) >= 25:
url_matches = url_matches[:25]
break
except Exception:
url_matches = []
if not url_matches:
debug("Preflight URL check: no matches")
return True
table = ResultTable(f"URL already exists ({len(url_matches)} match(es))")
results_list: List[Dict[str, Any]] = []
for item in url_matches:
if "title" not in item:
item["title"] = item.get("name") or item.get("target") or item.get("path") or "Result"
table.add_result(item)
results_list.append(item)
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table(table, results_list)
print(f"\n{table}")
response = input("Continue anyway? (y/n): ").strip().lower()
if response not in {"y", "yes"}:
return False
return True
def _canonicalize_url_for_storage(requested_url: str) -> str:
# Prefer yt-dlp's canonical webpage URL (e.g. strips timestamps/redirects).
# Fall back to the requested URL if probing fails.
# Important: when playlist item selection is used, avoid probing (can hang on large playlists).
if playlist_items:
return str(requested_url)
try:
pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15)
if isinstance(pr, dict):
for key in ("webpage_url", "original_url", "url", "requested_url"):
value = pr.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
except Exception:
pass
return str(requested_url)
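# Illustrative intent of _canonicalize_url_for_storage (URLs hypothetical): the
# requested URL may carry timestamp/redirect noise, while yt-dlp's webpage_url is
# the canonical page, which is what gets stored and de-duplicated.
#   requested: https://www.youtube.com/watch?v=abc123&t=120
#   stored:    https://www.youtube.com/watch?v=abc123   (from probe_url()["webpage_url"])
# With explicit playlist item selection the probe is skipped and the requested URL
# is used unchanged.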
# Check if we need to show format selection
playlist_items = str(parsed.get("item")) if parsed.get("item") else None
ytdl_format = parsed.get("format")
playlist_selection_handled = False
def _parse_at_selection(choice: str, *, max_index: int) -> Optional[List[int]]:
"""Parse @ selection syntax (@2, @2-5, @{1,3,5}, @2,5,7) into 1-based indices."""
raw = str(choice or "").strip()
if not raw:
return None
if raw.lower() in {"q", "quit", "cancel"}:
return None
if raw == "@*" or raw == "*":
return list(range(1, max_index + 1))
if raw.startswith("@"):
raw = raw[1:].strip()
if raw.startswith("{") and raw.endswith("}"):
raw = raw[1:-1].strip()
if not raw:
return None
indices: set[int] = set()
for part in raw.split(","):
part = part.strip()
if not part:
continue
if "-" in part:
left, right = [p.strip() for p in part.split("-", 1)]
if not left or not right:
return None
try:
start = int(left)
end = int(right)
except ValueError:
return None
if start < 1 or end < 1:
return None
if end < start:
start, end = end, start
for i in range(start, end + 1):
if 1 <= i <= max_index:
indices.add(i)
else:
try:
i = int(part)
except ValueError:
return None
if 1 <= i <= max_index:
indices.add(i)
if not indices:
return None
return sorted(indices)
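# Expected parses for _parse_at_selection with max_index=10 (results are sorted,
# 1-based, and clipped to max_index):
#   "@3"       -> [3]
#   "@2-5"     -> [2, 3, 4, 5]
#   "@{1,3,5}" -> [1, 3, 5]
#   "2,5,12"   -> [2, 5]            (12 exceeds max_index and is dropped)
#   "@*"       -> [1, 2, ..., 10]
#   "q"        -> None              (cancel)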
def _maybe_prompt_playlist_items(url: str) -> Optional[Dict[str, Any]]:
"""If URL appears to be a playlist/channel/collection, prompt user for @ selection.
Returns:
- None if URL is not a playlist-like multi-entry page (or probe fails)
- Dict with keys:
- cancel: bool
- playlist_items: Optional[str] (None means download all)
- selected_urls: Optional[List[str]] (expanded per-entry urls when available)
"""
try:
pr = probe_url(url, no_playlist=False, timeout_seconds=15)
except Exception:
pr = None
if not isinstance(pr, dict):
return None
entries = pr.get("entries")
if not isinstance(entries, list) or len(entries) <= 1:
return None
# Display table (limit rows to keep output reasonable)
max_rows = 200
display_entries = entries[:max_rows]
total = len(entries)
def _entry_to_url(entry: Any) -> Optional[str]:
if not isinstance(entry, dict):
return None
# Prefer explicit absolute URLs when present
for key in ("webpage_url", "original_url", "url"):
v = entry.get(key)
if isinstance(v, str) and v.strip():
s = v.strip()
try:
if urlparse(s).scheme in {"http", "https"}:
return s
except Exception:
return s
# Best-effort YouTube fallback from id
entry_id = entry.get("id")
if isinstance(entry_id, str) and entry_id.strip():
extractor_name = str(pr.get("extractor") or pr.get("extractor_key") or "").lower()
if "youtube" in extractor_name:
return f"https://www.youtube.com/watch?v={entry_id.strip()}"
return None
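# Illustrative entry shapes for _entry_to_url (values hypothetical):
#   {"webpage_url": "https://example.com/v/1"}   -> "https://example.com/v/1"
#   {"id": "abc123"} + a YouTube extractor       -> "https://www.youtube.com/watch?v=abc123"
#   {"id": "abc123"} + any other extractor       -> None (no absolute URL derivable)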
table = ResultTable()
table.title = f"Playlist items ({total}{' shown ' + str(len(display_entries)) if total > max_rows else ''})"
table.set_source_command("download-media", [url])
try:
table.set_preserve_order(True)
except Exception:
pass
results_list: List[Dict[str, Any]] = []
for idx, entry in enumerate(display_entries, 1):
title = None
uploader = None
duration = None
try:
if isinstance(entry, dict):
title = entry.get("title")
uploader = entry.get("uploader") or pr.get("uploader")
duration = entry.get("duration")
except Exception:
pass
row: Dict[str, Any] = {
"table": "download-media",
"title": str(title or f"Item {idx}"),
"detail": str(uploader or ""),
"media_kind": "playlist-item",
"playlist_index": idx,
"columns": [
("#", str(idx)),
("Title", str(title or "")),
("Duration", str(duration or "")),
("Uploader", str(uploader or "")),
],
}
results_list.append(row)
table.add_result(row)
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table(table, results_list)
print(f"\n{table}")
choice = input("Select items to download (@N, @2-5, @{1,3}, @*, or 'q' to cancel): ").strip()
if not choice or choice.lower() in {"q", "quit", "cancel"}:
return {"cancel": True, "playlist_items": None, "selected_urls": []}
if choice.strip() == "@*" or choice.strip() == "*":
# @* means all entries, not just displayed rows.
selected_urls: List[str] = []
for entry in entries:
u = _entry_to_url(entry)
if u and u not in selected_urls:
selected_urls.append(u)
# Only expand when we can derive URLs for all entries; otherwise fall back to yt-dlp playlist handling.
if len(selected_urls) == len(entries):
return {"cancel": False, "playlist_items": None, "selected_urls": selected_urls}
return {"cancel": False, "playlist_items": None, "selected_urls": []}
parsed_indices = _parse_at_selection(choice, max_index=len(display_entries))
if not parsed_indices:
log("Invalid selection. Use @N, @2-5, @{1,3}, or @*", file=sys.stderr)
return {"cancel": True, "playlist_items": None, "selected_urls": []}
selected_urls: List[str] = []
for i in parsed_indices:
try:
entry = display_entries[i - 1]
except Exception:
continue
u = _entry_to_url(entry)
if u and u not in selected_urls:
selected_urls.append(u)
# If we can expand per-entry URLs, return them.
if selected_urls and len(selected_urls) == len(parsed_indices):
return {"cancel": False, "playlist_items": None, "selected_urls": selected_urls}
# yt-dlp accepts comma-separated 1-based indices for playlist_items
return {"cancel": False, "playlist_items": ",".join(str(i) for i in parsed_indices), "selected_urls": []}
# Playlist/multi-entry detection: if the URL has multiple items and the user didn't
# specify -item, prompt for @ selection (supports @* for all).
if len(supported_url) == 1 and not playlist_items and not ytdl_format:
candidate_url = supported_url[0]
selection_info = _maybe_prompt_playlist_items(candidate_url)
if selection_info is not None:
playlist_selection_handled = True
if bool(selection_info.get("cancel")):
return 0
selected_urls = selection_info.get("selected_urls")
if isinstance(selected_urls, list) and selected_urls:
# Expand playlist/channel URL into per-entry URLs so that de-dup preflight
# and downloads operate per file.
supported_url = selected_urls
playlist_items = None
else:
playlist_items = selection_info.get("playlist_items")
# If no -item, no explicit -format specified, and single URL, show the format table.
# Do NOT stop to show formats when -audio is used (auto-pick) or when -clip is used.
@@ -1232,8 +1585,15 @@ class Download_Media(Cmdlet):
and not playlist_items
and not ytdl_format
and len(supported_url) == 1
and not playlist_selection_handled
):
url = supported_url[0]
canonical_url = _canonicalize_url_for_storage(url)
if not _preflight_url_duplicate(canonical_url, extra_urls=[url]):
log(f"Skipping download: {url}", file=sys.stderr)
return 0
formats = list_formats(url, no_playlist=False)
if formats and len(formats) > 1:
@@ -1379,12 +1739,18 @@ class Download_Media(Cmdlet):
# Download each URL
downloaded_count = 0
clip_sections_spec = self._build_clip_sections_spec(clip_range)
quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False
for url in supported_url:
try:
debug(f"Processing: {url}")
canonical_url = _canonicalize_url_for_storage(url)
# Preflight: warn if URL already exists in storage backends.
if not _preflight_url_duplicate(canonical_url, extra_urls=[url]):
log(f"Skipping download: {url}", file=sys.stderr)
continue
# If playlist_items is specified but looks like a format ID (e.g. from table selection),
# treat it as a format selector instead of playlist items.
# This handles the case where @N selection passes -item <format_id>
@@ -1532,24 +1898,17 @@ class Download_Media(Cmdlet):
if title and f"title:{title}" not in tag:
tag.insert(0, f"title:{title}")
# Build a single canonical URL field; prefer yt-dlp provided webpage_url or info.url,
# but fall back to the original requested URL. If multiple unique urls are available,
# join them into a comma-separated string.
urls_to_consider: List[str] = []
try:
    page_url = info.get("webpage_url") or info.get("url")
    if page_url:
        urls_to_consider.append(str(page_url))
except Exception:
    pass
if url:
    urls_to_consider.append(str(url))
seen_urls: List[str] = []
for u in urls_to_consider:
    if u and u not in seen_urls:
        seen_urls.append(u)
final_url = ",".join(seen_urls) if seen_urls else None
# Store the canonical URL for de-dup/search purposes.
# Prefer yt-dlp's webpage_url, and do not mix in the raw requested URL (which may contain timestamps).
final_url = None
try:
    page_url = info.get("webpage_url") or info.get("original_url") or info.get("url")
    if page_url:
        final_url = str(page_url)
except Exception:
    pass
if not final_url and url:
    final_url = str(url)
# Construct canonical PipeObject dict: hash, store, path, url, title, tags
# Prefer explicit backend names (storage_name/storage_location). If none, default to PATH
@@ -1561,6 +1920,7 @@ class Download_Media(Cmdlet):
"url": final_url,
"tag": tag,
"action": "cmdlet:download-media",
"is_temp": True,
# download_mode removed (deprecated), keep media_kind
"store": getattr(opts, "storage_name", None) or getattr(opts, "storage_location", None) or "PATH",
"media_kind": "video" if opts.mode == "video" else "audio",

View File

@@ -184,6 +184,32 @@ class Get_Metadata(Cmdlet):
mime_type = metadata.get("mime") or metadata.get("ext", "")
file_size = metadata.get("size")
duration_seconds = metadata.get("duration")
if duration_seconds is None:
duration_seconds = metadata.get("duration_seconds")
if duration_seconds is None:
duration_seconds = metadata.get("length")
if duration_seconds is None and isinstance(metadata.get("duration_ms"), (int, float)):
try:
duration_seconds = float(metadata["duration_ms"]) / 1000.0
except Exception:
duration_seconds = None
if isinstance(duration_seconds, str):
s = duration_seconds.strip()
if s:
try:
duration_seconds = float(s)
except ValueError:
if ":" in s:
parts = [p.strip() for p in s.split(":") if p.strip()]
if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
nums = [int(p) for p in parts]
if len(nums) == 2:
duration_seconds = float(nums[0] * 60 + nums[1])
else:
duration_seconds = float(nums[0] * 3600 + nums[1] * 60 + nums[2])
else:
duration_seconds = None
pages = metadata.get("pages")
url = metadata.get("url") or []
imported_ts = self._extract_imported_ts(metadata)
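The branching above reduces to one coercion rule for duration; a standalone sketch (hypothetical helper name) that mirrors it:
from typing import Any, Dict, Optional

def coerce_duration_seconds(metadata: Dict[str, Any]) -> Optional[float]:
    # Mirrors the lookup order above: duration -> duration_seconds -> length -> duration_ms / 1000.
    value = metadata.get("duration")
    if value is None:
        value = metadata.get("duration_seconds")
    if value is None:
        value = metadata.get("length")
    if value is None and isinstance(metadata.get("duration_ms"), (int, float)):
        return float(metadata["duration_ms"]) / 1000.0
    if isinstance(value, str):
        s = value.strip()
        if not s:
            return None
        try:
            return float(s)
        except ValueError:
            parts = [p.strip() for p in s.split(":") if p.strip()]
            if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
                nums = [int(p) for p in parts]
                return float(nums[0] * 60 + nums[1]) if len(nums) == 2 else float(nums[0] * 3600 + nums[1] * 60 + nums[2])
            return None
    if isinstance(value, (int, float)):
        return float(value)
    return None

# coerce_duration_seconds({"duration": "1:02:03"}) -> 3723.0
# coerce_duration_seconds({"duration_ms": 90500})  -> 90.5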

View File

@@ -12,7 +12,13 @@ from __future__ import annotations
import sys
from SYS.logger import log, debug
try:
from Provider.openlibrary import OpenLibrary
_ol_scrape_isbn_metadata = OpenLibrary.scrape_isbn_metadata
_ol_scrape_openlibrary_metadata = OpenLibrary.scrape_openlibrary_metadata
except Exception:
_ol_scrape_isbn_metadata = None # type: ignore[assignment]
_ol_scrape_openlibrary_metadata = None # type: ignore[assignment]
from Provider.metadata_provider import get_metadata_provider, list_metadata_providers
import subprocess
from pathlib import Path
@@ -31,6 +37,10 @@ except ImportError:
extract_title = None
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]
@@ -691,249 +701,22 @@ def _extract_url_formats(formats: list) -> List[Tuple[str, str]]:
def _scrape_isbn_metadata(isbn: str) -> List[str]:
"""Scrape metadata for an ISBN using Open Library API."""
new_tags = []
if _ol_scrape_isbn_metadata is None:
log("OpenLibrary scraper unavailable", file=sys.stderr)
return []
try:
from ..API.HTTP import HTTPClient
import json as json_module
isbn_clean = isbn.replace('-', '').strip()
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode('utf-8'))
except Exception as e:
log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
return []
if not data:
log(f"No ISBN metadata found for: {isbn}")
return []
book_data = next(iter(data.values()), None)
if not book_data:
return []
if 'title' in book_data:
new_tags.append(f"title:{book_data['title']}")
if 'authors' in book_data and isinstance(book_data['authors'], list):
for author in book_data['authors'][:3]:
if 'name' in author:
new_tags.append(f"author:{author['name']}")
if 'publish_date' in book_data:
new_tags.append(f"publish_date:{book_data['publish_date']}")
if 'publishers' in book_data and isinstance(book_data['publishers'], list):
for pub in book_data['publishers'][:1]:
if 'name' in pub:
new_tags.append(f"publisher:{pub['name']}")
if 'description' in book_data:
desc = book_data['description']
if isinstance(desc, dict) and 'value' in desc:
desc = desc['value']
if desc:
desc_str = str(desc).strip()
# Include description if available (limit to 200 chars to keep it manageable)
if len(desc_str) > 0:
new_tags.append(f"description:{desc_str[:200]}")
if 'number_of_pages' in book_data:
page_count = book_data['number_of_pages']
if page_count and isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict):
identifiers = book_data['identifiers']
if 'openlibrary' in identifiers:
ol_ids = identifiers['openlibrary']
if isinstance(ol_ids, list) and ol_ids:
new_tags.append(f"openlibrary:{ol_ids[0]}")
elif isinstance(ol_ids, str):
new_tags.append(f"openlibrary:{ol_ids}")
if 'lccn' in identifiers:
lccn_list = identifiers['lccn']
if isinstance(lccn_list, list) and lccn_list:
new_tags.append(f"lccn:{lccn_list[0]}")
elif isinstance(lccn_list, str):
new_tags.append(f"lccn:{lccn_list}")
if 'oclc' in identifiers:
oclc_list = identifiers['oclc']
if isinstance(oclc_list, list) and oclc_list:
new_tags.append(f"oclc:{oclc_list[0]}")
elif isinstance(oclc_list, str):
new_tags.append(f"oclc:{oclc_list}")
if 'goodreads' in identifiers:
goodreads_list = identifiers['goodreads']
if isinstance(goodreads_list, list) and goodreads_list:
new_tags.append(f"goodreads:{goodreads_list[0]}")
elif isinstance(goodreads_list, str):
new_tags.append(f"goodreads:{goodreads_list}")
if 'librarything' in identifiers:
lt_list = identifiers['librarything']
if isinstance(lt_list, list) and lt_list:
new_tags.append(f"librarything:{lt_list[0]}")
elif isinstance(lt_list, str):
new_tags.append(f"librarything:{lt_list}")
if 'doi' in identifiers:
doi_list = identifiers['doi']
if isinstance(doi_list, list) and doi_list:
new_tags.append(f"doi:{doi_list[0]}")
elif isinstance(doi_list, str):
new_tags.append(f"doi:{doi_list}")
if 'internet_archive' in identifiers:
ia_list = identifiers['internet_archive']
if isinstance(ia_list, list) and ia_list:
new_tags.append(f"internet_archive:{ia_list[0]}")
elif isinstance(ia_list, str):
new_tags.append(f"internet_archive:{ia_list}")
log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
return new_tags
return list(_ol_scrape_isbn_metadata(isbn))
except Exception as e:
log(f"ISBN scraping error: {e}", file=sys.stderr)
return []
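Because removed and added lines are interleaved above, the net effect is easy to miss: the local function now only guards for availability and delegates to the provider. Reassembled from the added lines, the new body is roughly:
def _scrape_isbn_metadata(isbn: str) -> List[str]:
    """Scrape metadata for an ISBN using Open Library API."""
    if _ol_scrape_isbn_metadata is None:
        log("OpenLibrary scraper unavailable", file=sys.stderr)
        return []
    try:
        return list(_ol_scrape_isbn_metadata(isbn))
    except Exception as e:
        log(f"ISBN scraping error: {e}", file=sys.stderr)
        return []

# _scrape_openlibrary_metadata(olid) below follows the same shape, delegating to
# OpenLibrary.scrape_openlibrary_metadata.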
def _scrape_openlibrary_metadata(olid: str) -> List[str]:
"""Scrape metadata for an OpenLibrary ID using the .json API endpoint.
Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
- Title, authors, publish date, publishers
- Description
- Subjects as freeform tags (without namespace prefix)
- Identifiers (ISBN, LCCN, OCLC, etc.)
"""
new_tags = []
if _ol_scrape_openlibrary_metadata is None:
log("OpenLibrary scraper unavailable", file=sys.stderr)
return []
try:
from ..API.HTTP import HTTPClient
import json as json_module
# Format: OL9674499M or just 9674499M
olid_clean = olid.replace('OL', '').replace('M', '')
if not olid_clean.isdigit():
olid_clean = olid
# Ensure we have the full OLID format for the URL
if not olid.startswith('OL'):
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
else:
url = f"https://openlibrary.org/books/{olid}.json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode('utf-8'))
except Exception as e:
log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
return []
if not data:
log(f"No OpenLibrary metadata found for: {olid}")
return []
# Add title
if 'title' in data:
new_tags.append(f"title:{data['title']}")
# Add authors
if 'authors' in data and isinstance(data['authors'], list):
for author in data['authors'][:3]:
if isinstance(author, dict) and 'name' in author:
new_tags.append(f"author:{author['name']}")
elif isinstance(author, str):
new_tags.append(f"author:{author}")
# Add publish date
if 'publish_date' in data:
new_tags.append(f"publish_date:{data['publish_date']}")
# Add publishers
if 'publishers' in data and isinstance(data['publishers'], list):
for pub in data['publishers'][:1]:
if isinstance(pub, dict) and 'name' in pub:
new_tags.append(f"publisher:{pub['name']}")
elif isinstance(pub, str):
new_tags.append(f"publisher:{pub}")
# Add description
if 'description' in data:
desc = data['description']
if isinstance(desc, dict) and 'value' in desc:
desc = desc['value']
if desc:
desc_str = str(desc).strip()
if len(desc_str) > 0:
new_tags.append(f"description:{desc_str[:200]}")
# Add number of pages
if 'number_of_pages' in data:
page_count = data['number_of_pages']
if page_count and isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
# Add subjects as FREEFORM tags (no namespace prefix)
if 'subjects' in data and isinstance(data['subjects'], list):
for subject in data['subjects'][:10]:
if subject and isinstance(subject, str):
subject_clean = str(subject).strip()
if subject_clean and subject_clean not in new_tags:
new_tags.append(subject_clean)
# Add identifiers
if 'identifiers' in data and isinstance(data['identifiers'], dict):
identifiers = data['identifiers']
if 'isbn_10' in identifiers:
isbn_10_list = identifiers['isbn_10']
if isinstance(isbn_10_list, list) and isbn_10_list:
new_tags.append(f"isbn_10:{isbn_10_list[0]}")
elif isinstance(isbn_10_list, str):
new_tags.append(f"isbn_10:{isbn_10_list}")
if 'isbn_13' in identifiers:
isbn_13_list = identifiers['isbn_13']
if isinstance(isbn_13_list, list) and isbn_13_list:
new_tags.append(f"isbn_13:{isbn_13_list[0]}")
elif isinstance(isbn_13_list, str):
new_tags.append(f"isbn_13:{isbn_13_list}")
if 'lccn' in identifiers:
lccn_list = identifiers['lccn']
if isinstance(lccn_list, list) and lccn_list:
new_tags.append(f"lccn:{lccn_list[0]}")
elif isinstance(lccn_list, str):
new_tags.append(f"lccn:{lccn_list}")
if 'oclc_numbers' in identifiers:
oclc_list = identifiers['oclc_numbers']
if isinstance(oclc_list, list) and oclc_list:
new_tags.append(f"oclc:{oclc_list[0]}")
elif isinstance(oclc_list, str):
new_tags.append(f"oclc:{oclc_list}")
if 'goodreads' in identifiers:
goodreads_list = identifiers['goodreads']
if isinstance(goodreads_list, list) and goodreads_list:
new_tags.append(f"goodreads:{goodreads_list[0]}")
elif isinstance(goodreads_list, str):
new_tags.append(f"goodreads:{goodreads_list}")
log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
return new_tags
return list(_ol_scrape_openlibrary_metadata(olid))
except Exception as e:
log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
return []

View File

@@ -1,28 +1,40 @@
from __future__ import annotations
from typing import Any, Dict, Sequence
from dataclasses import dataclass
from typing import Any, Dict, List, Sequence
import sys
from . import register
import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from ._shared import Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from SYS.logger import log
from Store import Store
@dataclass
class UrlItem:
url: str
hash: str
store: str
class Get_Url(Cmdlet):
"""Get url associated with files via hash+store."""
NAME = "get-url"
SUMMARY = "List url associated with a file"
USAGE = "@1 | get-url"
ARGS = [
SharedArgs.HASH,
SharedArgs.STORE,
]
DETAIL = [
"- Lists all url associated with file identified by hash+store",
]
def __init__(self) -> None:
super().__init__(
name="get-url",
summary="List url associated with a file",
usage="@1 | get-url",
arg=[
SharedArgs.HASH,
SharedArgs.STORE,
],
detail=[
"- Lists all url associated with file identified by hash+store",
],
exec=self.run,
)
self.register()
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Get url for file via hash+store backend."""
@@ -53,18 +65,34 @@ class Get_Url(Cmdlet):
urls = backend.get_url(file_hash)
if urls:
for u in urls:
# Emit rich object for pipeline compatibility
ctx.emit({
"url": u,
"hash": file_hash,
"store": store_name,
})
return 0
else:
ctx.emit("No url found")
return 0
from result_table import ResultTable
title = str(get_field(result, "title") or "").strip()
table_title = "Title"
if title:
table_title = f"Title: {title}"
table = ResultTable(table_title, max_columns=1).set_preserve_order(True)
table.set_source_command("get-url", [])
items: List[UrlItem] = []
for u in list(urls or []):
u = str(u or "").strip()
if not u:
continue
row = table.add_row()
row.add_column("Url", u)
item = UrlItem(url=u, hash=file_hash, store=str(store_name))
items.append(item)
ctx.emit(item)
# Make this a real result table so @.. / @,, can navigate it
ctx.set_last_result_table(table if items else None, items, subject=result)
if not items:
log("No url found", file=sys.stderr)
return 0
except KeyError:
log(f"Error: Storage backend '{store_name}' not configured")
@@ -74,7 +102,6 @@ class Get_Url(Cmdlet):
return 1
# Register cmdlet
register(["get-url", "get_url"])(Get_Url)
CMDLET = Get_Url()
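For reference, each emitted row is now a structured item rather than a bare string, so downstream selection (@1, @2, ...) keeps the hash/store context. Values below are illustrative only:
item = UrlItem(
    url="https://example.com/watch?v=abc",  # illustrative URL
    hash="a" * 64,                          # illustrative 64-hex SHA256
    store="home",                           # illustrative backend name
)
ctx.emit(item)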

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
from typing import Any, Dict, Sequence, List, Optional, Tuple
from pathlib import Path
from dataclasses import dataclass, field
from collections import OrderedDict
import re
import json
@@ -11,57 +10,9 @@ import sys
from SYS.logger import log, debug
from ._shared import Cmdlet, CmdletArg, get_field, should_show_help
from ._shared import Cmdlet, CmdletArg, get_field, should_show_help, normalize_hash, first_title_tag
import pipeline as ctx
# Optional dependencies
try:
import mutagen # type: ignore
except ImportError: # pragma: no cover
mutagen = None # type: ignore
try:
from config import get_hydrus_url, resolve_output_dir
except Exception: # pragma: no cover
get_hydrus_url = None # type: ignore
resolve_output_dir = None # type: ignore
try:
from API.HydrusNetwork import HydrusNetwork, HydrusRequestError
except ImportError: # pragma: no cover
HydrusNetwork = None # type: ignore
HydrusRequestError = RuntimeError # type: ignore
try:
from SYS.utils import sha256_file
except ImportError: # pragma: no cover
sha256_file = None # type: ignore
try:
from SYS.utils_constant import mime_maps
except ImportError: # pragma: no cover
mime_maps = {} # type: ignore
@dataclass(slots=True)
class SearchRecord:
path: str
size_bytes: int | None = None
duration_seconds: str | None = None
tag: str | None = None
hash: str | None = None
def as_dict(self) -> dict[str, str]:
payload: dict[str, str] = {"path": self.path}
if self.size_bytes is not None:
payload["size"] = str(self.size_bytes)
if self.duration_seconds:
payload["duration"] = self.duration_seconds
if self.tag:
payload["tag"] = self.tag
if self.hash:
payload["hash"] = self.hash
return payload
STORAGE_ORIGINS = {"local", "hydrus", "folder"}
@@ -86,12 +37,15 @@ class Search_Store(Cmdlet):
detail=[
"Search across storage backends: Folder stores and Hydrus instances",
"Use -store to search a specific backend by name",
"URL search: url:* (any URL) or url:<value> (URL substring)",
"Filter results by: tag, size, type, duration",
"Results include hash for downstream commands (get-file, add-tag, etc.)",
"Examples:",
"search-store foo # Search all storage backends",
"search-store -store home '*' # Search 'home' Hydrus instance",
"search-store -store test 'video' # Search 'test' folder store",
"search-store 'url:*' # Files that have any URL",
"search-store 'url:youtube.com' # Files whose URL contains substring",
"search-store song -type audio # Search for audio files",
"search-store movie -tag action # Search with tag filter",
],
@@ -100,6 +54,40 @@ class Search_Store(Cmdlet):
self.register()
# --- Helper methods -------------------------------------------------
@staticmethod
def _parse_hash_query(query: str) -> List[str]:
"""Parse a `hash:` query into a list of normalized 64-hex SHA256 hashes.
Supported examples:
- hash:<h1>,<h2>,<h3>
- Hash: <h1> <h2> <h3>
- hash:{<h1>, <h2>}
"""
q = str(query or "").strip()
if not q:
return []
m = re.match(r"^hash(?:es)?\s*:\s*(.+)$", q, flags=re.IGNORECASE)
if not m:
return []
rest = (m.group(1) or "").strip()
if rest.startswith("{") and rest.endswith("}"):
rest = rest[1:-1].strip()
if rest.startswith("[") and rest.endswith("]"):
rest = rest[1:-1].strip()
# Split on commas and whitespace.
raw_parts = [p.strip() for p in re.split(r"[\s,]+", rest) if p.strip()]
out: List[str] = []
for part in raw_parts:
h = normalize_hash(part)
if not h:
continue
if h not in out:
out.append(h)
return out
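Assuming normalize_hash passes lowercase 64-hex strings through unchanged, the documented forms all normalize to the same list:
h1, h2 = "a" * 64, "b" * 64
assert Search_Store._parse_hash_query(f"hash:{h1},{h2}") == [h1, h2]
assert Search_Store._parse_hash_query(f"Hash: {h1} {h2}") == [h1, h2]
assert Search_Store._parse_hash_query("hash:{" + h1 + ", " + h2 + "}") == [h1, h2]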
@staticmethod
def _normalize_extension(ext_value: Any) -> str:
"""Sanitize extension strings to alphanumerics and cap at 5 chars."""
@@ -150,10 +138,10 @@ class Search_Store(Cmdlet):
# Parse arguments
query = ""
tag_filters: List[str] = []
size_filter: Optional[Tuple[str, int]] = None
duration_filter: Optional[Tuple[str, float]] = None
type_filter: Optional[str] = None
_tag_filters: List[str] = []
_size_filter: Optional[Tuple[str, int]] = None
_duration_filter: Optional[Tuple[str, float]] = None
_type_filter: Optional[str] = None
storage_backend: Optional[str] = None
limit = 100
searched_backends: List[str] = []
@@ -166,7 +154,7 @@ class Search_Store(Cmdlet):
storage_backend = args_list[i + 1]
i += 2
elif low in {"-tag", "--tag"} and i + 1 < len(args_list):
tag_filters.append(args_list[i + 1])
_tag_filters.append(args_list[i + 1])
i += 2
elif low in {"-limit", "--limit"} and i + 1 < len(args_list):
try:
@@ -175,7 +163,7 @@ class Search_Store(Cmdlet):
limit = 100
i += 2
elif low in {"-type", "--type"} and i + 1 < len(args_list):
type_filter = args_list[i + 1].lower()
_type_filter = args_list[i + 1].lower()
i += 2
elif not arg.startswith("-"):
query = f"{query} {arg}".strip() if query else arg
@@ -195,6 +183,8 @@ class Search_Store(Cmdlet):
if store_filter and not storage_backend:
storage_backend = store_filter
hash_query = self._parse_hash_query(query)
if not query:
log("Provide a search query", file=sys.stderr)
return 1
@@ -230,12 +220,136 @@ class Search_Store(Cmdlet):
table_title += f" [{storage_backend}]"
table = ResultTable(table_title)
if hash_query:
try:
table.set_preserve_order(True)
except Exception:
pass
from Store import Store
storage = Store(config=config or {})
from Store._base import Store as BaseStore
backend_to_search = storage_backend or None
if hash_query:
# Explicit hash list search: build rows from backend metadata.
backends_to_try: List[str] = []
if backend_to_search:
backends_to_try = [backend_to_search]
else:
backends_to_try = list(storage.list_backends())
found_any = False
for h in hash_query:
resolved_backend_name: Optional[str] = None
resolved_backend = None
for backend_name in backends_to_try:
try:
backend = storage[backend_name]
except Exception:
continue
try:
# If get_metadata works, consider it a hit; get_file can be optional (e.g. remote URL).
meta = backend.get_metadata(h)
if meta is None:
continue
resolved_backend_name = backend_name
resolved_backend = backend
break
except Exception:
continue
if resolved_backend_name is None or resolved_backend is None:
continue
found_any = True
searched_backends.append(resolved_backend_name)
# Resolve a path/URL string if possible
path_str: Optional[str] = None
try:
maybe_path = resolved_backend.get_file(h)
if isinstance(maybe_path, Path):
path_str = str(maybe_path)
elif isinstance(maybe_path, str) and maybe_path:
path_str = maybe_path
except Exception:
path_str = None
meta_obj: Dict[str, Any] = {}
try:
meta_obj = resolved_backend.get_metadata(h) or {}
except Exception:
meta_obj = {}
tags_list: List[str] = []
try:
tag_result = resolved_backend.get_tag(h)
if isinstance(tag_result, tuple) and tag_result:
maybe_tags = tag_result[0]
else:
maybe_tags = tag_result
if isinstance(maybe_tags, list):
tags_list = [str(t).strip() for t in maybe_tags if isinstance(t, str) and str(t).strip()]
except Exception:
tags_list = []
title_from_tag: Optional[str] = None
try:
title_tag = first_title_tag(tags_list)
if title_tag and ":" in title_tag:
title_from_tag = title_tag.split(":", 1)[1].strip()
except Exception:
title_from_tag = None
title = title_from_tag or meta_obj.get("title") or meta_obj.get("name")
if not title and path_str:
try:
title = Path(path_str).stem
except Exception:
title = path_str
ext_val = meta_obj.get("ext") or meta_obj.get("extension")
if not ext_val and path_str:
try:
ext_val = Path(path_str).suffix
except Exception:
ext_val = None
size_bytes = meta_obj.get("size")
if size_bytes is None:
size_bytes = meta_obj.get("size_bytes")
try:
size_bytes_int: Optional[int] = int(size_bytes) if size_bytes is not None else None
except Exception:
size_bytes_int = None
payload: Dict[str, Any] = {
"title": str(title or h),
"hash": h,
"store": resolved_backend_name,
"path": path_str,
"ext": self._normalize_extension(ext_val),
"size_bytes": size_bytes_int,
"tag": tags_list,
}
table.add_result(payload)
results_list.append(payload)
ctx.emit(payload)
if found_any:
ctx.set_last_result_table(table, results_list)
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
db.update_worker_status(worker_id, 'completed')
return 0
log("No results found", file=sys.stderr)
db.append_worker_stdout(worker_id, json.dumps([], indent=2))
db.update_worker_status(worker_id, 'completed')
return 0
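In pipeline terms, this branch makes an explicit hash lookup usable straight from the search cmdlet, e.g. (hashes are placeholders):
search-store "hash:<h1>,<h2>" -store home   # one row per resolved hash, navigable via @N
search-store "hash:<h1> <h2>"               # no -store: every configured backend is tried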
if backend_to_search:
searched_backends.append(backend_to_search)
target_backend = storage[backend_to_search]
@@ -243,7 +357,9 @@ class Search_Store(Cmdlet):
log(f"Backend '{backend_to_search}' does not support searching", file=sys.stderr)
db.update_worker_status(worker_id, 'error')
return 1
debug(f"[search-store] Searching '{backend_to_search}'")
results = target_backend.search(query, limit=limit)
debug(f"[search-store] '{backend_to_search}' -> {len(results or [])} result(s)")
else:
from API.HydrusNetwork import is_hydrus_available
hydrus_available = is_hydrus_available(config or {})
@@ -257,7 +373,9 @@ class Search_Store(Cmdlet):
continue
searched_backends.append(backend_name)
debug(f"[search-store] Searching '{backend_name}'")
backend_results = backend.search(query, limit=limit - len(all_results))
debug(f"[search-store] '{backend_name}' -> {len(backend_results or [])} result(s)")
if backend_results:
all_results.extend(backend_results)
if len(all_results) >= limit:
@@ -317,11 +435,6 @@ class Search_Store(Cmdlet):
results_list.append(normalized)
ctx.emit(normalized)
# Debug: Verify table rows match items list
debug(f"[search-store] Added {len(table.rows)} rows to table, {len(results_list)} items to results_list")
if len(table.rows) != len(results_list):
debug(f"[search-store] WARNING: Table/items mismatch! rows={len(table.rows)} items={len(results_list)}", file=sys.stderr)
ctx.set_last_result_table(table, results_list)
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
else:

View File

@@ -3,14 +3,12 @@ import re
import subprocess
import sys
import shutil
import sqlite3
import requests
from SYS.logger import log, debug
from urllib.parse import urlsplit, urlunsplit, unquote
from collections import deque
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
from models import PipeObject, FileRelationshipTracker, _get_file_hash
from models import FileRelationshipTracker
try:
import musicbrainzngs # type: ignore
except ImportError: # pragma: no cover
@@ -332,6 +330,112 @@ def _generate_hydrus_url_variants(url: str) -> List[str]:
return variants
def normalize_urls(value: Any) -> List[str]:
"""Normalize a URL field into a stable, deduplicated list.
Accepts:
- None
- a single URL string (optionally containing multiple URLs)
- a list/tuple/set of URL strings
This helper is used by cmdlets/stores/pipeline to keep `url` consistent.
"""
def _iter_raw_urls(raw: Any) -> Iterable[str]:
if raw is None:
return
if isinstance(raw, str):
text = raw.strip()
if not text:
return
# Support legacy prefixes like "url:https://...".
if text.lower().startswith("url:"):
text = text.split(":", 1)[1].strip()
# Prefer extracting obvious URLs to avoid splitting inside query strings.
matches = re.findall(r"https?://[^\s,]+", text, flags=re.IGNORECASE)
if matches:
for m in matches:
yield m
return
# Fallback: split on commas/whitespace.
for token in text.replace("\n", " ").replace("\r", " ").replace(",", " ").split():
if token:
yield token
return
if isinstance(raw, (list, tuple, set)):
for item in raw:
if item is None:
continue
if isinstance(item, str):
if item.strip():
yield item
else:
text = str(item).strip()
if text:
yield text
return
# Last resort: string-coerce.
text = str(raw).strip()
if text:
yield text
def _canonicalize(url_text: str) -> Optional[str]:
u = str(url_text or "").strip()
if not u:
return None
# Trim common wrappers and trailing punctuation.
u = u.strip("<>\"' ")
u = u.rstrip(")].,;\"")
if not u:
return None
lower = u.lower()
if not (lower.startswith("http://") or lower.startswith("https://")):
return u
try:
parsed = urlsplit(u)
except Exception:
return u
scheme = (parsed.scheme or "").lower()
netloc = (parsed.netloc or "").lower()
path = unquote(parsed.path or "")
query = parsed.query or ""
# Normalize default ports.
if scheme == "http" and netloc.endswith(":80"):
netloc = netloc[:-3]
elif scheme == "https" and netloc.endswith(":443"):
netloc = netloc[:-4]
# Prefer no trailing slash except root.
if path and path != "/":
path = path.rstrip("/")
# Fragments are not part of the resource.
return urlunsplit((scheme, netloc, path, query, ""))
seen: Set[str] = set()
out: List[str] = []
for raw_url in _iter_raw_urls(value):
canonical = _canonicalize(raw_url)
if not canonical:
continue
if canonical in seen:
continue
seen.add(canonical)
out.append(canonical)
return out
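Tracing the helper above, for example (inputs invented for illustration):
normalize_urls("url:https://Example.com:443/Path/?q=1, https://example.com/Path?q=1")
# -> ["https://example.com/Path?q=1"]   (prefix stripped, host lowercased, :443 dropped, trailing "/" trimmed, deduped)
normalize_urls(["https://example.com/a/", None, "https://example.com/a#frag"])
# -> ["https://example.com/a"]          (None skipped, fragment dropped)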
def value_normalize(value: str) -> str:
"""Normalize whitespace: collapse internal spaces, strip, remove newlines."""
value = value.replace("\n", " ").replace("\r", " ")
@@ -358,6 +462,7 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
continue
# Ensure file entry exists
file_id: Optional[int] = None
try:
cursor = db.connection.cursor() if db.connection else None
if cursor:
@@ -394,10 +499,16 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
try:
cursor = db.connection.cursor() if db.connection else None
if cursor:
file_hash_value: Optional[str] = None
if hasattr(db, 'get_file_hash'):
try:
file_hash_value = db.get_file_hash(file_id)
except Exception:
file_hash_value = None
for tag in tags:
cursor.execute(
'INSERT OR IGNORE INTO tags (hash, tag) VALUES (?, ?)',
(file_hash_value, tag) if hasattr(db, 'get_file_hash') else (None, tag)
(file_hash_value, tag)
)
db.connection.commit()
except Exception:
@@ -663,128 +774,6 @@ def fetch_musicbrainz_tags(mbid: str, entity: str) -> Dict[str, object]:
return {"source": "musicbrainz", "id": mbid, "tag": tags, "entity": entity}
def fetch_openlibrary_tags(ol_id: str) -> Dict[str, object]:
"""Fetch metadata tags from OpenLibrary.
Args:
ol_id: OpenLibrary ID (e.g., 'OL123456M' for a book)
Returns:
Dictionary with 'tag' key containing list of extracted tags
"""
import urllib.request
# Normalize OL ID
ol_id = ol_id.strip().upper()
if not ol_id.startswith('OL'):
ol_id = f'OL{ol_id}'
# Fetch from OpenLibrary API
url = f"https://openlibrary.org/books/{ol_id}.json"
tags: List[str] = []
try:
with urllib.request.urlopen(url, timeout=10) as response:
data = json.loads(response.read().decode('utf-8'))
except Exception as e:
raise ValueError(f"Failed to fetch OpenLibrary data for {ol_id}: {e}")
# Add OpenLibrary ID tag
_add_tag(tags, "openlibrary", ol_id)
# Extract title
_add_tag(tags, "title", data.get("title"))
# Extract subtitle if present
if data.get("subtitle"):
_add_tag(tags, "subtitle", data["subtitle"])
# Extract authors
authors = data.get("authors", [])
author_names: List[str] = []
for author in authors:
if isinstance(author, dict):
name = author.get("name")
else:
name = str(author)
if name:
author_names.append(name)
if author_names:
_extend_tags(tags, "author", author_names)
# Extract publication details
if data.get("publish_date"):
_add_tag(tags, "publish_date", data["publish_date"])
# Extract year if present
year_match = re.search(r'\b(\d{4})\b', str(data.get("publish_date", "")))
if year_match:
_add_tag(tags, "year", year_match.group(1))
# Extract publishers
publishers = data.get("publishers", [])
if publishers:
publisher_names = []
for pub in publishers:
if isinstance(pub, dict):
name = pub.get("name")
else:
name = str(pub)
if name:
publisher_names.append(name)
if publisher_names:
_extend_tags(tags, "publisher", publisher_names)
# Extract languages
languages = data.get("languages", [])
if languages:
lang_codes = []
for lang in languages:
if isinstance(lang, dict):
code = lang.get("key", "").split("/")[-1]
else:
code = str(lang).split("/")[-1]
if code and code != "":
lang_codes.append(code)
if lang_codes:
_extend_tags(tags, "language", lang_codes)
# Extract ISBN
isbns = data.get("isbn_10", []) + data.get("isbn_13", [])
if isbns:
for isbn in isbns[:1]: # Just take first one
if len(str(isbn)) == 10:
_add_tag(tags, "isbn_10", isbn)
elif len(str(isbn)) == 13:
_add_tag(tags, "isbn_13", isbn)
# Extract page count
_add_tag(tags, "pages", data.get("number_of_pages"))
# Extract genres/subjects (OpenLibrary calls them subjects)
# Subjects are added as plain freeform tags (no namespace prefix)
subjects = data.get("subjects", [])
if subjects:
for subject in subjects[:10]: # Limit to 10 subjects
if isinstance(subject, dict):
name = subject.get("name")
else:
name = str(subject)
if name:
# Add subject as plain tag without "subject:" prefix
normalized = value_normalize(str(name))
if normalized:
tags.append(normalized)
# Extract OpenLibrary description
description = data.get("description")
if description:
if isinstance(description, dict):
description = description.get("value")
_add_tag(tags, "summary", description)
return {"source": "openlibrary", "id": ol_id, "tag": tags}
def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> None:
"""Append a single value if not already in seen set (deduplication)."""
if value is None:
@@ -1545,7 +1534,7 @@ def _derive_sidecar_path(media_path: Path) -> Path:
return preferred
def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]:
def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]: # pyright: ignore[reportUnusedFunction]
"""Read hash, tags, and url from sidecar file.
Consolidated with read_tags_from_file - this extracts extra metadata (hash, url).
@@ -1559,7 +1548,7 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]
hash_value: Optional[str] = None
tags: List[str] = []
url: List[str] = []
urls: List[str] = []
for raw_line in raw.splitlines():
line = raw_line.strip()
@@ -1574,15 +1563,15 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]
url_part = line.split(':', 1)[1].strip() if ':' in line else ''
if url_part:
for url_segment in url_part.split(','):
for url in url_segment.split():
url_clean = url.strip()
if url_clean and url_clean not in url:
url.append(url_clean)
for url_token in url_segment.split():
url_clean = url_token.strip()
if url_clean and url_clean not in urls:
urls.append(url_clean)
else:
# Everything else is a tag (including relationship: lines)
tags.append(line)
return hash_value, tags, url
return hash_value, tags, urls
@@ -1827,63 +1816,6 @@ def apply_title_to_path(media_path: Path, tags: Iterable[str]) -> Path:
return destination
def _collect_search_roots(payload: Dict[str, Any]) -> List[Path]:
roots: List[Path] = []
for key in ('paths', 'search_paths', 'roots', 'directories'):
raw = payload.get(key)
if not raw:
continue
entries = raw if isinstance(raw, (list, tuple, set)) else [raw]
for entry in entries:
if not entry:
continue
try:
candidate = Path(str(entry)).expanduser()
except Exception:
continue
roots.append(candidate)
if load_config is not None and resolve_output_dir is not None:
try:
config = load_config()
except Exception:
config = None
if isinstance(config, dict) and config:
try:
default_root = resolve_output_dir(config)
except Exception:
default_root = None
if default_root is not None:
roots.append(default_root)
return roots
def _locate_sidecar_by_hash(hash_value: str, roots: Iterable[Path]) -> Optional[Path]:
target = f'hash:{hash_value.strip().lower()}'
for root in roots:
try:
root_path = root.expanduser()
except Exception:
continue
if not root_path.exists() or not root_path.is_dir():
continue
for pattern in ('*.tag',):
try:
iterator = root_path.rglob(pattern)
except OSError:
continue
for candidate in iterator:
if not candidate.is_file():
continue
try:
with candidate.open('r', encoding='utf-8', errors='ignore') as handle:
for line in handle:
if line.strip().lower() == target:
return candidate
except OSError:
continue
return None
def sync_sidecar(payload: Dict[str, Any]) -> Dict[str, Any]:
path_value = payload.get('path')
if not path_value:
@@ -2506,8 +2438,8 @@ def write_tags_to_file(
# Add known url if provided - each on separate line to prevent corruption
if url:
for url in url:
content_lines.append(f"url:{url}")
for url_item in url:
content_lines.append(f"url:{url_item}")
# Add tags
if tags:
@@ -2642,10 +2574,10 @@ def detect_metadata_request(tag: str) -> Optional[Dict[str, str]]:
def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
tag = payload.get('tag')
if not isinstance(tag, str):
return {'tags': []}
return {'tag': []}
trimmed = value_normalize(tag)
if not trimmed:
return {'tags': []}
return {'tag': []}
request = detect_metadata_request(trimmed)
tags: List[str] = []
seen: Set[str] = set()
@@ -2653,7 +2585,7 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
_append_unique(tags, seen, request['base'])
else:
_append_unique(tags, seen, trimmed)
return {'tags': tags}
return {'tag': tags}
try:
if request['source'] == 'imdb':
data = imdb_tag(request['id'])
@@ -2662,8 +2594,15 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
except Exception as exc: # pragma: no cover - network/service errors
return {'tag': tags, 'error': str(exc)}
# Add tags from fetched data (no namespace, just unique append)
for tag in (data.get('tag') or []):
_append_unique(tags, seen, tag)
raw_tags = data.get('tag') if isinstance(data, dict) else None
if isinstance(raw_tags, str):
tag_iter: Iterable[str] = [raw_tags]
elif isinstance(raw_tags, (list, tuple, set)):
tag_iter = [t for t in raw_tags if isinstance(t, str)]
else:
tag_iter = []
for tag_value in tag_iter:
_append_unique(tags, seen, tag_value)
result = {
'tag': tags,
'source': request['source'],
@@ -3082,14 +3021,14 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
# Load adjective.json from workspace root
adjective_path = Path(__file__).parent / "adjective.json"
if not adjective_path.exists():
log.debug(f"adjective.json not found at {adjective_path}")
debug(f"adjective.json not found at {adjective_path}")
return tags_set
try:
with open(adjective_path, 'r') as f:
adjective_lists = json.load(f)
except Exception as e:
log.error(f"Error loading adjective.json: {e}")
debug(f"Error loading adjective.json: {e}")
return tags_set
expanded_tags = set()
@@ -3108,10 +3047,10 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
if matched_list:
# Add all tags from the list
expanded_tags.update(matched_list)
log.info(f"Expanded {tag} to {len(matched_list)} tags")
debug(f"Expanded {tag} to {len(matched_list)} tags")
else:
# List not found, log warning but don't add the reference
log.warning(f"Tag list '{list_name}' not found in adjective.json")
debug(f"Tag list '{list_name}' not found in adjective.json")
else:
# Regular tag, keep as is
expanded_tags.add(tag)
@@ -3194,98 +3133,6 @@ def build_book_tags(
return deduped
def fetch_openlibrary_metadata_tags(isbn: Optional[str] = None, olid: Optional[str] = None) -> List[str]:
"""Fetch book metadata from OpenLibrary and return as tags.
Args:
isbn: ISBN number (with or without isbn: prefix)
olid: OpenLibrary ID
Returns:
List of tags extracted from OpenLibrary metadata
"""
metadata_tags = []
# Try OLID first (preferred), then ISBN
url = None
if olid:
# Clean up OLID format
olid_clean = str(olid).replace('OL', '').replace('M', '').replace('W', '')
if olid_clean.isdigit():
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
else:
url = f"https://openlibrary.org/books/{olid}.json"
elif isbn:
# Clean up ISBN
isbn_clean = str(isbn).replace('isbn:', '').strip()
url = f"https://openlibrary.org/isbn/{isbn_clean}.json"
if not url:
return metadata_tags
try:
response = requests.get(url, timeout=10)
if response.status_code != 200:
return metadata_tags
data = response.json()
if not data:
return metadata_tags
# Extract title
if 'title' in data:
metadata_tags.append(f"title:{data['title']}")
# Extract authors
if 'authors' in data and isinstance(data['authors'], list):
for author in data['authors'][:3]:
if isinstance(author, dict) and 'name' in author:
metadata_tags.append(f"author:{author['name']}")
elif isinstance(author, str):
metadata_tags.append(f"author:{author}")
# Extract publish date
if 'publish_date' in data:
metadata_tags.append(f"publish_date:{data['publish_date']}")
# Extract publishers
if 'publishers' in data and isinstance(data['publishers'], list):
for pub in data['publishers'][:1]:
if isinstance(pub, dict) and 'name' in pub:
metadata_tags.append(f"publisher:{pub['name']}")
elif isinstance(pub, str):
metadata_tags.append(f"publisher:{pub}")
# Extract number of pages
if 'number_of_pages' in data:
page_count = data['number_of_pages']
if page_count and isinstance(page_count, int) and page_count > 0:
metadata_tags.append(f"pages:{page_count}")
# Extract language
if 'languages' in data and isinstance(data['languages'], list) and data['languages']:
lang = data['languages'][0]
if isinstance(lang, dict) and 'key' in lang:
lang_code = lang['key'].split('/')[-1]
metadata_tags.append(f"language:{lang_code}")
elif isinstance(lang, str):
metadata_tags.append(f"language:{lang}")
# Extract subjects as freeform tags (limit to 5)
if 'subjects' in data and isinstance(data['subjects'], list):
for subject in data['subjects'][:5]:
if subject and isinstance(subject, str):
subject_clean = str(subject).strip()
if subject_clean:
metadata_tags.append(subject_clean)
except Exception as e:
debug(f"⚠ Failed to fetch OpenLibrary metadata: {e}")
return metadata_tags
def enrich_playlist_entries(entries: list, extractor: str) -> list:
"""Enrich playlist entries with full metadata by fetching individual entry info.
@@ -3312,7 +3159,7 @@ def enrich_playlist_entries(entries: list, extractor: str) -> list:
if entry_url and is_url_supported_by_ytdlp(entry_url):
try:
import yt_dlp
ydl_opts = {
ydl_opts: Any = {
"quiet": True,
"no_warnings": True,
"skip_download": True,
@@ -3690,294 +3537,3 @@ def extract_url_formats(formats: list) -> List[Tuple[str, str]]:
return []
def scrape_isbn_metadata(isbn: str) -> List[str]:
"""Scrape metadata for an ISBN using Open Library API."""
new_tags = []
try:
from API.HTTP import HTTPClient
import json as json_module
isbn_clean = isbn.replace('-', '').strip()
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode('utf-8'))
except Exception as e:
log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
return []
if not data:
log(f"No ISBN metadata found for: {isbn}")
return []
book_data = next(iter(data.values()), None)
if not book_data:
return []
if 'title' in book_data:
new_tags.append(f"title:{book_data['title']}")
if 'authors' in book_data and isinstance(book_data['authors'], list):
for author in book_data['authors'][:3]:
if 'name' in author:
new_tags.append(f"author:{author['name']}")
if 'publish_date' in book_data:
new_tags.append(f"publish_date:{book_data['publish_date']}")
if 'publishers' in book_data and isinstance(book_data['publishers'], list):
for pub in book_data['publishers'][:1]:
if 'name' in pub:
new_tags.append(f"publisher:{pub['name']}")
if 'description' in book_data:
desc = book_data['description']
if isinstance(desc, dict) and 'value' in desc:
desc = desc['value']
if desc:
desc_str = str(desc).strip()
# Include description if available (limit to 200 chars to keep it manageable)
if len(desc_str) > 0:
new_tags.append(f"description:{desc_str[:200]}")
if 'number_of_pages' in book_data:
page_count = book_data['number_of_pages']
if page_count and isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict):
identifiers = book_data['identifiers']
if 'openlibrary' in identifiers:
ol_ids = identifiers['openlibrary']
if isinstance(ol_ids, list) and ol_ids:
new_tags.append(f"openlibrary:{ol_ids[0]}")
elif isinstance(ol_ids, str):
new_tags.append(f"openlibrary:{ol_ids}")
if 'lccn' in identifiers:
lccn_list = identifiers['lccn']
if isinstance(lccn_list, list) and lccn_list:
new_tags.append(f"lccn:{lccn_list[0]}")
elif isinstance(lccn_list, str):
new_tags.append(f"lccn:{lccn_list}")
if 'oclc' in identifiers:
oclc_list = identifiers['oclc']
if isinstance(oclc_list, list) and oclc_list:
new_tags.append(f"oclc:{oclc_list[0]}")
elif isinstance(oclc_list, str):
new_tags.append(f"oclc:{oclc_list}")
if 'goodreads' in identifiers:
goodreads_list = identifiers['goodreads']
if isinstance(goodreads_list, list) and goodreads_list:
new_tags.append(f"goodreads:{goodreads_list[0]}")
elif isinstance(goodreads_list, str):
new_tags.append(f"goodreads:{goodreads_list}")
if 'librarything' in identifiers:
lt_list = identifiers['librarything']
if isinstance(lt_list, list) and lt_list:
new_tags.append(f"librarything:{lt_list[0]}")
elif isinstance(lt_list, str):
new_tags.append(f"librarything:{lt_list}")
if 'doi' in identifiers:
doi_list = identifiers['doi']
if isinstance(doi_list, list) and doi_list:
new_tags.append(f"doi:{doi_list[0]}")
elif isinstance(doi_list, str):
new_tags.append(f"doi:{doi_list}")
if 'internet_archive' in identifiers:
ia_list = identifiers['internet_archive']
if isinstance(ia_list, list) and ia_list:
new_tags.append(f"internet_archive:{ia_list[0]}")
elif isinstance(ia_list, str):
new_tags.append(f"internet_archive:{ia_list}")
log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
return new_tags
except Exception as e:
log(f"ISBN scraping error: {e}", file=sys.stderr)
return []
def scrape_openlibrary_metadata(olid: str) -> List[str]:
"""Scrape metadata for an OpenLibrary ID using the .json API endpoint.
Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
- Title, authors, publish date, publishers
- Description
- Subjects as freeform tags (without namespace prefix)
- Identifiers (ISBN, LCCN, OCLC, etc.)
"""
new_tags = []
try:
from API.HTTP import HTTPClient
import json as json_module
# Format: OL9674499M or just 9674499M
olid_clean = olid.replace('OL', '').replace('M', '')
if not olid_clean.isdigit():
olid_clean = olid
# Ensure we have the full OLID format for the URL
if not olid.startswith('OL'):
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
else:
url = f"https://openlibrary.org/books/{olid}.json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode('utf-8'))
except Exception as e:
log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
return []
if not data:
log(f"No OpenLibrary metadata found for: {olid}")
return []
# Add title
if 'title' in data:
new_tags.append(f"title:{data['title']}")
# Add authors
if 'authors' in data and isinstance(data['authors'], list):
for author in data['authors'][:3]:
if isinstance(author, dict) and 'name' in author:
new_tags.append(f"author:{author['name']}")
elif isinstance(author, str):
new_tags.append(f"author:{author}")
# Add publish date
if 'publish_date' in data:
new_tags.append(f"publish_date:{data['publish_date']}")
# Add publishers
if 'publishers' in data and isinstance(data['publishers'], list):
for pub in data['publishers'][:1]:
if isinstance(pub, dict) and 'name' in pub:
new_tags.append(f"publisher:{pub['name']}")
elif isinstance(pub, str):
new_tags.append(f"publisher:{pub}")
# Add description
if 'description' in data:
desc = data['description']
if isinstance(desc, dict) and 'value' in desc:
desc = desc['value']
if desc:
desc_str = str(desc).strip()
if len(desc_str) > 0:
new_tags.append(f"description:{desc_str[:200]}")
# Add number of pages
if 'number_of_pages' in data:
page_count = data['number_of_pages']
if page_count and isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
# Add subjects as FREEFORM tags (no namespace prefix)
if 'subjects' in data and isinstance(data['subjects'], list):
for subject in data['subjects'][:10]:
if subject and isinstance(subject, str):
subject_clean = str(subject).strip()
if subject_clean and subject_clean not in new_tags:
new_tags.append(subject_clean)
# Add identifiers
if 'identifiers' in data and isinstance(data['identifiers'], dict):
identifiers = data['identifiers']
if 'isbn_10' in identifiers:
isbn_10_list = identifiers['isbn_10']
if isinstance(isbn_10_list, list) and isbn_10_list:
new_tags.append(f"isbn_10:{isbn_10_list[0]}")
elif isinstance(isbn_10_list, str):
new_tags.append(f"isbn_10:{isbn_10_list}")
if 'isbn_13' in identifiers:
isbn_13_list = identifiers['isbn_13']
if isinstance(isbn_13_list, list) and isbn_13_list:
new_tags.append(f"isbn_13:{isbn_13_list[0]}")
elif isinstance(isbn_13_list, str):
new_tags.append(f"isbn_13:{isbn_13_list}")
if 'lccn' in identifiers:
lccn_list = identifiers['lccn']
if isinstance(lccn_list, list) and lccn_list:
new_tags.append(f"lccn:{lccn_list[0]}")
elif isinstance(lccn_list, str):
new_tags.append(f"lccn:{lccn_list}")
if 'oclc_numbers' in identifiers:
oclc_list = identifiers['oclc_numbers']
if isinstance(oclc_list, list) and oclc_list:
new_tags.append(f"oclc:{oclc_list[0]}")
elif isinstance(oclc_list, str):
new_tags.append(f"oclc:{oclc_list}")
if 'goodreads' in identifiers:
goodreads_list = identifiers['goodreads']
if isinstance(goodreads_list, list) and goodreads_list:
new_tags.append(f"goodreads:{goodreads_list[0]}")
elif isinstance(goodreads_list, str):
new_tags.append(f"goodreads:{goodreads_list}")
log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
return new_tags
except Exception as e:
log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
return []
def perform_metadata_scraping(tags_list: List[str]) -> List[str]:
"""Perform scraping based on identifiers in tags.
Priority order:
1. openlibrary: (preferred - more complete metadata)
2. isbn_10 or isbn (fallback)
"""
identifiers = extract_scrapable_identifiers(tags_list)
if not identifiers:
log("No scrapable identifiers found (openlibrary, ISBN, musicbrainz, imdb)")
return []
log(f"Found scrapable identifiers: {', '.join(identifiers.keys())}")
new_tags = []
# Prefer OpenLibrary over ISBN (more complete metadata)
if 'openlibrary' in identifiers:
olid = identifiers['openlibrary']
if olid:
log(f"Scraping OpenLibrary: {olid}")
new_tags.extend(scrape_openlibrary_metadata(olid))
elif 'isbn_13' in identifiers or 'isbn_10' in identifiers or 'isbn' in identifiers:
isbn = identifiers.get('isbn_13') or identifiers.get('isbn_10') or identifiers.get('isbn')
if isbn:
log(f"Scraping ISBN: {isbn}")
new_tags.extend(scrape_isbn_metadata(isbn))
existing_tags_lower = {tag.lower() for tag in tags_list}
scraped_unique = []
seen = set()
for tag in new_tags:
tag_lower = tag.lower()
if tag_lower not in existing_tags_lower and tag_lower not in seen:
scraped_unique.append(tag)
seen.add(tag_lower)
if scraped_unique:
log(f"Added {len(scraped_unique)} new tag(s) from scraping")
return scraped_unique

View File

@@ -150,6 +150,35 @@ class PipeObject:
# Truncate key if needed
key_display = key if len(key) <= 15 else key[:12] + "..."
debug(f"{key_display:<15}: {val_display:<42}")
# If we have structured provider metadata, expand it for debugging.
full_md = self.extra.get("full_metadata")
if isinstance(full_md, dict) and full_md:
debug("├─────────────────────────────────────────────────────────────┤")
debug("│ full_metadata: │")
for md_key in sorted(full_md.keys(), key=lambda x: str(x)):
md_val = full_md.get(md_key)
if isinstance(md_val, (str, int, float)) or md_val is None or isinstance(md_val, bool):
md_display = str(md_val)
elif isinstance(md_val, list):
if len(md_val) <= 6 and all(isinstance(x, (str, int, float, bool)) or x is None for x in md_val):
md_display = "[" + ", ".join(str(x) for x in md_val) + "]"
else:
md_display = f"list({len(md_val)})"
elif isinstance(md_val, dict):
# Avoid dumping huge nested dicts (like raw provider docs).
keys = list(md_val.keys())
preview = ",".join(str(k) for k in keys[:6])
md_display = f"dict({len(keys)})[{preview}{',...' if len(keys) > 6 else ''}]"
else:
md_str = str(md_val)
md_display = md_str if len(md_str) <= 40 else md_str[:37] + "..."
md_key_display = str(md_key)
md_key_display = md_key_display if len(md_key_display) <= 15 else md_key_display[:12] + "..."
if len(md_display) > 42:
md_display = md_display[:39] + "..."
debug(f"{md_key_display:<15}: {md_display:<42}")
if self.action:
debug("├─────────────────────────────────────────────────────────────┤")

View File

@@ -575,7 +575,12 @@ def restore_previous_result_table() -> bool:
_DISPLAY_ITEMS = []
_DISPLAY_TABLE = None
_DISPLAY_SUBJECT = None
return True
# If an underlying table exists, we're done.
# Otherwise, fall through to history restore so @.. actually returns to the last table.
if _LAST_RESULT_TABLE is not None:
return True
if not _RESULT_TABLE_HISTORY:
return True
if not _RESULT_TABLE_HISTORY:
return False
@@ -613,7 +618,12 @@ def restore_next_result_table() -> bool:
_DISPLAY_ITEMS = []
_DISPLAY_TABLE = None
_DISPLAY_SUBJECT = None
return True
# If an underlying table exists, we're done.
# Otherwise, fall through to forward restore when available.
if _LAST_RESULT_TABLE is not None:
return True
if not _RESULT_TABLE_FORWARD:
return True
if not _RESULT_TABLE_FORWARD:
return False

test-login.py (new file, 336 lines added)
View File

@@ -0,0 +1,336 @@
import requests
import random, string
from concurrent import futures
from tqdm import tqdm
import time
from datetime import datetime
import argparse
import os
import sys
import shutil
import json
import re
import base64
import hashlib
from Crypto.Cipher import AES
from Crypto.Util import Counter
def display_error(response, message):
print(message)
print(response)
print(response.text)
exit()
def get_book_infos(session, url):
r = session.get(url).text
infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&")
response = session.get(infos_url)
data = response.json()['data']
title = data['brOptions']['bookTitle'].strip().replace(" ", "_")
title = ''.join( c for c in title if c not in '<>:"/\\|?*' ) # Filter forbidden chars in directory names (Windows & Linux)
title = title[:150] # Trim the title to avoid long file names
metadata = data['metadata']
links = []
for item in data['brOptions']['data']:
for page in item:
links.append(page['uri'])
if len(links) > 1:
print(f"[+] Found {len(links)} pages")
return title, links, metadata
else:
print(f"[-] Error while getting image links")
exit()
def login(email, password):
session = requests.Session()
response = session.get("https://archive.org/services/account/login/")
login_data = response.json()
if not login_data['success']:
display_error(response, "[-] Error while getting login token:")
login_token = login_data["value"]["token"]
headers = {"Content-Type": "application/x-www-form-urlencoded"}
data = {"username":email, "password":password, "t": login_token}
response = session.post("https://archive.org/services/account/login/", headers=headers, data=json.dumps(data))
try:
response_json = response.json()
except:
display_error(response, "[-] Error while login:")
if response_json["success"] == False:
if response_json["value"] == "bad_login":
print("[-] Invalid credentials!")
exit()
display_error(response, "[-] Error while login:")
else:
print("[+] Successful login")
return session
def loan(session, book_id, verbose=True):
data = {
"action": "grant_access",
"identifier": book_id
}
response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data)
data['action'] = "browse_book"
response = session.post("https://archive.org/services/loans/loan/", data=data)
if response.status_code == 400 :
try:
if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
print("This book doesn't need to be borrowed")
return session
else :
display_error(response, "Something went wrong when trying to borrow the book.")
except: # The response is not in JSON format
display_error(response, "The book cannot be borrowed")
data['action'] = "create_token"
response = session.post("https://archive.org/services/loans/loan/", data=data)
if "token" in response.text:
if verbose:
print("[+] Successful loan")
return session
else:
display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
def return_loan(session, book_id):
data = {
"action": "return_loan",
"identifier": book_id
}
response = session.post("https://archive.org/services/loans/loan/", data=data)
if response.status_code == 200 and response.json()["success"]:
print("[+] Book returned")
else:
display_error(response, "Something went wrong when trying to return the book")
def image_name(pages, page, directory):
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
def deobfuscate_image(image_data, link, obf_header):
"""
@Author: https://github.com/justimm
Decrypts the first 1024 bytes of image_data using AES-CTR.
The obfuscation_header is expected in the form "1|<base64encoded_counter>"
where the base64-decoded counter is 16 bytes.
We derive the AES key by taking the SHA-1 digest of the image URL (with protocol/host removed)
and using the first 16 bytes.
For AES-CTR, we use a 16-byte counter block. The first 8 bytes are used as a fixed prefix,
and the remaining 8 bytes (interpreted as a big-endian integer) are used as the initial counter value.
"""
try:
version, counter_b64 = obf_header.split('|')
except Exception as e:
raise ValueError("Invalid X-Obfuscate header format") from e
if version != '1':
raise ValueError("Unsupported obfuscation version: " + version)
# Derive AES key: replace protocol/host in link with '/'
aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link)
sha1_digest = hashlib.sha1(aesKey.encode('utf-8')).digest()
key = sha1_digest[:16]
# Decode the counter (should be 16 bytes)
counter_bytes = base64.b64decode(counter_b64)
if len(counter_bytes) != 16:
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
prefix = counter_bytes[:8]
initial_value = int.from_bytes(counter_bytes[8:], byteorder='big')
# Create AES-CTR cipher with a 64-bit counter length.
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False)
cipher = AES.new(key, AES.MODE_CTR, counter=ctr)
decrypted_part = cipher.decrypt(image_data[:1024])
new_data = decrypted_part + image_data[1024:]
return new_data
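As a small, self-contained illustration of the header/counter layout described in the docstring (bytes below are made up, not real Archive.org values):
import base64

counter_bytes = bytes(range(16))                           # illustrative 16-byte counter
obf_header = "1|" + base64.b64encode(counter_bytes).decode()
prefix = counter_bytes[:8]                                 # fixed 64-bit prefix for AES-CTR
initial_value = int.from_bytes(counter_bytes[8:], "big")   # starting value of the 64-bit counter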
def download_one_image(session, link, i, directory, book_id, pages):
headers = {
"Referer": "https://archive.org/",
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
"Sec-Fetch-Site": "same-site",
"Sec-Fetch-Mode": "no-cors",
"Sec-Fetch-Dest": "image",
}
retry = True
response = None
while retry:
try:
response = session.get(link, headers=headers)
if response.status_code == 403:
session = loan(session, book_id, verbose=False)
raise Exception("Borrow again")
elif response.status_code == 200:
retry = False
except:
time.sleep(1) # Wait 1 second before retrying
image = image_name(pages, i, directory)
obf_header = response.headers.get("X-Obfuscate")
image_content = None
if obf_header:
try:
image_content = deobfuscate_image(response.content, link, obf_header)
except Exception as e:
print(f"[ERROR] Deobfuscation failed: {e}")
return
else:
image_content = response.content
with open(image, "wb") as f:
f.write(image_content)
def download(session, n_threads, directory, links, scale, book_id):
print("Downloading pages...")
links = [f"{link}&rotate=0&scale={scale}" for link in links]
pages = len(links)
tasks = []
with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
for i, link in enumerate(links):
# enumerate avoids a repeated list.index scan and keeps duplicate links on distinct page indexes
tasks.append(executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages))
for task in tqdm(futures.as_completed(tasks), total=len(tasks)):
pass
images = [image_name(pages, i, directory) for i in range(len(links))]
return images
def make_pdf(pdf, title, directory):
file = title+".pdf"
# Handle the case where multiple books with the same name are downloaded
i = 1
while os.path.isfile(os.path.join(directory, file)):
file = f"{title}({i}).pdf"
i += 1
with open(os.path.join(directory, file),"wb") as f:
f.write(pdf)
print(f"[+] PDF saved as \"{file}\"")
if __name__ == "__main__":
my_parser = argparse.ArgumentParser()
my_parser.add_argument('-e', '--email', help='Your archive.org email', type=str, required=True)
my_parser.add_argument('-p', '--password', help='Your archive.org password', type=str, required=True)
my_parser.add_argument('-u', '--url', help='Link to the book (https://archive.org/details/XXXX). You can use this argument several times to download multiple books', action='append', type=str)
my_parser.add_argument('-d', '--dir', help='Output directory', type=str)
my_parser.add_argument('-f', '--file', help='File where are stored the URLs of the books to download', type=str)
my_parser.add_argument('-r', '--resolution', help='Image resolution (10 to 0, 0 is the highest), [default 3]', type=int, default=3)
my_parser.add_argument('-t', '--threads', help="Maximum number of threads, [default 50]", type=int, default=50)
my_parser.add_argument('-j', '--jpg', help="Output to individual JPG's rather than a PDF", action='store_true')
my_parser.add_argument('-m', '--meta', help="Output the metadata of the book to a json file (-j option required)", action='store_true')
if len(sys.argv) == 1:
my_parser.print_help(sys.stderr)
sys.exit(1)
args = my_parser.parse_args()
if args.url is None and args.file is None:
my_parser.error("At least one of --url and --file required")
email = args.email
password = args.password
scale = args.resolution
n_threads = args.threads
d = args.dir
if d is None:
d = os.getcwd()
elif not os.path.isdir(d):
print(f"Output directory does not exist!")
exit()
if args.url is not None:
urls = args.url
else:
if os.path.exists(args.file):
with open(args.file) as f:
urls = f.read().strip().split("\n")
else:
print(f"{args.file} does not exist!")
exit()
# Check the urls format
for url in urls:
if not url.startswith("https://archive.org/details/"):
print(f"{url} --> Invalid url. URL must starts with \"https://archive.org/details/\"")
exit()
print(f"{len(urls)} Book(s) to download")
session = login(email, password)
for url in urls:
book_id = list(filter(None, url.split("/")))[3]
print("="*40)
print(f"Current book: https://archive.org/details/{book_id}")
session = loan(session, book_id)
title, links, metadata = get_book_infos(session, url)
directory = os.path.join(d, title)
# Handle the case where multiple books with the same name are downloaded
i = 1
_directory = directory
while os.path.isdir(directory):
directory = f"{_directory}({i})"
i += 1
os.makedirs(directory)
if args.meta:
print("Writing metadata.json...")
with open(f"{directory}/metadata.json",'w') as f:
json.dump(metadata,f)
images = download(session, n_threads, directory, links, scale, book_id)
if not args.jpg: # Create pdf with images and remove the images folder
import img2pdf
# prepare PDF metadata
# sometimes archive metadata is missing
pdfmeta = { }
# ensure metadata are str
for key in ["title", "creator", "associated-names"]:
if key in metadata:
if isinstance(metadata[key], str):
pass
elif isinstance(metadata[key], list):
metadata[key] = "; ".join(metadata[key])
else:
raise Exception("unsupported metadata type")
# title
if 'title' in metadata:
pdfmeta['title'] = metadata['title']
# author
if 'creator' in metadata and 'associated-names' in metadata:
pdfmeta['author'] = metadata['creator'] + "; " + metadata['associated-names']
elif 'creator' in metadata:
pdfmeta['author'] = metadata['creator']
elif 'associated-names' in metadata:
pdfmeta['author'] = metadata['associated-names']
# date
if 'date' in metadata:
try:
pdfmeta['creationdate'] = datetime.strptime(metadata['date'][0:4], '%Y')
except:
pass
# keywords
pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"]
pdf = img2pdf.convert(images, **pdfmeta)
make_pdf(pdf, title, args.dir if args.dir is not None else "")
try:
shutil.rmtree(directory)
except OSError as e:
print ("Error: %s - %s." % (e.filename, e.strerror))
return_loan(session, book_id)
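For reference, typical invocations of this script (credentials, identifier, and paths are placeholders; the output directory must already exist):
python test-login.py -e you@example.com -p "password" -u https://archive.org/details/SOME_IDENTIFIER -r 3 -t 10 -d ./out
python test-login.py -e you@example.com -p "password" -f books.txt -j -m   # URLs from a file; keep JPGs plus metadata.json instead of a PDF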