dfdfsdd
@@ -12,6 +12,7 @@ import sys
import time

from SYS.logger import log, debug
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS as GLOBAL_SUPPORTED_EXTENSIONS
import tempfile
import logging
from dataclasses import dataclass, field
@@ -1103,9 +1104,7 @@ SUPPORTED_FILETYPES = {
}

# Flatten to get all supported extensions
ALL_SUPPORTED_EXTENSIONS = set()
for category_extensions in SUPPORTED_FILETYPES.values():
    ALL_SUPPORTED_EXTENSIONS.update(category_extensions.keys())
ALL_SUPPORTED_EXTENSIONS = set(GLOBAL_SUPPORTED_EXTENSIONS)


# Global Hydrus client cache to reuse session keys
@@ -1,584 +0,0 @@
"""Archive.org API client for borrowing and downloading books.

This module provides low-level functions for interacting with Archive.org:
- Authentication (login, credential management)
- Borrowing (loan, return_loan)
- Book metadata extraction (get_book_infos, get_book_metadata)
- Image downloading and deobfuscation
- PDF creation with metadata

Used by Provider/openlibrary.py for the borrowing workflow.
"""
from __future__ import annotations

import base64
import hashlib
import logging
import os
import re
import sys
import time
from concurrent import futures
from typing import Any, Dict, List, Optional, Sequence, Tuple

import requests

from SYS.logger import log, debug

try:
    from Crypto.Cipher import AES  # type: ignore
    from Crypto.Util import Counter  # type: ignore
except ImportError:
    AES = None  # type: ignore
    Counter = None  # type: ignore

try:
    from tqdm import tqdm  # type: ignore
except ImportError:
    tqdm = None  # type: ignore

def credential_openlibrary(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""Get OpenLibrary/Archive.org email and password from config.
|
||||
|
||||
Supports both formats:
|
||||
- New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}}
|
||||
- Old: {"Archive": {"email": "...", "password": "..."}}
|
||||
{"archive_org_email": "...", "archive_org_password": "..."}
|
||||
|
||||
Returns: (email, password) tuple, each can be None
|
||||
"""
|
||||
if not isinstance(config, dict):
|
||||
return None, None
|
||||
|
||||
# Try new format first
|
||||
provider_config = config.get("provider", {})
|
||||
if isinstance(provider_config, dict):
|
||||
openlibrary_config = provider_config.get("openlibrary", {})
|
||||
if isinstance(openlibrary_config, dict):
|
||||
email = openlibrary_config.get("email")
|
||||
password = openlibrary_config.get("password")
|
||||
if email or password:
|
||||
return email, password
|
||||
|
||||
# Try old nested format
|
||||
archive_config = config.get("Archive")
|
||||
if isinstance(archive_config, dict):
|
||||
email = archive_config.get("email")
|
||||
password = archive_config.get("password")
|
||||
if email or password:
|
||||
return email, password
|
||||
|
||||
# Fall back to old flat format
|
||||
email = config.get("archive_org_email")
|
||||
password = config.get("archive_org_password")
|
||||
return email, password
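
# Illustrative sketch (added for clarity; not part of the original module). All three
# accepted config shapes resolve to the same (email, password) pair; the literal
# values here are placeholders.
def _example_credential_shapes() -> None:
    new_style = {"provider": {"openlibrary": {"email": "user@example.com", "password": "hunter2"}}}
    old_nested = {"Archive": {"email": "user@example.com", "password": "hunter2"}}
    old_flat = {"archive_org_email": "user@example.com", "archive_org_password": "hunter2"}
    for cfg in (new_style, old_nested, old_flat):
        assert credential_openlibrary(cfg) == ("user@example.com", "hunter2")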
|
||||
|
||||
|
||||
class BookNotAvailableError(Exception):
|
||||
"""Raised when a book is not available for borrowing (waitlisted/in use)."""
|
||||
pass
|
||||
|
||||
|
||||
def display_error(response: requests.Response, message: str) -> None:
|
||||
"""Display error and exit."""
|
||||
log(message, file=sys.stderr)
|
||||
log(response.text, file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def login(email: str, password: str) -> requests.Session:
|
||||
"""Login to archive.org.
|
||||
|
||||
Args:
|
||||
email: Archive.org email
|
||||
password: Archive.org password
|
||||
|
||||
Returns:
|
||||
Authenticated requests.Session
|
||||
|
||||
Raises:
|
||||
SystemExit on login failure
|
||||
"""
|
||||
session = requests.Session()
|
||||
session.get("https://archive.org/account/login", timeout=30)
|
||||
|
||||
data = {"username": email, "password": password}
|
||||
response = session.post("https://archive.org/account/login", data=data, timeout=30)
|
||||
|
||||
if "bad_login" in response.text:
|
||||
log("Invalid credentials!", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
if "Successful login" in response.text:
|
||||
debug("Successful login")
|
||||
return session
|
||||
display_error(response, "[-] Error while login:")
|
||||
sys.exit(1) # Unreachable but satisfies type checker
|
||||
|
||||
|
||||
def loan(session: requests.Session, book_id: str, verbose: bool = True) -> requests.Session:
|
||||
"""Borrow a book from archive.org (14-day loan).
|
||||
|
||||
Args:
|
||||
session: Authenticated requests.Session from login()
|
||||
book_id: Archive.org book identifier (e.g., 'ia_book_id')
|
||||
verbose: Whether to log messages
|
||||
|
||||
Returns:
|
||||
Session with active loan
|
||||
|
||||
Raises:
|
||||
SystemExit on loan failure
|
||||
"""
|
||||
data = {"action": "grant_access", "identifier": book_id}
|
||||
response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30)
|
||||
data["action"] = "browse_book"
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||
|
||||
if response.status_code == 400:
|
||||
try:
|
||||
if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
|
||||
debug("Book is not available for borrowing (waitlisted or in use)")
|
||||
raise BookNotAvailableError("Book is waitlisted or in use")
|
||||
display_error(response, "Something went wrong when trying to borrow the book.")
|
||||
except BookNotAvailableError:
|
||||
raise
|
||||
except Exception:
|
||||
display_error(response, "The book cannot be borrowed")
|
||||
|
||||
data["action"] = "create_token"
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||
|
||||
if "token" in response.text:
|
||||
if verbose:
|
||||
debug("Successful loan")
|
||||
return session
|
||||
display_error(response, "Something went wrong when trying to borrow the book.")
|
||||
sys.exit(1) # Unreachable but satisfies type checker
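
# Illustrative sketch (added for clarity; not part of the original module): a minimal
# borrow attempt with the waitlist case handled. The book identifier is a placeholder.
def _example_borrow(email: str, password: str) -> None:
    session = login(email, password)
    try:
        session = loan(session, "examplebookidentifier00doe")
        debug("Loan acquired")
    except BookNotAvailableError:
        log("Book is waitlisted or in use; try again later", file=sys.stderr)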
|
||||
|
||||
|
||||
def return_loan(session: requests.Session, book_id: str) -> None:
|
||||
"""Return a borrowed book.
|
||||
|
||||
Args:
|
||||
session: Authenticated requests.Session with active loan
|
||||
book_id: Archive.org book identifier
|
||||
"""
|
||||
data = {"action": "return_loan", "identifier": book_id}
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||
if response.status_code == 200 and response.json()["success"]:
|
||||
debug("Book returned")
|
||||
else:
|
||||
display_error(response, "Something went wrong when trying to return the book")
|
||||
|
||||
|
||||
def get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
|
||||
"""Extract book information and page links from archive.org viewer.
|
||||
|
||||
Args:
|
||||
session: Authenticated requests.Session
|
||||
url: Book URL (e.g., https://archive.org/borrow/book_id or /details/book_id)
|
||||
|
||||
Returns:
|
||||
Tuple of (title, page_links, metadata)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If page data cannot be extracted
|
||||
"""
|
||||
r = session.get(url, timeout=30).text
|
||||
|
||||
# Try to extract the infos URL from the response
|
||||
try:
|
||||
# Look for the "url" field in the response using regex
|
||||
# Matches "url":"//archive.org/..."
|
||||
import re
|
||||
match = re.search(r'"url"\s*:\s*"([^"]+)"', r)
|
||||
if not match:
|
||||
raise ValueError("No 'url' field found in response")
|
||||
|
||||
url_path = match.group(1)
|
||||
if url_path.startswith("//"):
|
||||
infos_url = "https:" + url_path
|
||||
else:
|
||||
infos_url = url_path
|
||||
|
||||
infos_url = infos_url.replace("\\u0026", "&")
|
||||
except (IndexError, ValueError, AttributeError) as e:
|
||||
# If URL extraction fails, raise with better error message
|
||||
raise RuntimeError(f"Failed to extract book info URL from response: {e}")
|
||||
|
||||
response = session.get(infos_url, timeout=30)
|
||||
data = response.json()["data"]
|
||||
title = data["brOptions"]["bookTitle"].strip().replace(" ", "_")
|
||||
title = "".join(c for c in title if c not in '<>:"/\\|?*') # Filter forbidden chars
|
||||
title = title[:150] # Trim to avoid long file names
|
||||
metadata = data["metadata"]
|
||||
links = []
|
||||
|
||||
# Safely extract page links from brOptions data
|
||||
try:
|
||||
br_data = data.get("brOptions", {}).get("data", [])
|
||||
for item in br_data:
|
||||
if isinstance(item, list):
|
||||
for page in item:
|
||||
if isinstance(page, dict) and "uri" in page:
|
||||
links.append(page["uri"])
|
||||
elif isinstance(item, dict) and "uri" in item:
|
||||
links.append(item["uri"])
|
||||
except (KeyError, IndexError, TypeError) as e:
|
||||
log(f"Warning: Error parsing page links: {e}", file=sys.stderr)
|
||||
# Continue with whatever links we found
|
||||
|
||||
if len(links) > 1:
|
||||
debug(f"Found {len(links)} pages")
|
||||
return title, links, metadata
|
||||
elif len(links) == 1:
|
||||
debug(f"Found {len(links)} page")
|
||||
return title, links, metadata
|
||||
else:
|
||||
log("Error while getting image links - no pages found", file=sys.stderr)
|
||||
raise RuntimeError("No pages found in book data")
|
||||
|
||||
|
||||
def image_name(pages: int, page: int, directory: str) -> str:
|
||||
"""Generate image filename for page.
|
||||
|
||||
Args:
|
||||
pages: Total number of pages
|
||||
page: Current page number (0-indexed)
|
||||
directory: Directory to save to
|
||||
|
||||
Returns:
|
||||
Full path to image file
|
||||
"""
|
||||
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
|
||||
|
||||
|
||||
def deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes:
|
||||
"""Decrypt obfuscated image data using AES-CTR.
|
||||
|
||||
This handles Archive.org's image obfuscation for borrowed books.
|
||||
Based on: https://github.com/justimm
|
||||
|
||||
Args:
|
||||
image_data: Encrypted image bytes
|
||||
link: Image URL (used to derive AES key)
|
||||
obf_header: X-Obfuscate header value (format: "1|BASE64_COUNTER")
|
||||
|
||||
Returns:
|
||||
Decrypted image bytes
|
||||
"""
|
||||
if not AES or not Counter:
|
||||
raise RuntimeError("Crypto library not available")
|
||||
|
||||
try:
|
||||
version, counter_b64 = obf_header.split("|")
|
||||
except Exception as e:
|
||||
raise ValueError("Invalid X-Obfuscate header format") from e
|
||||
|
||||
if version != "1":
|
||||
raise ValueError("Unsupported obfuscation version: " + version)
|
||||
|
||||
# Derive AES key from URL
|
||||
aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link)
|
||||
sha1_digest = hashlib.sha1(aesKey.encode("utf-8")).digest()
|
||||
key = sha1_digest[:16]
|
||||
|
||||
# Decode counter
|
||||
counter_bytes = base64.b64decode(counter_b64)
|
||||
if len(counter_bytes) != 16:
|
||||
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
|
||||
|
||||
prefix = counter_bytes[:8]
|
||||
initial_value = int.from_bytes(counter_bytes[8:], byteorder="big")
|
||||
|
||||
# Create AES-CTR cipher
|
||||
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore
|
||||
cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore
|
||||
|
||||
decrypted_part = cipher.decrypt(image_data[:1024])
|
||||
new_data = decrypted_part + image_data[1024:]
|
||||
return new_data
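
# Illustrative sketch (added for clarity; not part of the original module): how the
# AES-CTR parameters used above are derived. The key is the first 16 bytes of the
# SHA-1 of the URL path, and the 16-byte counter from the X-Obfuscate header splits
# into an 8-byte prefix and an 8-byte big-endian initial value; only the first
# 1024 bytes of the image are actually decrypted.
def _example_obfuscation_params(link: str, obf_header: str) -> Tuple[bytes, bytes, int]:
    _version, counter_b64 = obf_header.split("|")
    path = re.sub(r"^https?:\/\/.*?\/", "/", link)
    key = hashlib.sha1(path.encode("utf-8")).digest()[:16]
    counter_bytes = base64.b64decode(counter_b64)
    prefix, initial_value = counter_bytes[:8], int.from_bytes(counter_bytes[8:], byteorder="big")
    return key, prefix, initial_value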
|
||||
|
||||
|
||||
def download_one_image(
|
||||
session: requests.Session,
|
||||
link: str,
|
||||
i: int,
|
||||
directory: str,
|
||||
book_id: str,
|
||||
pages: int,
|
||||
) -> None:
|
||||
"""Download a single book page image.
|
||||
|
||||
Handles obfuscated images and re-borrowing on 403 errors.
|
||||
|
||||
Args:
|
||||
session: Authenticated requests.Session
|
||||
link: Direct image URL
|
||||
i: Page index (0-based)
|
||||
directory: Directory to save to
|
||||
book_id: Archive.org book ID (for re-borrowing on 403)
|
||||
pages: Total number of pages
|
||||
"""
|
||||
headers = {
|
||||
"Referer": "https://archive.org/",
|
||||
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
|
||||
"Sec-Fetch-Site": "same-site",
|
||||
"Sec-Fetch-Mode": "no-cors",
|
||||
"Sec-Fetch-Dest": "image",
|
||||
}
|
||||
retry = True
|
||||
response = None
|
||||
while retry:
|
||||
try:
|
||||
response = session.get(link, headers=headers, timeout=30)
|
||||
if response.status_code == 403:
|
||||
session = loan(session, book_id, verbose=False)
|
||||
raise Exception("Borrow again")
|
||||
if response.status_code == 200:
|
||||
retry = False
|
||||
except Exception:
|
||||
time.sleep(1)
|
||||
|
||||
image = image_name(pages, i, directory)
|
||||
|
||||
if response is None:
|
||||
log(f"Failed to download page {i}", file=sys.stderr)
|
||||
return
|
||||
|
||||
obf_header = response.headers.get("X-Obfuscate")
|
||||
image_content = None
|
||||
if obf_header:
|
||||
try:
|
||||
image_content = deobfuscate_image(response.content, link, obf_header)
|
||||
except Exception as e:
|
||||
log(f"Deobfuscation failed: {e}", file=sys.stderr)
|
||||
return
|
||||
else:
|
||||
image_content = response.content
|
||||
|
||||
with open(image, "wb") as f:
|
||||
f.write(image_content)
|
||||
|
||||
|
||||
def download(
|
||||
session: requests.Session,
|
||||
n_threads: int,
|
||||
directory: str,
|
||||
links: List[str],
|
||||
scale: int,
|
||||
book_id: str,
|
||||
) -> List[str]:
|
||||
"""Download all book pages as images.
|
||||
|
||||
Uses thread pool for parallel downloads.
|
||||
|
||||
Args:
|
||||
session: Authenticated requests.Session
|
||||
n_threads: Number of download threads
|
||||
directory: Directory to save images to
|
||||
links: List of image URLs
|
||||
scale: Image resolution (0=highest, 10=lowest)
|
||||
book_id: Archive.org book ID (for re-borrowing)
|
||||
|
||||
Returns:
|
||||
List of downloaded image file paths
|
||||
"""
|
||||
debug("Downloading pages...")
|
||||
links = [f"{link}&rotate=0&scale={scale}" for link in links]
|
||||
pages = len(links)
|
||||
|
||||
tasks = []
|
||||
with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
|
||||
for link in links:
|
||||
i = links.index(link)
|
||||
tasks.append(
|
||||
executor.submit(
|
||||
download_one_image,
|
||||
session=session,
|
||||
link=link,
|
||||
i=i,
|
||||
directory=directory,
|
||||
book_id=book_id,
|
||||
pages=pages,
|
||||
)
|
||||
)
|
||||
if tqdm:
|
||||
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
|
||||
pass
|
||||
else:
|
||||
for _ in futures.as_completed(tasks):
|
||||
pass
|
||||
|
||||
images = [image_name(pages, i, directory) for i in range(len(links))]
|
||||
return images
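
# Illustrative sketch (added for clarity; not part of the original module): how the
# helpers above compose into a borrow-and-download flow. The identifier, thread count
# and scale are placeholders; the caller supplies an existing output directory.
def _example_borrow_and_download(config: Dict[str, Any], book_id: str, directory: str) -> List[str]:
    email, password = credential_openlibrary(config)
    if not email or not password:
        raise RuntimeError("Archive.org credentials missing from config")
    session = login(email, password)
    session = loan(session, book_id)
    try:
        _title, links, _metadata = get_book_infos(session, f"https://archive.org/details/{book_id}")
        return download(session, n_threads=4, directory=directory, links=links, scale=3, book_id=book_id)
    finally:
        return_loan(session, book_id)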
|
||||
|
||||
|
||||
def check_direct_download(book_id: str) -> Tuple[bool, str]:
|
||||
"""Check if a book can be downloaded directly without borrowing.
|
||||
|
||||
Searches Archive.org metadata for downloadable PDF files.
|
||||
|
||||
Args:
|
||||
book_id: Archive.org book identifier
|
||||
|
||||
Returns:
|
||||
Tuple of (can_download: bool, pdf_url: str)
|
||||
"""
|
||||
try:
|
||||
# First, try to get the metadata to find the actual PDF filename
|
||||
metadata_url = f"https://archive.org/metadata/{book_id}"
|
||||
response = requests.get(metadata_url, timeout=10)
|
||||
response.raise_for_status()
|
||||
metadata = response.json()
|
||||
|
||||
# Find PDF file in files list
|
||||
if "files" in metadata:
|
||||
for file_info in metadata["files"]:
|
||||
filename = file_info.get("name", "")
|
||||
if filename.endswith(".pdf") and file_info.get("source") == "original":
|
||||
# Found the original PDF
|
||||
pdf_filename = filename
|
||||
pdf_url = f"https://archive.org/download/{book_id}/{pdf_filename.replace(' ', '%20')}"
|
||||
|
||||
# Verify it's accessible
|
||||
check_response = requests.head(pdf_url, timeout=5, allow_redirects=True)
|
||||
if check_response.status_code == 200:
|
||||
return True, pdf_url
|
||||
|
||||
return False, ""
|
||||
|
||||
except Exception as e:
|
||||
log(f"Error checking direct download: {e}", file=sys.stderr)
|
||||
return False, ""
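
# Example usage (added note; the identifier is a placeholder):
#   can_download, pdf_url = check_direct_download("exampleuploadidentifier")
#   # can_download is True only when an "original"-source PDF exists and responds 200.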
|
||||
|
||||
|
||||
def get_openlibrary_by_isbn(isbn: str) -> Dict[str, Any]:
|
||||
"""Fetch book data from OpenLibrary using ISBN.
|
||||
|
||||
Args:
|
||||
isbn: ISBN-10 or ISBN-13 to search for
|
||||
|
||||
Returns:
|
||||
Dictionary with book metadata from OpenLibrary
|
||||
"""
|
||||
try:
|
||||
# Try ISBN API first
|
||||
api_url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json"
|
||||
response = requests.get(api_url, timeout=10)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
if data:
|
||||
# Get first result
|
||||
key = list(data.keys())[0]
|
||||
return data[key]
|
||||
return {}
|
||||
except Exception as e:
|
||||
log(f"Error fetching OpenLibrary data by ISBN: {e}", file=sys.stderr)
|
||||
return {}
|
||||
|
||||
|
||||
def extract_isbn_from_metadata(metadata: Dict[str, Any]) -> str:
|
||||
"""Extract ISBN from archive.org metadata.
|
||||
|
||||
Looks for ISBN in various metadata fields.
|
||||
|
||||
Args:
|
||||
metadata: Archive.org metadata dictionary
|
||||
|
||||
Returns:
|
||||
ISBN string (clean, no hyphens) or empty string if not found
|
||||
"""
|
||||
# Try various common metadata fields
|
||||
isbn_fields = [
|
||||
"isbn", "ISBN", "isbn_13", "isbn_10", "isbns",
|
||||
"isbn-10", "isbn-13", "identifer_isbn"
|
||||
]
|
||||
|
||||
for field in isbn_fields:
|
||||
if field in metadata:
|
||||
isbn_val = metadata[field]
|
||||
if isinstance(isbn_val, list):
|
||||
isbn_val = isbn_val[0] if isbn_val else None
|
||||
if isbn_val and isinstance(isbn_val, str):
|
||||
# Clean ISBN (remove hyphens, spaces)
|
||||
isbn_clean = isbn_val.replace("-", "").replace(" ", "")
|
||||
if len(isbn_clean) in [10, 13]:
|
||||
return isbn_clean
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
def normalize_url(url: str) -> str:
|
||||
"""Convert openlibrary.org URL to archive.org URL.
|
||||
|
||||
Looks up the actual Archive.org ID from OpenLibrary API.
|
||||
|
||||
Args:
|
||||
url: Book URL (archive.org or openlibrary.org format)
|
||||
|
||||
Returns:
|
||||
Normalized archive.org URL
|
||||
"""
|
||||
url = url.strip()
|
||||
|
||||
# Already archive.org format
|
||||
if url.startswith("https://archive.org/details/"):
|
||||
return url
|
||||
|
||||
# Convert openlibrary.org format by querying the OpenLibrary API
|
||||
if "openlibrary.org/books/" in url:
|
||||
try:
|
||||
# Extract the book ID (e.g., OL6796852M)
|
||||
parts = url.split("/books/")
|
||||
if len(parts) > 1:
|
||||
book_id = parts[1].split("/")[0]
|
||||
|
||||
# Query OpenLibrary API to get the book metadata
|
||||
api_url = f"https://openlibrary.org/books/{book_id}.json"
|
||||
response = requests.get(api_url, timeout=10)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
# Look for identifiers including internet_archive or ocaid
|
||||
# First try ocaid (Open Content Alliance ID) - this is most common
|
||||
if "ocaid" in data:
|
||||
ocaid = data["ocaid"]
|
||||
return f"https://archive.org/details/{ocaid}"
|
||||
|
||||
# Check for identifiers object
|
||||
if "identifiers" in data:
|
||||
identifiers = data["identifiers"]
|
||||
|
||||
# Look for internet_archive ID
|
||||
if "internet_archive" in identifiers:
|
||||
ia_ids = identifiers["internet_archive"]
|
||||
if isinstance(ia_ids, list) and ia_ids:
|
||||
ia_id = ia_ids[0]
|
||||
else:
|
||||
ia_id = ia_ids
|
||||
return f"https://archive.org/details/{ia_id}"
|
||||
|
||||
# If no IA identifier found, use the book ID as fallback
|
||||
log(f"No Internet Archive ID found for {book_id}. Attempting with OpenLibrary ID.", file=sys.stderr)
|
||||
return f"https://archive.org/details/{book_id}"
|
||||
|
||||
except requests.RequestException as e:
|
||||
log(f"Could not fetch OpenLibrary metadata: {e}", file=sys.stderr)
|
||||
# Fallback to using the book ID directly
|
||||
parts = url.split("/books/")
|
||||
if len(parts) > 1:
|
||||
book_id = parts[1].split("/")[0]
|
||||
return f"https://archive.org/details/{book_id}"
|
||||
except (KeyError, IndexError) as e:
|
||||
log(f"Error parsing OpenLibrary response: {e}", file=sys.stderr)
|
||||
# Fallback to using the book ID directly
|
||||
parts = url.split("/books/")
|
||||
if len(parts) > 1:
|
||||
book_id = parts[1].split("/")[0]
|
||||
return f"https://archive.org/details/{book_id}"
|
||||
|
||||
# Return original if can't parse
|
||||
return url
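
# Example conversions (added note; the edition ID matches the docstring example above):
#   "https://archive.org/details/somebookid"      -> returned unchanged
#   "https://openlibrary.org/books/OL6796852M/x"  -> "https://archive.org/details/<ocaid>"
# falling back to "https://archive.org/details/OL6796852M" when the edition JSON
# exposes no ocaid or internet_archive identifier.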
|
||||
API/folder.py
@@ -407,38 +407,53 @@ class API_folder_store:
|
||||
logger.error(f"Error clearing worker log for {worker_id}: {exc}", exc_info=True)
|
||||
|
||||
def _migrate_metadata_schema(self, cursor) -> None:
|
||||
"""Import legacy metadata from old schema if present. Existing hash-based schema is ready to use."""
|
||||
"""Ensure metadata schema is up-to-date.
|
||||
|
||||
- If a legacy schema is detected, attempt to import/upgrade (best-effort).
|
||||
- If the hash-based schema exists, add any missing columns expected by current code.
|
||||
"""
|
||||
try:
|
||||
# Check if this is a fresh new database (hash-based schema)
|
||||
cursor.execute('PRAGMA table_info(metadata)')
|
||||
existing_columns = {row[1] for row in cursor.fetchall()}
|
||||
|
||||
# If hash column exists, we're already on the new schema
|
||||
if 'hash' in existing_columns:
|
||||
logger.info("Database is already using hash-based schema - no migration needed")
|
||||
return
|
||||
|
||||
# Legacy migration: If old schema exists, try to import data
|
||||
|
||||
# Legacy migration: If old schema exists, try to import data.
|
||||
# Old schema would have had: id (INTEGER PRIMARY KEY), file_hash (TEXT), etc.
|
||||
if 'id' in existing_columns and 'file_hash' in existing_columns:
|
||||
logger.info("Detected legacy metadata schema - importing to new hash-based schema")
|
||||
# This would be complex legacy migration - for now just note it
|
||||
logger.info("Legacy metadata table detected but import not yet implemented")
|
||||
if 'hash' not in existing_columns:
|
||||
if 'id' in existing_columns and 'file_hash' in existing_columns:
|
||||
logger.info("Detected legacy metadata schema - importing to new hash-based schema")
|
||||
# This would be complex legacy migration - for now just note it.
|
||||
logger.info("Legacy metadata table detected but import not yet implemented")
|
||||
return
|
||||
|
||||
# Unknown/unsupported schema; nothing we can safely do here.
|
||||
return
|
||||
|
||||
# Add any missing columns to the new schema
|
||||
for col_name, col_def in [('size', 'INTEGER'), ('ext', 'TEXT'),
|
||||
('type', 'TEXT'),
|
||||
('time_imported', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP'),
|
||||
('time_modified', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP')]:
|
||||
|
||||
# Hash-based schema exists: add any missing columns expected by current code.
|
||||
# These are safe ALTER TABLE additions for older DBs.
|
||||
column_specs = {
|
||||
'size': 'INTEGER',
|
||||
'ext': 'TEXT',
|
||||
'type': 'TEXT',
|
||||
'url': 'TEXT',
|
||||
'relationships': 'TEXT',
|
||||
'duration': 'REAL',
|
||||
'time_imported': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
|
||||
'time_modified': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
|
||||
'created_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
|
||||
'updated_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
|
||||
}
|
||||
|
||||
for col_name, col_def in column_specs.items():
|
||||
if col_name not in existing_columns:
|
||||
try:
|
||||
cursor.execute(f"ALTER TABLE metadata ADD COLUMN {col_name} {col_def}")
|
||||
existing_columns.add(col_name)
|
||||
logger.info(f"Added '{col_name}' column to metadata table")
|
||||
except Exception as e:
|
||||
logger.debug(f"Column '{col_name}' may already exist: {e}")
|
||||
|
||||
# Populate type column from ext if not already populated
|
||||
|
||||
# Populate type column from ext if not already populated.
|
||||
if 'type' in existing_columns and 'ext' in existing_columns:
|
||||
try:
|
||||
from SYS.utils_constant import get_type_from_ext
|
||||
@@ -451,7 +466,7 @@ class API_folder_store:
|
||||
logger.info(f"Populated type column for {len(rows)} metadata entries")
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not populate type column: {e}")
|
||||
|
||||
|
||||
self.connection.commit()
|
||||
except Exception as e:
|
||||
logger.debug(f"Note: Schema import/migration completed with status: {e}")
|
||||
@@ -929,6 +944,13 @@ class API_folder_store:
|
||||
if not fields:
|
||||
return
|
||||
|
||||
# Ensure a metadata row exists so updates don't silently no-op.
|
||||
# This can happen for older DBs or entries created without explicit metadata.
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO metadata (hash) VALUES (?)",
|
||||
(file_hash,),
|
||||
)
|
||||
|
||||
values.append(file_hash)
|
||||
|
||||
sql = f"UPDATE metadata SET {', '.join(fields)}, time_modified = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP WHERE hash = ?"
|
||||
@@ -1681,6 +1703,84 @@ class DatabaseAPI:
|
||||
)
|
||||
return {row[0] for row in cursor.fetchall()}
|
||||
|
||||
def get_file_hashes_with_any_url(self, limit: Optional[int] = None) -> Set[str]:
|
||||
"""Get hashes of files that have any non-empty URL metadata."""
|
||||
cursor = self.get_cursor()
|
||||
cursor.execute(
|
||||
"""
|
||||
SELECT DISTINCT f.hash
|
||||
FROM files f
|
||||
JOIN metadata m ON f.hash = m.hash
|
||||
WHERE m.url IS NOT NULL
|
||||
AND TRIM(m.url) != ''
|
||||
AND TRIM(m.url) != '[]'
|
||||
LIMIT ?
|
||||
""",
|
||||
(limit or 10000,),
|
||||
)
|
||||
return {row[0] for row in cursor.fetchall()}
|
||||
|
||||
def get_file_hashes_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> Set[str]:
|
||||
"""Get hashes of files whose URL metadata contains a substring (case-insensitive)."""
|
||||
cursor = self.get_cursor()
|
||||
cursor.execute(
|
||||
"""
|
||||
SELECT DISTINCT f.hash
|
||||
FROM files f
|
||||
JOIN metadata m ON f.hash = m.hash
|
||||
WHERE m.url IS NOT NULL
|
||||
AND LOWER(m.url) LIKE ?
|
||||
LIMIT ?
|
||||
""",
|
||||
(like_pattern.lower(), limit or 10000),
|
||||
)
|
||||
return {row[0] for row in cursor.fetchall()}
|
||||
|
||||
def get_files_with_any_url(self, limit: Optional[int] = None) -> List[tuple]:
|
||||
"""Get files that have any non-empty URL metadata.
|
||||
|
||||
Returns (hash, file_path, size, ext) tuples.
|
||||
"""
|
||||
cursor = self.get_cursor()
|
||||
cursor.execute(
|
||||
"""
|
||||
SELECT f.hash, f.file_path,
|
||||
COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
|
||||
COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext
|
||||
FROM files f
|
||||
JOIN metadata m ON f.hash = m.hash
|
||||
WHERE m.url IS NOT NULL
|
||||
AND TRIM(m.url) != ''
|
||||
AND TRIM(m.url) != '[]'
|
||||
ORDER BY f.file_path
|
||||
LIMIT ?
|
||||
""",
|
||||
(limit or 10000,),
|
||||
)
|
||||
return cursor.fetchall()
|
||||
|
||||
def get_files_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> List[tuple]:
|
||||
"""Get files whose URL metadata contains a substring (case-insensitive).
|
||||
|
||||
Returns (hash, file_path, size, ext) tuples.
|
||||
"""
|
||||
cursor = self.get_cursor()
|
||||
cursor.execute(
|
||||
"""
|
||||
SELECT f.hash, f.file_path,
|
||||
COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
|
||||
COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext
|
||||
FROM files f
|
||||
JOIN metadata m ON f.hash = m.hash
|
||||
WHERE m.url IS NOT NULL
|
||||
AND LOWER(m.url) LIKE ?
|
||||
ORDER BY f.file_path
|
||||
LIMIT ?
|
||||
""",
|
||||
(like_pattern.lower(), limit or 10000),
|
||||
)
|
||||
return cursor.fetchall()
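
# Illustrative usage sketch (added note; not part of the original class; "api" stands
# for a DatabaseAPI instance). The LIKE pattern must carry its own wildcards and is
# lowercased before binding:
#   hashes = api.get_file_hashes_with_any_url()                  # url set and not "[]"
#   rows = api.get_files_by_url_like("%openlibrary.org%", limit=100)
#   # rows are (hash, file_path, size, ext) tuples, as documented above.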
|
||||
|
||||
def get_file_metadata(self, file_hashes: Set[str], limit: Optional[int] = None) -> List[tuple]:
|
||||
"""Get metadata for files given their hashes. Returns (hash, file_path, size, extension) tuples."""
|
||||
if not file_hashes:
|
||||
|
||||
CLI.py
@@ -1498,6 +1498,9 @@ def _execute_pipeline(tokens: list):
|
||||
elif table_type == 'soulseek':
|
||||
print(f"Auto-piping Soulseek selection to download-file")
|
||||
stages.append(['download-file'])
|
||||
elif table_type == 'openlibrary':
|
||||
print(f"Auto-piping OpenLibrary selection to download-file")
|
||||
stages.append(['download-file'])
|
||||
elif source_cmd == 'search-file' and source_args and 'youtube' in source_args:
|
||||
# Legacy check
|
||||
print(f"Auto-piping YouTube selection to .pipe")
|
||||
@@ -1667,6 +1670,35 @@ def _execute_pipeline(tokens: list):
|
||||
filtered_pipe_objs = [coerce_to_pipe_object(item) for item in filtered]
|
||||
piped_result = filtered_pipe_objs if len(filtered_pipe_objs) > 1 else filtered_pipe_objs[0]
|
||||
print(f"Selected {len(filtered)} item(s) using {cmd_name}")
|
||||
|
||||
# If selection is the last stage and looks like a provider result,
|
||||
# auto-initiate the borrow/download flow.
|
||||
if stage_index + 1 >= len(stages):
|
||||
try:
|
||||
from ProviderCore.registry import get_search_provider as _get_search_provider
|
||||
except Exception:
|
||||
_get_search_provider = None
|
||||
|
||||
if _get_search_provider is not None:
|
||||
selected_list = filtered_pipe_objs
|
||||
provider_table: Optional[str] = None
|
||||
try:
|
||||
for obj in selected_list:
|
||||
extra = getattr(obj, "extra", None)
|
||||
if isinstance(extra, dict) and extra.get("table"):
|
||||
provider_table = str(extra.get("table"))
|
||||
break
|
||||
except Exception:
|
||||
provider_table = None
|
||||
|
||||
if provider_table:
|
||||
try:
|
||||
provider = _get_search_provider(provider_table, config)
|
||||
except Exception:
|
||||
provider = None
|
||||
if provider is not None:
|
||||
print("Auto-downloading selection via download-file")
|
||||
stages.append(["download-file"])
|
||||
continue
|
||||
else:
|
||||
print(f"No items matched selection {cmd_name}\n")
|
||||
@@ -1736,13 +1768,14 @@ def _execute_pipeline(tokens: list):
|
||||
}
|
||||
# Display-only commands (just show data, don't modify or search)
|
||||
display_only_commands = {
|
||||
'get-url', 'get_url', 'get-note', 'get_note',
|
||||
'get-note', 'get_note',
|
||||
'get-relationship', 'get_relationship', 'get-file', 'get_file',
|
||||
'check-file-status', 'check_file_status'
|
||||
}
|
||||
# Commands that manage their own table/history state (e.g. get-tag)
|
||||
self_managing_commands = {
|
||||
'get-tag', 'get_tag', 'tags',
|
||||
'get-url', 'get_url',
|
||||
'search-file', 'search_file'
|
||||
}
|
||||
|
||||
|
||||
@@ -1,19 +1,38 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from concurrent import futures
|
||||
import hashlib
|
||||
import json as json_module
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
from API.HTTP import HTTPClient
|
||||
from ProviderCore.base import SearchProvider, SearchResult
|
||||
from ProviderCore.download import download_file, sanitize_filename
|
||||
from cli_syntax import get_field, get_free_text, parse_query
|
||||
from SYS.logger import log
|
||||
from SYS.utils import unique_path
|
||||
|
||||
try:
|
||||
from Crypto.Cipher import AES # type: ignore
|
||||
from Crypto.Util import Counter # type: ignore
|
||||
except ImportError:
|
||||
AES = None # type: ignore
|
||||
Counter = None # type: ignore
|
||||
|
||||
try:
|
||||
from tqdm import tqdm # type: ignore
|
||||
except ImportError:
|
||||
tqdm = None # type: ignore
|
||||
|
||||
|
||||
def _looks_like_isbn(text: str) -> bool:
|
||||
t = (text or "").replace("-", "").strip()
|
||||
@@ -38,6 +57,13 @@ def _resolve_edition_id(doc: Dict[str, Any]) -> str:
|
||||
edition_key = doc.get("edition_key")
|
||||
if isinstance(edition_key, list) and edition_key:
|
||||
return str(edition_key[0]).strip()
|
||||
if isinstance(edition_key, str) and edition_key.strip():
|
||||
return edition_key.strip()
|
||||
|
||||
# Often present even when edition_key is missing.
|
||||
cover_edition_key = doc.get("cover_edition_key")
|
||||
if isinstance(cover_edition_key, str) and cover_edition_key.strip():
|
||||
return cover_edition_key.strip()
|
||||
|
||||
# Fallback: sometimes key can be /books/OL...M
|
||||
key = doc.get("key")
|
||||
@@ -54,7 +80,7 @@ def _check_lendable(session: requests.Session, edition_id: str) -> Tuple[bool, s
|
||||
return False, "not-an-edition"
|
||||
|
||||
url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{edition_id}"
|
||||
resp = session.get(url, timeout=10)
|
||||
resp = session.get(url, timeout=6)
|
||||
resp.raise_for_status()
|
||||
data = resp.json() or {}
|
||||
wrapped = data.get(f"OLID:{edition_id}")
|
||||
@@ -88,7 +114,7 @@ def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidate
|
||||
|
||||
# Otherwise query the edition JSON.
|
||||
try:
|
||||
resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=10)
|
||||
resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=6)
|
||||
resp.raise_for_status()
|
||||
data = resp.json() or {}
|
||||
|
||||
@@ -116,6 +142,522 @@ class OpenLibrary(SearchProvider):
|
||||
super().__init__(config)
|
||||
self._session = requests.Session()
|
||||
|
||||
class BookNotAvailableError(Exception):
|
||||
"""Raised when a book is not available for borrowing (waitlisted/in use)."""
|
||||
|
||||
@staticmethod
|
||||
def _credential_archive(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""Get Archive.org email/password from config.
|
||||
|
||||
Supports:
|
||||
- New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}}
|
||||
- Old: {"Archive": {"email": "...", "password": "..."}}
|
||||
{"archive_org_email": "...", "archive_org_password": "..."}
|
||||
"""
|
||||
if not isinstance(config, dict):
|
||||
return None, None
|
||||
|
||||
provider_config = config.get("provider", {})
|
||||
if isinstance(provider_config, dict):
|
||||
openlibrary_config = provider_config.get("openlibrary", {})
|
||||
if isinstance(openlibrary_config, dict):
|
||||
email = openlibrary_config.get("email")
|
||||
password = openlibrary_config.get("password")
|
||||
if email or password:
|
||||
return str(email) if email is not None else None, str(password) if password is not None else None
|
||||
|
||||
archive_config = config.get("Archive")
|
||||
if isinstance(archive_config, dict):
|
||||
email = archive_config.get("email")
|
||||
password = archive_config.get("password")
|
||||
if email or password:
|
||||
return str(email) if email is not None else None, str(password) if password is not None else None
|
||||
|
||||
email = config.get("archive_org_email")
|
||||
password = config.get("archive_org_password")
|
||||
return str(email) if email is not None else None, str(password) if password is not None else None
|
||||
|
||||
@staticmethod
|
||||
def _archive_error_body(response: requests.Response) -> str:
|
||||
try:
|
||||
body = response.text or ""
|
||||
except Exception:
|
||||
return ""
|
||||
if len(body) > 2000:
|
||||
return body[:1200] + "\n... (truncated) ...\n" + body[-400:]
|
||||
return body
|
||||
|
||||
@classmethod
|
||||
def _archive_login(cls, email: str, password: str) -> requests.Session:
|
||||
"""Login to archive.org using the token-based services endpoint (matches test-login.py)."""
|
||||
session = requests.Session()
|
||||
|
||||
token_resp = session.get("https://archive.org/services/account/login/", timeout=30)
|
||||
try:
|
||||
token_json = token_resp.json()
|
||||
except Exception as exc:
|
||||
raise RuntimeError(f"Archive login token parse failed: {exc}\n{cls._archive_error_body(token_resp)}")
|
||||
|
||||
if not token_json.get("success"):
|
||||
raise RuntimeError(f"Archive login token fetch failed\n{cls._archive_error_body(token_resp)}")
|
||||
|
||||
token = (token_json.get("value") or {}).get("token")
|
||||
if not token:
|
||||
raise RuntimeError("Archive login token missing")
|
||||
|
||||
headers = {"Content-Type": "application/x-www-form-urlencoded"}
|
||||
payload = {"username": email, "password": password, "t": token}
|
||||
|
||||
login_resp = session.post(
|
||||
"https://archive.org/services/account/login/",
|
||||
headers=headers,
|
||||
data=json_module.dumps(payload),
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
try:
|
||||
login_json = login_resp.json()
|
||||
except Exception as exc:
|
||||
raise RuntimeError(f"Archive login parse failed: {exc}\n{cls._archive_error_body(login_resp)}")
|
||||
|
||||
if login_json.get("success") is False:
|
||||
if login_json.get("value") == "bad_login":
|
||||
raise RuntimeError("Invalid Archive.org credentials")
|
||||
raise RuntimeError(f"Archive login failed: {login_json}")
|
||||
|
||||
return session
|
||||
|
||||
@classmethod
|
||||
def _archive_loan(cls, session: requests.Session, book_id: str, *, verbose: bool = True) -> requests.Session:
|
||||
data = {"action": "grant_access", "identifier": book_id}
|
||||
session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30)
|
||||
data["action"] = "browse_book"
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||
|
||||
if response.status_code == 400:
|
||||
try:
|
||||
err = (response.json() or {}).get("error")
|
||||
if err == "This book is not available to borrow at this time. Please try again later.":
|
||||
raise cls.BookNotAvailableError("Book is waitlisted or in use")
|
||||
raise RuntimeError(f"Borrow failed: {err or response.text}")
|
||||
except cls.BookNotAvailableError:
|
||||
raise
|
||||
except Exception:
|
||||
raise RuntimeError("The book cannot be borrowed")
|
||||
|
||||
data["action"] = "create_token"
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||
if "token" in (response.text or ""):
|
||||
return session
|
||||
raise RuntimeError("Something went wrong when trying to borrow the book")
|
||||
|
||||
@staticmethod
|
||||
def _archive_return_loan(session: requests.Session, book_id: str) -> None:
|
||||
data = {"action": "return_loan", "identifier": book_id}
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||
if response.status_code == 200:
|
||||
try:
|
||||
if (response.json() or {}).get("success"):
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
raise RuntimeError("Something went wrong when trying to return the book")
|
||||
|
||||
@staticmethod
|
||||
def _archive_get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
|
||||
"""Extract page links from Archive.org book reader."""
|
||||
r = session.get(url, timeout=30).text
|
||||
|
||||
# Matches: "url":"//archive.org/..." (allow whitespace)
|
||||
match = re.search(r'"url"\s*:\s*"([^"]+)"', r)
|
||||
if not match:
|
||||
raise RuntimeError("Failed to extract book info URL from response")
|
||||
|
||||
url_path = match.group(1)
|
||||
infos_url = ("https:" + url_path) if url_path.startswith("//") else url_path
|
||||
infos_url = infos_url.replace("\\u0026", "&")
|
||||
|
||||
response = session.get(infos_url, timeout=30)
|
||||
payload = response.json()
|
||||
data = payload["data"]
|
||||
|
||||
title = str(data["brOptions"]["bookTitle"]).strip().replace(" ", "_")
|
||||
title = "".join(c for c in title if c not in '<>:"/\\|?*')
|
||||
title = title[:150]
|
||||
|
||||
metadata = data.get("metadata") or {}
|
||||
links: List[str] = []
|
||||
br_data = (data.get("brOptions") or {}).get("data", [])
|
||||
if isinstance(br_data, list):
|
||||
for item in br_data:
|
||||
if isinstance(item, list):
|
||||
for page in item:
|
||||
if isinstance(page, dict) and "uri" in page:
|
||||
links.append(page["uri"])
|
||||
elif isinstance(item, dict) and "uri" in item:
|
||||
links.append(item["uri"])
|
||||
|
||||
if not links:
|
||||
raise RuntimeError("No pages found in book data")
|
||||
return title, links, metadata if isinstance(metadata, dict) else {}
|
||||
|
||||
@staticmethod
|
||||
def _archive_image_name(pages: int, page: int, directory: str) -> str:
|
||||
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
|
||||
|
||||
@staticmethod
|
||||
def _archive_deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes:
|
||||
if not AES or not Counter:
|
||||
raise RuntimeError("Crypto library not available")
|
||||
|
||||
try:
|
||||
version, counter_b64 = obf_header.split("|")
|
||||
except Exception as exc:
|
||||
raise ValueError("Invalid X-Obfuscate header format") from exc
|
||||
|
||||
if version != "1":
|
||||
raise ValueError("Unsupported obfuscation version: " + version)
|
||||
|
||||
aes_key = re.sub(r"^https?:\/\/.*?\/", "/", link)
|
||||
sha1_digest = hashlib.sha1(aes_key.encode("utf-8")).digest()
|
||||
key = sha1_digest[:16]
|
||||
|
||||
counter_bytes = base64.b64decode(counter_b64)
|
||||
if len(counter_bytes) != 16:
|
||||
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
|
||||
|
||||
prefix = counter_bytes[:8]
|
||||
initial_value = int.from_bytes(counter_bytes[8:], byteorder="big")
|
||||
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore
|
||||
cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore
|
||||
|
||||
decrypted_part = cipher.decrypt(image_data[:1024])
|
||||
return decrypted_part + image_data[1024:]
|
||||
|
||||
@classmethod
|
||||
def _archive_download_one_image(
|
||||
cls,
|
||||
session: requests.Session,
|
||||
link: str,
|
||||
i: int,
|
||||
directory: str,
|
||||
book_id: str,
|
||||
pages: int,
|
||||
) -> None:
|
||||
headers = {
|
||||
"Referer": "https://archive.org/",
|
||||
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
|
||||
"Sec-Fetch-Site": "same-site",
|
||||
"Sec-Fetch-Mode": "no-cors",
|
||||
"Sec-Fetch-Dest": "image",
|
||||
}
|
||||
|
||||
while True:
|
||||
try:
|
||||
response = session.get(link, headers=headers, timeout=30)
|
||||
if response.status_code == 403:
|
||||
cls._archive_loan(session, book_id, verbose=False)
|
||||
raise RuntimeError("Borrow again")
|
||||
if response.status_code == 200:
|
||||
break
|
||||
except Exception:
|
||||
time.sleep(1)
|
||||
|
||||
image = cls._archive_image_name(pages, i, directory)
|
||||
obf_header = response.headers.get("X-Obfuscate")
|
||||
if obf_header:
|
||||
image_content = cls._archive_deobfuscate_image(response.content, link, obf_header)
|
||||
else:
|
||||
image_content = response.content
|
||||
|
||||
with open(image, "wb") as f:
|
||||
f.write(image_content)
|
||||
|
||||
@classmethod
|
||||
def _archive_download(
|
||||
cls,
|
||||
session: requests.Session,
|
||||
n_threads: int,
|
||||
directory: str,
|
||||
links: List[str],
|
||||
scale: int,
|
||||
book_id: str,
|
||||
) -> List[str]:
|
||||
links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links]
|
||||
pages = len(links_scaled)
|
||||
|
||||
tasks = []
|
||||
with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
|
||||
for i, link in enumerate(links_scaled):
|
||||
tasks.append(
|
||||
executor.submit(
|
||||
cls._archive_download_one_image,
|
||||
session=session,
|
||||
link=link,
|
||||
i=i,
|
||||
directory=directory,
|
||||
book_id=book_id,
|
||||
pages=pages,
|
||||
)
|
||||
)
|
||||
if tqdm:
|
||||
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
|
||||
pass
|
||||
else:
|
||||
for _ in futures.as_completed(tasks):
|
||||
pass
|
||||
|
||||
return [cls._archive_image_name(pages, i, directory) for i in range(pages)]
|
||||
|
||||
@staticmethod
|
||||
def _archive_check_direct_download(book_id: str) -> Tuple[bool, str]:
|
||||
"""Check for a directly downloadable original PDF in Archive.org metadata."""
|
||||
try:
|
||||
metadata_url = f"https://archive.org/metadata/{book_id}"
|
||||
response = requests.get(metadata_url, timeout=6)
|
||||
response.raise_for_status()
|
||||
metadata = response.json()
|
||||
files = metadata.get("files") if isinstance(metadata, dict) else None
|
||||
if isinstance(files, list):
|
||||
for file_info in files:
|
||||
if not isinstance(file_info, dict):
|
||||
continue
|
||||
filename = str(file_info.get("name", ""))
|
||||
if filename.endswith(".pdf") and file_info.get("source") == "original":
|
||||
pdf_url = f"https://archive.org/download/{book_id}/{filename.replace(' ', '%20')}"
|
||||
check_response = requests.head(pdf_url, timeout=4, allow_redirects=True)
|
||||
if check_response.status_code == 200:
|
||||
return True, pdf_url
|
||||
return False, ""
|
||||
except Exception:
|
||||
return False, ""
|
||||
|
||||
@staticmethod
|
||||
def scrape_isbn_metadata(isbn: str) -> List[str]:
|
||||
"""Scrape tags for an ISBN using Open Library API.
|
||||
|
||||
Returns tags such as:
|
||||
- title:<...>, author:<...>, publish_date:<...>, publisher:<...>, description:<...>, pages:<...>
|
||||
- identifiers: openlibrary:<...>, lccn:<...>, oclc:<...>, goodreads:<...>, librarything:<...>, doi:<...>, internet_archive:<...>
|
||||
"""
|
||||
new_tags: List[str] = []
|
||||
|
||||
isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip()
|
||||
if not isbn_clean:
|
||||
return []
|
||||
|
||||
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
|
||||
try:
|
||||
with HTTPClient() as client:
|
||||
response = client.get(url)
|
||||
response.raise_for_status()
|
||||
data = json_module.loads(response.content.decode("utf-8"))
|
||||
except Exception as exc:
|
||||
log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
if not data:
|
||||
log(f"No ISBN metadata found for: {isbn}")
|
||||
return []
|
||||
|
||||
book_data = next(iter(data.values()), None)
|
||||
if not isinstance(book_data, dict):
|
||||
return []
|
||||
|
||||
if "title" in book_data:
|
||||
new_tags.append(f"title:{book_data['title']}")
|
||||
|
||||
authors = book_data.get("authors")
|
||||
if isinstance(authors, list):
|
||||
for author in authors[:3]:
|
||||
if isinstance(author, dict) and author.get("name"):
|
||||
new_tags.append(f"author:{author['name']}")
|
||||
|
||||
if book_data.get("publish_date"):
|
||||
new_tags.append(f"publish_date:{book_data['publish_date']}")
|
||||
|
||||
publishers = book_data.get("publishers")
|
||||
if isinstance(publishers, list) and publishers:
|
||||
pub = publishers[0]
|
||||
if isinstance(pub, dict) and pub.get("name"):
|
||||
new_tags.append(f"publisher:{pub['name']}")
|
||||
|
||||
if "description" in book_data:
|
||||
desc = book_data.get("description")
|
||||
if isinstance(desc, dict) and "value" in desc:
|
||||
desc = desc.get("value")
|
||||
if desc:
|
||||
desc_str = str(desc).strip()
|
||||
if desc_str:
|
||||
new_tags.append(f"description:{desc_str[:200]}")
|
||||
|
||||
page_count = book_data.get("number_of_pages")
|
||||
if isinstance(page_count, int) and page_count > 0:
|
||||
new_tags.append(f"pages:{page_count}")
|
||||
|
||||
identifiers = book_data.get("identifiers")
|
||||
if isinstance(identifiers, dict):
|
||||
|
||||
def _first(value: Any) -> Any:
|
||||
if isinstance(value, list) and value:
|
||||
return value[0]
|
||||
return value
|
||||
|
||||
for key, ns in (
|
||||
("openlibrary", "openlibrary"),
|
||||
("lccn", "lccn"),
|
||||
("oclc", "oclc"),
|
||||
("goodreads", "goodreads"),
|
||||
("librarything", "librarything"),
|
||||
("doi", "doi"),
|
||||
("internet_archive", "internet_archive"),
|
||||
):
|
||||
val = _first(identifiers.get(key))
|
||||
if val:
|
||||
new_tags.append(f"{ns}:{val}")
|
||||
|
||||
log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
|
||||
return new_tags
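
# Example output (added note; values are placeholders): a successful lookup returns
# namespaced tags such as
#   ["title:Example Title", "author:Jane Doe", "publish_date:2001",
#    "publisher:Example Press", "pages:320", "openlibrary:OL0000000M"]
# and an empty list when the Open Library API call fails or returns no data.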
|
||||
|
||||
@staticmethod
|
||||
def scrape_openlibrary_metadata(olid: str) -> List[str]:
|
||||
"""Scrape tags for an OpenLibrary ID using the .json API endpoint."""
|
||||
new_tags: List[str] = []
|
||||
|
||||
olid_text = str(olid or "").strip()
|
||||
if not olid_text:
|
||||
return []
|
||||
|
||||
# Normalize OLID to the common "OL<digits>M" form when possible.
|
||||
olid_norm = olid_text
|
||||
try:
|
||||
if not olid_norm.startswith("OL"):
|
||||
olid_norm = f"OL{olid_norm}"
|
||||
if not olid_norm.endswith("M"):
|
||||
olid_norm = f"{olid_norm}M"
|
||||
except Exception:
|
||||
olid_norm = olid_text
|
||||
|
||||
# Ensure we always include a scrapeable identifier tag.
|
||||
new_tags.append(f"openlibrary:{olid_norm}")
|
||||
|
||||
# Accept OL9674499M, 9674499M, or just digits.
|
||||
olid_clean = olid_text.replace("OL", "").replace("M", "")
|
||||
if not olid_clean.isdigit():
|
||||
olid_clean = olid_text
|
||||
|
||||
if not olid_text.startswith("OL"):
|
||||
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
|
||||
else:
|
||||
url = f"https://openlibrary.org/books/{olid_text}.json"
|
||||
|
||||
try:
|
||||
with HTTPClient() as client:
|
||||
response = client.get(url)
|
||||
response.raise_for_status()
|
||||
data = json_module.loads(response.content.decode("utf-8"))
|
||||
except Exception as exc:
|
||||
log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
if not isinstance(data, dict) or not data:
|
||||
log(f"No OpenLibrary metadata found for: {olid_text}")
|
||||
return []
|
||||
|
||||
if "title" in data:
|
||||
new_tags.append(f"title:{data['title']}")
|
||||
|
||||
authors = data.get("authors")
|
||||
if isinstance(authors, list):
|
||||
for author in authors[:3]:
|
||||
if isinstance(author, dict) and author.get("name"):
|
||||
new_tags.append(f"author:{author['name']}")
|
||||
continue
|
||||
|
||||
# Common OL shape: {"key": "/authors/OL...A"} or {"author": {"key": ...}}
|
||||
author_key = None
|
||||
if isinstance(author, dict):
|
||||
if isinstance(author.get("author"), dict):
|
||||
author_key = author.get("author", {}).get("key")
|
||||
if not author_key:
|
||||
author_key = author.get("key")
|
||||
|
||||
if isinstance(author_key, str) and author_key.startswith("/"):
|
||||
try:
|
||||
author_url = f"https://openlibrary.org{author_key}.json"
|
||||
with HTTPClient(timeout=10) as client:
|
||||
author_resp = client.get(author_url)
|
||||
author_resp.raise_for_status()
|
||||
author_data = json_module.loads(author_resp.content.decode("utf-8"))
|
||||
if isinstance(author_data, dict) and author_data.get("name"):
|
||||
new_tags.append(f"author:{author_data['name']}")
|
||||
continue
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if isinstance(author, str) and author:
|
||||
new_tags.append(f"author:{author}")
|
||||
|
||||
if data.get("publish_date"):
|
||||
new_tags.append(f"publish_date:{data['publish_date']}")
|
||||
|
||||
publishers = data.get("publishers")
|
||||
if isinstance(publishers, list) and publishers:
|
||||
pub = publishers[0]
|
||||
if isinstance(pub, dict) and pub.get("name"):
|
||||
new_tags.append(f"publisher:{pub['name']}")
|
||||
elif isinstance(pub, str) and pub:
|
||||
new_tags.append(f"publisher:{pub}")
|
||||
|
||||
if "description" in data:
|
||||
desc = data.get("description")
|
||||
if isinstance(desc, dict) and "value" in desc:
|
||||
desc = desc.get("value")
|
||||
if desc:
|
||||
desc_str = str(desc).strip()
|
||||
if desc_str:
|
||||
new_tags.append(f"description:{desc_str[:200]}")
|
||||
|
||||
page_count = data.get("number_of_pages")
|
||||
if isinstance(page_count, int) and page_count > 0:
|
||||
new_tags.append(f"pages:{page_count}")
|
||||
|
||||
subjects = data.get("subjects")
|
||||
if isinstance(subjects, list):
|
||||
for subject in subjects[:10]:
|
||||
if isinstance(subject, str):
|
||||
subject_clean = subject.strip()
|
||||
if subject_clean and subject_clean not in new_tags:
|
||||
new_tags.append(subject_clean)
|
||||
|
||||
identifiers = data.get("identifiers")
|
||||
if isinstance(identifiers, dict):
|
||||
|
||||
def _first(value: Any) -> Any:
|
||||
if isinstance(value, list) and value:
|
||||
return value[0]
|
||||
return value
|
||||
|
||||
for key, ns in (
|
||||
("isbn_10", "isbn_10"),
|
||||
("isbn_13", "isbn_13"),
|
||||
("lccn", "lccn"),
|
||||
("oclc_numbers", "oclc"),
|
||||
("goodreads", "goodreads"),
|
||||
("internet_archive", "internet_archive"),
|
||||
):
|
||||
val = _first(identifiers.get(key))
|
||||
if val:
|
||||
new_tags.append(f"{ns}:{val}")
|
||||
|
||||
# Some editions expose a direct Archive.org identifier as "ocaid".
|
||||
ocaid = data.get("ocaid")
|
||||
if isinstance(ocaid, str) and ocaid.strip():
|
||||
new_tags.append(f"internet_archive:{ocaid.strip()}")
|
||||
|
||||
log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
|
||||
return new_tags
|
||||
|
||||
def search(
|
||||
self,
|
||||
query: str,
|
||||
@@ -155,7 +697,70 @@ class OpenLibrary(SearchProvider):
|
||||
if not isinstance(docs, list):
|
||||
return []
|
||||
|
||||
for doc in docs[: int(limit)]:
|
||||
# Availability enrichment can be slow if done sequentially (it may require multiple
|
||||
# network calls per row). Do it concurrently to keep the pipeline responsive.
|
||||
docs = docs[: int(limit)]
|
||||
|
||||
def _compute_availability(doc_dict: Dict[str, Any]) -> Tuple[str, str, str, str]:
|
||||
edition_id_local = _resolve_edition_id(doc_dict)
|
||||
if not edition_id_local:
|
||||
return "no-olid", "", "", ""
|
||||
|
||||
ia_val_local = doc_dict.get("ia") or []
|
||||
if isinstance(ia_val_local, str):
|
||||
ia_val_local = [ia_val_local]
|
||||
if not isinstance(ia_val_local, list):
|
||||
ia_val_local = []
|
||||
ia_ids_local = [str(x) for x in ia_val_local if x]
|
||||
|
||||
session_local = requests.Session()
|
||||
|
||||
try:
|
||||
archive_id_local = _resolve_archive_id(session_local, edition_id_local, ia_ids_local)
|
||||
except Exception:
|
||||
archive_id_local = ""
|
||||
|
||||
if not archive_id_local:
|
||||
return "no-archive", "", "", ""
|
||||
|
||||
# Prefer the fastest signal first: OpenLibrary lendable status.
|
||||
lendable_local, reason_local = _check_lendable(session_local, edition_id_local)
|
||||
if lendable_local:
|
||||
return "borrow", reason_local, archive_id_local, ""
|
||||
|
||||
# Not lendable: check whether it's directly downloadable (public domain uploads, etc.).
|
||||
try:
|
||||
can_direct, pdf_url = self._archive_check_direct_download(archive_id_local)
|
||||
if can_direct and pdf_url:
|
||||
return "download", reason_local, archive_id_local, str(pdf_url)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return "unavailable", reason_local, archive_id_local, ""
|
||||
|
||||
availability_rows: List[Tuple[str, str, str, str]] = [("unknown", "", "", "") for _ in range(len(docs))]
|
||||
if docs:
|
||||
log(f"[openlibrary] Enriching availability for {len(docs)} result(s)...")
|
||||
max_workers = min(8, max(1, len(docs)))
|
||||
done = 0
|
||||
with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
future_to_index = {
|
||||
executor.submit(_compute_availability, doc_dict): i
|
||||
for i, doc_dict in enumerate(docs)
|
||||
if isinstance(doc_dict, dict)
|
||||
}
|
||||
for fut in futures.as_completed(list(future_to_index.keys())):
|
||||
i = future_to_index[fut]
|
||||
try:
|
||||
availability_rows[i] = fut.result()
|
||||
except Exception:
|
||||
availability_rows[i] = ("unknown", "", "", "")
|
||||
done += 1
|
||||
if done in {1, len(future_to_index)} or (done % 10 == 0):
|
||||
log(f"[openlibrary] Availability: {done}/{len(future_to_index)}")
|
||||
log("[openlibrary] Availability enrichment complete")
|
||||
|
||||
for idx, doc in enumerate(docs):
|
||||
if not isinstance(doc, dict):
|
||||
continue
|
||||
|
||||
@@ -172,6 +777,7 @@ class OpenLibrary(SearchProvider):
|
||||
year = str(year_val) if year_val is not None else ""
|
||||
|
||||
edition_id = _resolve_edition_id(doc)
|
||||
work_key = doc.get("key") if isinstance(doc.get("key"), str) else ""
|
||||
|
||||
ia_val = doc.get("ia") or []
|
||||
if isinstance(ia_val, str):
|
||||
@@ -193,9 +799,21 @@ class OpenLibrary(SearchProvider):
|
||||
("Title", book_title),
|
||||
("Author", ", ".join(authors_list)),
|
||||
("Year", year),
|
||||
("Avail", ""),
|
||||
("OLID", edition_id),
|
||||
]
|
||||
|
||||
# Determine availability using the concurrently computed enrichment.
|
||||
availability, availability_reason, archive_id, direct_url = ("unknown", "", "", "")
|
||||
if 0 <= idx < len(availability_rows):
|
||||
availability, availability_reason, archive_id, direct_url = availability_rows[idx]
|
||||
|
||||
# Patch the display column.
|
||||
for idx, (name, _val) in enumerate(columns):
|
||||
if name == "Avail":
|
||||
columns[idx] = ("Avail", availability)
|
||||
break
|
||||
|
||||
annotations: List[str] = []
|
||||
if isbn_13:
|
||||
annotations.append(f"isbn_13:{isbn_13}")
|
||||
@@ -203,12 +821,18 @@ class OpenLibrary(SearchProvider):
|
||||
annotations.append(f"isbn_10:{isbn_10}")
|
||||
if ia_ids:
|
||||
annotations.append("archive")
|
||||
if availability in {"download", "borrow"}:
|
||||
annotations.append(availability)
|
||||
|
||||
results.append(
|
||||
SearchResult(
|
||||
table="openlibrary",
|
||||
title=book_title,
|
||||
path=(f"https://openlibrary.org/books/{edition_id}" if edition_id else "https://openlibrary.org"),
|
||||
path=(
|
||||
f"https://openlibrary.org/books/{edition_id}" if edition_id else (
|
||||
f"https://openlibrary.org{work_key}" if isinstance(work_key, str) and work_key.startswith("/") else "https://openlibrary.org"
|
||||
)
|
||||
),
|
||||
detail=(
|
||||
(f"By: {', '.join(authors_list)}" if authors_list else "")
|
||||
+ (f" ({year})" if year else "")
|
||||
@@ -218,11 +842,16 @@ class OpenLibrary(SearchProvider):
|
||||
columns=columns,
|
||||
full_metadata={
|
||||
"openlibrary_id": edition_id,
|
||||
"openlibrary_key": work_key,
|
||||
"authors": authors_list,
|
||||
"year": year,
|
||||
"isbn_10": isbn_10,
|
||||
"isbn_13": isbn_13,
|
||||
"ia": ia_ids,
|
||||
"availability": availability,
|
||||
"availability_reason": availability_reason,
|
||||
"archive_id": archive_id,
|
||||
"direct_url": direct_url,
|
||||
"raw": doc,
|
||||
},
|
||||
)
|
||||
@@ -256,9 +885,7 @@ class OpenLibrary(SearchProvider):
|
||||
|
||||
# 1) Direct download if available.
|
||||
try:
|
||||
from API.archive_client import check_direct_download
|
||||
|
||||
can_direct, pdf_url = check_direct_download(archive_id)
|
||||
can_direct, pdf_url = self._archive_check_direct_download(archive_id)
|
||||
except Exception:
|
||||
can_direct, pdf_url = False, ""
|
||||
|
||||
@@ -272,10 +899,7 @@ class OpenLibrary(SearchProvider):
|
||||
|
||||
# 2) Borrow flow (credentials required).
|
||||
try:
|
||||
from API.archive_client import BookNotAvailableError, credential_openlibrary, download as archive_download
|
||||
from API.archive_client import get_book_infos, loan, login
|
||||
|
||||
email, password = credential_openlibrary(self.config or {})
|
||||
email, password = self._credential_archive(self.config or {})
|
||||
if not email or not password:
|
||||
log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
|
||||
return None
|
||||
@@ -285,13 +909,13 @@ class OpenLibrary(SearchProvider):
|
||||
log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
session = login(email, password)
|
||||
session = self._archive_login(email, password)
|
||||
try:
|
||||
session = loan(session, archive_id, verbose=False)
|
||||
except BookNotAvailableError:
|
||||
session = self._archive_loan(session, archive_id, verbose=False)
|
||||
except self.BookNotAvailableError:
|
||||
log("[openlibrary] Book not available to borrow", file=sys.stderr)
|
||||
return None
|
||||
except SystemExit:
|
||||
except Exception:
|
||||
log("[openlibrary] Borrow failed", file=sys.stderr)
|
||||
return None
|
||||
|
||||
@@ -301,7 +925,7 @@ class OpenLibrary(SearchProvider):
|
||||
last_exc: Optional[Exception] = None
|
||||
for u in urls:
|
||||
try:
|
||||
title_raw, links, _metadata = get_book_infos(session, u)
|
||||
title_raw, links, _metadata = self._archive_get_book_infos(session, u)
|
||||
if title_raw:
|
||||
title = sanitize_filename(title_raw)
|
||||
break
|
||||
@@ -315,7 +939,7 @@ class OpenLibrary(SearchProvider):
|
||||
|
||||
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
|
||||
try:
|
||||
images = archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
|
||||
images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
|
||||
|
||||
try:
|
||||
import img2pdf # type: ignore
|
||||
|
||||
@@ -642,7 +642,7 @@ def _download_direct_file(
|
||||
return DownloadMediaResult(
|
||||
path=file_path,
|
||||
info=info,
|
||||
tags=tags,
|
||||
tag=tags,
|
||||
source_url=url,
|
||||
hash_value=hash_value,
|
||||
)
|
||||
|
||||
@@ -36,6 +36,7 @@ mime_maps = {
"mp3": { "ext": ".mp3", "mimes": ["audio/mpeg", "audio/mp3"] },
"m4a": { "ext": ".m4a", "mimes": ["audio/mp4", "audio/x-m4a"] },
"ogg": { "ext": ".ogg", "mimes": ["audio/ogg"] },
"opus": { "ext": ".opus", "mimes": ["audio/opus"] },
"flac": { "ext": ".flac", "mimes": ["audio/flac"] },
"wav": { "ext": ".wav", "mimes": ["audio/wav", "audio/x-wav", "audio/vnd.wave"] },
"wma": { "ext": ".wma", "mimes": ["audio/x-ms-wma"] },
@@ -98,3 +99,13 @@ def get_type_from_ext(ext: str) -> str:
return type_name

return 'other'


# Canonical supported extension set for all stores/cmdlets.
# Derived from mime_maps so there is a single source of truth.
ALL_SUPPORTED_EXTENSIONS: set[str] = {
spec["ext"].lower()
for group in mime_maps.values()
for spec in group.values()
if isinstance(spec, dict) and isinstance(spec.get("ext"), str) and spec.get("ext")
}
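For reference, a self-contained sketch of the same derivation: a nested mime map flattened into a lowercase extension set so membership checks have a single source of truth. The map below is a toy example, not the project's full mime_maps.

mime_maps_example = {
    "audio": {
        "mp3": {"ext": ".mp3", "mimes": ["audio/mpeg"]},
        "flac": {"ext": ".flac", "mimes": ["audio/flac"]},
    },
    "video": {
        "mp4": {"ext": ".mp4", "mimes": ["video/mp4"]},
    },
}

# Flatten category -> name -> spec into one lowercase extension set.
ALL_EXTENSIONS = {
    spec["ext"].lower()
    for group in mime_maps_example.values()
    for spec in group.values()
    if isinstance(spec, dict) and isinstance(spec.get("ext"), str) and spec.get("ext")
}

assert ".flac" in ALL_EXTENSIONS  # e.g. how a store would test "is this file supported?"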
233 Store/Folder.py
@@ -30,6 +30,8 @@ def _resolve_file_hash(db_hash: Optional[str], file_path: Path) -> Optional[str]
|
||||
return _normalize_hash(file_path.stem)
|
||||
|
||||
|
||||
|
||||
|
||||
class Folder(Store):
|
||||
""""""
|
||||
# Track which locations have already been migrated to avoid repeated migrations
|
||||
@@ -359,6 +361,17 @@ class Folder(Store):
else:
shutil.copy2(str(file_path), str(save_file))
debug(f"Local copy: {save_file}", file=sys.stderr)

# Best-effort: capture duration for media
duration_value: float | None = None
try:
from SYS.utils import ffprobe
probe = ffprobe(str(save_file))
duration = probe.get("duration")
if isinstance(duration, (int, float)) and duration > 0:
duration_value = float(duration)
except Exception:
duration_value = None

# Save to database
with API_folder_store(Path(self._location)) as db:
@@ -368,7 +381,8 @@ class Folder(Store):
db.save_metadata(save_file, {
'hash': file_hash,
'ext': ext_clean,
'size': file_path.stat().st_size
'size': file_path.stat().st_size,
'duration': duration_value,
})
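A hypothetical sketch of what an ffprobe helper like the one imported above might do; the repository's SYS.utils.ffprobe may differ. It shells out to the ffprobe CLI and reads the container duration, returning None on any failure, mirroring the best-effort fallback in the hunk.

import json
import subprocess
from typing import Optional

def probe_duration(path: str) -> Optional[float]:
    # Ask ffprobe for just the container duration, as JSON.
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json", path,
    ]
    try:
        out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
        value = json.loads(out).get("format", {}).get("duration")
        duration = float(value) if value is not None else None
        return duration if duration and duration > 0 else None
    except (subprocess.SubprocessError, ValueError, OSError):
        return None  # best-effort: missing ffprobe or unreadable media just yields no duration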
|
||||
# Add tags if provided
|
||||
@@ -405,6 +419,21 @@ class Folder(Store):
results = []
search_dir = Path(self._location).expanduser()

def _url_like_pattern(value: str) -> str:
# Interpret user patterns as substring matches (with optional glob wildcards).
v = (value or "").strip().lower()
if not v or v == "*":
return "%"
v = v.replace("%", "\\%").replace("_", "\\_")
v = v.replace("*", "%").replace("?", "_")
if "%" not in v and "_" not in v:
return f"%{v}%"
if not v.startswith("%"):
v = "%" + v
if not v.endswith("%"):
v = v + "%"
return v
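Standalone sketch of the glob-to-LIKE translation in _url_like_pattern, with worked examples. It assumes the backend's LIKE query pairs the pattern with a backslash ESCAPE clause so the escaped % and _ are treated literally.

def url_like_pattern(value: str) -> str:
    # Literal % and _ are escaped, * / ? become SQL wildcards,
    # and bare terms are wrapped so they match as substrings.
    v = (value or "").strip().lower()
    if not v or v == "*":
        return "%"
    v = v.replace("%", "\\%").replace("_", "\\_")
    v = v.replace("*", "%").replace("?", "_")
    if "%" not in v and "_" not in v:
        return f"%{v}%"
    if not v.startswith("%"):
        v = "%" + v
    if not v.endswith("%"):
        v = v + "%"
    return v

assert url_like_pattern("youtube") == "%youtube%"
assert url_like_pattern("https://*.example.com/*") == "%https://%.example.com/%"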
tokens = [t.strip() for t in query.split(',') if t.strip()]
|
||||
|
||||
if not match_all and len(tokens) == 1 and _normalize_hash(query):
|
||||
@@ -453,6 +482,8 @@ class Folder(Store):
|
||||
try:
|
||||
with DatabaseAPI(search_dir) as api:
|
||||
if tokens and len(tokens) > 1:
|
||||
url_fetch_limit = (limit or 45) * 50
|
||||
|
||||
def _like_pattern(term: str) -> str:
|
||||
return term.replace('*', '%').replace('?', '_')
|
||||
|
||||
@@ -473,6 +504,11 @@ class Folder(Store):
|
||||
h = api.get_file_hash_by_hash(normalized_hash)
|
||||
return {h} if h else set()
|
||||
|
||||
if namespace == 'url':
|
||||
if not pattern or pattern == '*':
|
||||
return api.get_file_hashes_with_any_url(limit=url_fetch_limit)
|
||||
return api.get_file_hashes_by_url_like(_url_like_pattern(pattern), limit=url_fetch_limit)
|
||||
|
||||
if namespace == 'store':
|
||||
if pattern not in {'local', 'file', 'filesystem'}:
|
||||
return set()
|
||||
@@ -562,6 +598,29 @@ class Folder(Store):
|
||||
if limit is not None and len(results) >= limit:
|
||||
return results
|
||||
return results
|
||||
|
||||
if namespace == "url":
|
||||
if not pattern or pattern == "*":
|
||||
rows = api.get_files_with_any_url(limit)
|
||||
else:
|
||||
rows = api.get_files_by_url_like(_url_like_pattern(pattern), limit)
|
||||
for file_hash, file_path_str, size_bytes, ext in rows:
|
||||
if not file_path_str:
|
||||
continue
|
||||
file_path = Path(file_path_str)
|
||||
if not file_path.exists():
|
||||
continue
|
||||
if size_bytes is None:
|
||||
try:
|
||||
size_bytes = file_path.stat().st_size
|
||||
except OSError:
|
||||
size_bytes = None
|
||||
tags = api.get_tags_for_file(file_hash)
|
||||
entry = _create_entry(file_path, tags, size_bytes, file_hash)
|
||||
results.append(entry)
|
||||
if limit is not None and len(results) >= limit:
|
||||
return results
|
||||
return results
|
||||
|
||||
query_pattern = f"{namespace}:%"
|
||||
rows = api.get_files_by_namespace_pattern(query_pattern, limit)
|
||||
@@ -592,126 +651,59 @@ class Folder(Store):
|
||||
if limit is not None and len(results) >= limit:
|
||||
return results
|
||||
elif not match_all:
|
||||
# Strict tag-based search only (no filename/path searching).
|
||||
terms = [t.strip() for t in query_lower.replace(',', ' ').split() if t.strip()]
|
||||
if not terms:
|
||||
terms = [query_lower]
|
||||
|
||||
debug(f"Performing filename/tag search for terms: {terms}")
|
||||
|
||||
|
||||
fetch_limit = (limit or 45) * 50
|
||||
|
||||
conditions = ["LOWER(f.file_path) LIKE ?" for _ in terms]
|
||||
params = [f"%{t}%" for t in terms]
|
||||
|
||||
rows = api.get_files_by_multiple_path_conditions(conditions, params, fetch_limit)
|
||||
debug(f"Found {len(rows)} filename matches in DB (before whole-word filter)")
|
||||
|
||||
word_regex = None
|
||||
if len(terms) == 1:
|
||||
term = terms[0]
|
||||
has_wildcard = '*' in term or '?' in term
|
||||
|
||||
if has_wildcard:
|
||||
try:
|
||||
from fnmatch import translate
|
||||
word_regex = re.compile(translate(term), re.IGNORECASE)
|
||||
except Exception:
|
||||
word_regex = None
|
||||
else:
|
||||
try:
|
||||
pattern = r'(?<![a-zA-Z0-9])' + re.escape(term) + r'(?![a-zA-Z0-9])'
|
||||
word_regex = re.compile(pattern, re.IGNORECASE)
|
||||
except Exception:
|
||||
word_regex = None
|
||||
|
||||
seen_files = set()
|
||||
for file_id, file_path_str, size_bytes, file_hash in rows:
|
||||
if not file_path_str or file_path_str in seen_files:
|
||||
continue
|
||||
|
||||
if word_regex:
|
||||
p = Path(file_path_str)
|
||||
if not word_regex.search(p.name):
|
||||
# AND semantics across terms: each term must match at least one tag.
|
||||
hits: dict[str, dict[str, Any]] = {}
|
||||
for term in terms:
|
||||
tag_pattern = f"%{term}%"
|
||||
term_rows = api.get_files_by_namespace_pattern(tag_pattern, fetch_limit)
|
||||
for file_hash, file_path_str, size_bytes, ext in term_rows:
|
||||
if not file_path_str:
|
||||
continue
|
||||
seen_files.add(file_path_str)
|
||||
|
||||
file_path = Path(file_path_str)
|
||||
if file_path.exists():
|
||||
if size_bytes is None:
|
||||
size_bytes = file_path.stat().st_size
|
||||
|
||||
tags = api.get_tags_for_file(file_hash)
|
||||
entry = _create_entry(file_path, tags, size_bytes, file_hash)
|
||||
results.append(entry)
|
||||
if limit is not None and len(results) >= limit:
|
||||
return results
|
||||
entry = hits.get(file_hash)
|
||||
if entry:
|
||||
entry["count"] += 1
|
||||
if size_bytes is not None:
|
||||
entry["size"] = size_bytes
|
||||
else:
|
||||
hits[file_hash] = {
|
||||
"path": file_path_str,
|
||||
"size": size_bytes,
|
||||
"hash": file_hash,
|
||||
"count": 1,
|
||||
}
|
||||
|
||||
if terms:
|
||||
title_hits: dict[str, dict[str, Any]] = {}
|
||||
for term in terms:
|
||||
title_pattern = f"title:%{term}%"
|
||||
title_rows = api.get_files_by_title_tag_pattern(title_pattern, fetch_limit)
|
||||
for file_hash, file_path_str, size_bytes, ext in title_rows:
|
||||
if not file_path_str:
|
||||
continue
|
||||
entry = title_hits.get(file_hash)
|
||||
if entry:
|
||||
entry["count"] += 1
|
||||
if size_bytes is not None:
|
||||
entry["size"] = size_bytes
|
||||
else:
|
||||
title_hits[file_hash] = {
|
||||
"path": file_path_str,
|
||||
"size": size_bytes,
|
||||
"hash": file_hash,
|
||||
"count": 1,
|
||||
}
|
||||
|
||||
if title_hits:
|
||||
required = len(terms)
|
||||
for file_hash, info in title_hits.items():
|
||||
if info.get("count") != required:
|
||||
continue
|
||||
file_path_str = info.get("path")
|
||||
if not file_path_str or file_path_str in seen_files:
|
||||
continue
|
||||
file_path = Path(file_path_str)
|
||||
if not file_path.exists():
|
||||
continue
|
||||
seen_files.add(file_path_str)
|
||||
|
||||
size_bytes = info.get("size")
|
||||
if size_bytes is None:
|
||||
try:
|
||||
size_bytes = file_path.stat().st_size
|
||||
except OSError:
|
||||
size_bytes = None
|
||||
|
||||
tags = api.get_tags_for_file(file_hash)
|
||||
entry = _create_entry(file_path, tags, size_bytes, info.get("hash"))
|
||||
results.append(entry)
|
||||
if limit is not None and len(results) >= limit:
|
||||
return results
|
||||
|
||||
query_pattern = f"%{query_lower}%"
|
||||
tag_rows = api.get_files_by_simple_tag_pattern(query_pattern, limit)
|
||||
|
||||
for file_hash, file_path_str, size_bytes, ext in tag_rows:
|
||||
required = len(terms)
|
||||
seen_files: set[str] = set()
|
||||
for file_hash, info in hits.items():
|
||||
if info.get("count") != required:
|
||||
continue
|
||||
file_path_str = info.get("path")
|
||||
if not file_path_str or file_path_str in seen_files:
|
||||
continue
|
||||
seen_files.add(file_path_str)
|
||||
|
||||
file_path = Path(file_path_str)
|
||||
if file_path.exists():
|
||||
if size_bytes is None:
|
||||
if not file_path.exists():
|
||||
continue
|
||||
seen_files.add(file_path_str)
|
||||
|
||||
size_bytes = info.get("size")
|
||||
if size_bytes is None:
|
||||
try:
|
||||
size_bytes = file_path.stat().st_size
|
||||
|
||||
tags = api.get_tags_for_file(file_hash)
|
||||
entry = _create_entry(file_path, tags, size_bytes, file_hash)
|
||||
results.append(entry)
|
||||
|
||||
if limit is not None and len(results) >= limit:
|
||||
return results
|
||||
except OSError:
|
||||
size_bytes = None
|
||||
|
||||
tags = api.get_tags_for_file(file_hash)
|
||||
entry_obj = _create_entry(file_path, tags, size_bytes, info.get("hash"))
|
||||
results.append(entry_obj)
|
||||
if limit is not None and len(results) >= limit:
|
||||
break
|
||||
|
||||
else:
|
||||
rows = api.get_all_files(limit)
|
||||
@@ -726,10 +718,8 @@ class Folder(Store):
|
||||
entry = _create_entry(file_path, tags, size_bytes, file_hash)
|
||||
results.append(entry)
|
||||
|
||||
if results:
|
||||
debug(f"Returning {len(results)} results from DB")
|
||||
else:
|
||||
debug("No results found in DB")
|
||||
backend_label = str(getattr(self, "_name", "") or getattr(self, "NAME", "") or "folder")
|
||||
debug(f"[folder:{backend_label}] {len(results)} result(s)")
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
@@ -938,9 +928,11 @@ class Folder(Store):
|
||||
file_hash = file_identifier
|
||||
if self._location:
|
||||
try:
|
||||
from metadata import normalize_urls
|
||||
with API_folder_store(Path(self._location)) as db:
|
||||
meta = db.get_metadata(file_hash) or {}
|
||||
return list(meta.get("url") or [])
|
||||
urls = normalize_urls(meta.get("url"))
|
||||
return urls
|
||||
except Exception as exc:
|
||||
debug(f"Local DB get_metadata failed: {exc}")
|
||||
return []
|
||||
@@ -955,11 +947,13 @@ class Folder(Store):
|
||||
file_hash = file_identifier
|
||||
if self._location:
|
||||
try:
|
||||
from metadata import normalize_urls
|
||||
with API_folder_store(Path(self._location)) as db:
|
||||
meta = db.get_metadata(file_hash) or {}
|
||||
existing_urls = list(meta.get("url") or [])
|
||||
existing_urls = normalize_urls(meta.get("url"))
|
||||
incoming_urls = normalize_urls(url)
|
||||
changed = False
|
||||
for u in list(url or []):
|
||||
for u in list(incoming_urls or []):
|
||||
if not u:
|
||||
continue
|
||||
if u not in existing_urls:
|
||||
@@ -982,10 +976,11 @@ class Folder(Store):
|
||||
file_hash = file_identifier
|
||||
if self._location:
|
||||
try:
|
||||
from metadata import normalize_urls
|
||||
with API_folder_store(Path(self._location)) as db:
|
||||
meta = db.get_metadata(file_hash) or {}
|
||||
existing_urls = list(meta.get("url") or [])
|
||||
remove_set = {u for u in (url or []) if u}
|
||||
existing_urls = normalize_urls(meta.get("url"))
|
||||
remove_set = {u for u in normalize_urls(url) if u}
|
||||
if not remove_set:
|
||||
return False
|
||||
new_urls = [u for u in existing_urls if u not in remove_set]
|
||||
|
||||
@@ -264,6 +264,170 @@ class HydrusNetwork(Store):

debug(f"Searching Hydrus for: {query}")

def _extract_urls(meta_obj: Any) -> list[str]:
if not isinstance(meta_obj, dict):
return []
raw = meta_obj.get("url")
if raw is None:
raw = meta_obj.get("urls")
if isinstance(raw, str):
val = raw.strip()
return [val] if val else []
if isinstance(raw, list):
out: list[str] = []
for item in raw:
if not isinstance(item, str):
continue
s = item.strip()
if s:
out.append(s)
return out
return []
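Minimal sketch of the _extract_urls contract shown above, restated so the examples run: a single string or a list under "url"/"urls" comes back as a cleaned list of non-empty strings.

from typing import Any

def extract_urls(meta: Any) -> list[str]:
    # Accept either a single string or a list; drop blanks and non-strings.
    if not isinstance(meta, dict):
        return []
    raw = meta.get("url")
    if raw is None:
        raw = meta.get("urls")
    if isinstance(raw, str):
        raw = [raw]
    if not isinstance(raw, list):
        return []
    return [s.strip() for s in raw if isinstance(s, str) and s.strip()]

assert extract_urls({"url": " https://a.example "}) == ["https://a.example"]
assert extract_urls({"urls": ["https://a.example", "", 3]}) == ["https://a.example"]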
def _iter_url_filtered_metadata(url_value: str | None, want_any: bool, fetch_limit: int) -> list[dict[str, Any]]:
|
||||
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
|
||||
|
||||
# First try a fast system predicate if Hydrus supports it.
|
||||
candidate_file_ids: list[int] = []
|
||||
try:
|
||||
if want_any:
|
||||
predicate = "system:has url"
|
||||
url_search = client.search_files(
|
||||
tags=[predicate],
|
||||
return_hashes=False,
|
||||
return_file_ids=True,
|
||||
return_file_count=False,
|
||||
)
|
||||
ids = url_search.get("file_ids", []) if isinstance(url_search, dict) else []
|
||||
if isinstance(ids, list):
|
||||
candidate_file_ids = [int(x) for x in ids if isinstance(x, (int, float, str)) and str(x).strip().isdigit()]
|
||||
except Exception:
|
||||
candidate_file_ids = []
|
||||
|
||||
if not candidate_file_ids:
|
||||
# Fallback: scan from system:everything and filter by URL substring.
|
||||
everything = client.search_files(
|
||||
tags=["system:everything"],
|
||||
return_hashes=False,
|
||||
return_file_ids=True,
|
||||
return_file_count=False,
|
||||
)
|
||||
ids = everything.get("file_ids", []) if isinstance(everything, dict) else []
|
||||
if isinstance(ids, list):
|
||||
candidate_file_ids = [int(x) for x in ids if isinstance(x, (int, float))]
|
||||
|
||||
if not candidate_file_ids:
|
||||
return []
|
||||
|
||||
needle = (url_value or "").strip().lower()
|
||||
chunk_size = 200
|
||||
out: list[dict[str, Any]] = []
|
||||
|
||||
for start in range(0, len(candidate_file_ids), chunk_size):
|
||||
if len(out) >= fetch_limit:
|
||||
break
|
||||
chunk = candidate_file_ids[start : start + chunk_size]
|
||||
try:
|
||||
payload = client.fetch_file_metadata(
|
||||
file_ids=chunk,
|
||||
include_file_url=True,
|
||||
include_service_keys_to_tags=True,
|
||||
include_duration=True,
|
||||
include_size=True,
|
||||
include_mime=True,
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
|
||||
if not isinstance(metas, list):
|
||||
continue
|
||||
|
||||
for meta in metas:
|
||||
if not isinstance(meta, dict):
|
||||
continue
|
||||
urls = _extract_urls(meta)
|
||||
if not urls:
|
||||
continue
|
||||
if want_any:
|
||||
out.append(meta)
|
||||
if len(out) >= fetch_limit:
|
||||
break
|
||||
continue
|
||||
|
||||
if not needle:
|
||||
continue
|
||||
if any(needle in u.lower() for u in urls):
|
||||
out.append(meta)
|
||||
if len(out) >= fetch_limit:
|
||||
break
|
||||
|
||||
return out
|
||||
|
||||
query_lower = query.lower().strip()
|
||||
|
||||
# Special case: url:* and url:<value>
|
||||
metadata_list: list[dict[str, Any]] | None = None
|
||||
if ":" in query_lower and not query_lower.startswith(":"):
|
||||
namespace, pattern = query_lower.split(":", 1)
|
||||
namespace = namespace.strip().lower()
|
||||
pattern = pattern.strip()
|
||||
if namespace == "url":
|
||||
if not pattern or pattern == "*":
|
||||
metadata_list = _iter_url_filtered_metadata(None, want_any=True, fetch_limit=int(limit) if limit else 100)
|
||||
else:
|
||||
# Fast-path: exact URL via /add_url/get_url_files when a full URL is provided.
|
||||
try:
|
||||
if pattern.startswith("http://") or pattern.startswith("https://"):
|
||||
from API.HydrusNetwork import HydrusRequestSpec
|
||||
|
||||
spec = HydrusRequestSpec(method="GET", endpoint="/add_url/get_url_files", query={"url": pattern})
|
||||
response = client._perform_request(spec) # type: ignore[attr-defined]
|
||||
hashes: list[str] = []
|
||||
file_ids: list[int] = []
|
||||
if isinstance(response, dict):
|
||||
raw_hashes = response.get("hashes") or response.get("file_hashes")
|
||||
if isinstance(raw_hashes, list):
|
||||
hashes = [str(h).strip() for h in raw_hashes if isinstance(h, str) and str(h).strip()]
|
||||
raw_ids = response.get("file_ids")
|
||||
if isinstance(raw_ids, list):
|
||||
for item in raw_ids:
|
||||
try:
|
||||
file_ids.append(int(item))
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
|
||||
if file_ids:
|
||||
payload = client.fetch_file_metadata(
|
||||
file_ids=file_ids,
|
||||
include_file_url=True,
|
||||
include_service_keys_to_tags=True,
|
||||
include_duration=True,
|
||||
include_size=True,
|
||||
include_mime=True,
|
||||
)
|
||||
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
|
||||
if isinstance(metas, list):
|
||||
metadata_list = [m for m in metas if isinstance(m, dict)]
|
||||
elif hashes:
|
||||
payload = client.fetch_file_metadata(
|
||||
hashes=hashes,
|
||||
include_file_url=True,
|
||||
include_service_keys_to_tags=True,
|
||||
include_duration=True,
|
||||
include_size=True,
|
||||
include_mime=True,
|
||||
)
|
||||
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
|
||||
if isinstance(metas, list):
|
||||
metadata_list = [m for m in metas if isinstance(m, dict)]
|
||||
except Exception:
|
||||
metadata_list = None
|
||||
|
||||
# Fallback: substring scan
|
||||
if metadata_list is None:
|
||||
metadata_list = _iter_url_filtered_metadata(pattern, want_any=False, fetch_limit=int(limit) if limit else 100)
|
||||
|
||||
# Parse the query into tags
|
||||
# Handle both simple tags and complex queries
|
||||
# "*" means "match all" - use system:everything tag in Hydrus
|
||||
@@ -271,7 +435,6 @@ class HydrusNetwork(Store):
|
||||
# Use system:everything to match all files in Hydrus
|
||||
tags = ["system:everything"]
|
||||
else:
|
||||
query_lower = query.lower().strip()
|
||||
# If query doesn't have a namespace (no ':'), search all files and filter by title/tags
|
||||
# If query has explicit namespace, use it as a tag search
|
||||
if ':' not in query_lower:
|
||||
@@ -286,30 +449,36 @@ class HydrusNetwork(Store):
|
||||
debug(f"Found 0 result(s)")
|
||||
return []
|
||||
|
||||
# Search files with the tags
|
||||
search_result = client.search_files(
|
||||
tags=tags,
|
||||
return_hashes=True,
|
||||
return_file_ids=True
|
||||
)
|
||||
|
||||
# Extract file IDs from search result
|
||||
file_ids = search_result.get("file_ids", [])
|
||||
hashes = search_result.get("hashes", [])
|
||||
|
||||
if not file_ids and not hashes:
|
||||
debug(f"Found 0 result(s)")
|
||||
return []
|
||||
|
||||
# Fetch metadata for the found files
|
||||
# Search files with the tags (unless url: search already produced metadata)
|
||||
results = []
|
||||
query_lower = query.lower().strip()
|
||||
# Split by comma or space for AND logic
|
||||
search_terms = set(query_lower.replace(',', ' ').split()) # For substring matching
|
||||
|
||||
if file_ids:
|
||||
metadata = client.fetch_file_metadata(file_ids=file_ids)
|
||||
metadata_list = metadata.get("metadata", [])
|
||||
|
||||
if metadata_list is None:
|
||||
search_result = client.search_files(
|
||||
tags=tags,
|
||||
return_hashes=True,
|
||||
return_file_ids=True
|
||||
)
|
||||
|
||||
file_ids = search_result.get("file_ids", []) if isinstance(search_result, dict) else []
|
||||
hashes = search_result.get("hashes", []) if isinstance(search_result, dict) else []
|
||||
|
||||
if not file_ids and not hashes:
|
||||
debug(f"Found 0 result(s)")
|
||||
return []
|
||||
|
||||
if file_ids:
|
||||
metadata = client.fetch_file_metadata(file_ids=file_ids)
|
||||
metadata_list = metadata.get("metadata", [])
|
||||
elif hashes:
|
||||
metadata = client.fetch_file_metadata(hashes=hashes)
|
||||
metadata_list = metadata.get("metadata", [])
|
||||
else:
|
||||
metadata_list = []
|
||||
|
||||
if not isinstance(metadata_list, list):
|
||||
metadata_list = []
|
||||
|
||||
for meta in metadata_list:
|
||||
if len(results) >= limit:
|
||||
|
||||
@@ -119,6 +119,37 @@ class Store:
self._backend_errors: Dict[str, str] = {}
self._load_backends()

def _maybe_register_temp_alias(self, store_type: str, backend_name: str, kwargs: Dict[str, Any], backend: BaseStore) -> None:
"""If a folder backend points at config['temp'], also expose it as the 'temp' backend.

This keeps config compatibility (e.g. existing 'default') while presenting the temp
directory under a clearer name.
"""
try:
if _normalize_store_type(store_type) != "folder":
return
temp_value = self._config.get("temp")
if not temp_value:
return
path_value = kwargs.get("PATH") or kwargs.get("path")
if not path_value:
return

temp_path = Path(str(temp_value)).expanduser().resolve()
backend_path = Path(str(path_value)).expanduser().resolve()
if backend_path != temp_path:
return

# If the user already has a dedicated temp backend, do nothing.
if "temp" in self._backends:
return

# Keep original name working, but add an alias.
if backend_name != "temp":
self._backends["temp"] = backend
except Exception:
return
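Sketch of the alias decision above: two configured paths only count as the same location after expanduser() and resolve(), so a tilde spelling and an absolute spelling compare equal. The paths in the example comment are hypothetical.

from pathlib import Path

def points_at_temp(backend_path: str, temp_setting: str) -> bool:
    # Normalise both sides before comparing so "~/x" and "/home/user/x" match.
    try:
        return (
            Path(backend_path).expanduser().resolve()
            == Path(temp_setting).expanduser().resolve()
        )
    except OSError:
        return False

# e.g. points_at_temp("~/medios/tmp", "/home/user/medios/tmp") -> True for that user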
def _load_backends(self) -> None:
|
||||
store_cfg = self._config.get("store")
|
||||
if not isinstance(store_cfg, dict):
|
||||
@@ -161,6 +192,9 @@ class Store:
|
||||
|
||||
backend_name = str(kwargs.get("NAME") or instance_name)
|
||||
self._backends[backend_name] = backend
|
||||
|
||||
# If this is the configured temp directory, also alias it as 'temp'.
|
||||
self._maybe_register_temp_alias(store_type, backend_name, kwargs, backend)
|
||||
except Exception as exc:
|
||||
err_text = str(exc)
|
||||
self._backend_errors[str(instance_name)] = err_text
|
||||
@@ -177,11 +211,24 @@ class Store:
return sorted(self._backends.keys())

def list_searchable_backends(self) -> list[str]:
searchable: list[str] = []
# De-duplicate backends by instance (aliases can point at the same object).
def _rank(name: str) -> int:
n = str(name or "").strip().lower()
if n == "temp":
return 0
if n == "default":
return 2
return 1

chosen: Dict[int, str] = {}
for name, backend in self._backends.items():
if type(backend).search is not BaseStore.search:
searchable.append(name)
return sorted(searchable)
if type(backend).search is BaseStore.search:
continue
key = id(backend)
prev = chosen.get(key)
if prev is None or _rank(name) < _rank(prev):
chosen[key] = name
return sorted(chosen.values())
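Sketch of the de-duplication introduced here: several names can point at the same backend object (aliases), so entries are grouped by id() and the best-ranked name per instance wins.

from typing import Dict, List

def dedupe_by_identity(named: Dict[str, object], rank) -> List[str]:
    # Keep one name per underlying object, preferring the lowest rank value.
    chosen: Dict[int, str] = {}
    for name, obj in named.items():
        key = id(obj)
        prev = chosen.get(key)
        if prev is None or rank(name) < rank(prev):
            chosen[key] = name
    return sorted(chosen.values())

backend = object()
names = {"default": backend, "temp": backend}  # two names, one instance
assert dedupe_by_identity(names, lambda n: 0 if n == "temp" else 2) == ["temp"]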
def __getitem__(self, backend_name: str) -> BaseStore:
|
||||
if backend_name not in self._backends:
|
||||
|
||||
@@ -5,10 +5,9 @@ from __future__ import annotations

import json
import sys
import inspect
from collections.abc import Iterable as IterableABC

from SYS.logger import log, debug
from SYS.logger import log
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set
from dataclasses import dataclass, field
@@ -690,7 +689,9 @@ def get_field(obj: Any, field: str, default: Optional[Any] = None) -> Any:
get_field(result, "table", "unknown") # With default
"""
# Handle lists by accessing the first element
if isinstance(obj, list) and obj:
if isinstance(obj, list):
if not obj:
return default
obj = obj[0]

if isinstance(obj, dict):
@@ -702,8 +703,9 @@ def get_field(obj: Any, field: str, default: Optional[Any] = None) -> Any:
return value

# For PipeObjects, also check the extra field
if hasattr(obj, 'extra') and isinstance(obj.extra, dict):
return obj.extra.get(field, default)
extra_val = getattr(obj, 'extra', None)
if isinstance(extra_val, dict):
return extra_val.get(field, default)

return default
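Usage sketch for the get_field behaviour above; this is a simplified stand-in, not the full helper. Lists defer to their first element, dicts are looked up directly, and objects fall back to an extra dict when the attribute is missing.

from typing import Any, Optional

def get_field(obj: Any, field: str, default: Optional[Any] = None) -> Any:
    # Lists: use the first element; empty lists yield the default.
    if isinstance(obj, list):
        if not obj:
            return default
        obj = obj[0]
    if isinstance(obj, dict):
        return obj.get(field, default)
    value = getattr(obj, field, None)
    if value is not None:
        return value
    extra_val = getattr(obj, "extra", None)
    if isinstance(extra_val, dict):
        return extra_val.get(field, default)
    return default

assert get_field([{"table": "openlibrary"}], "table", "unknown") == "openlibrary"
assert get_field([], "table", "unknown") == "unknown"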
@@ -1118,7 +1120,7 @@ def create_pipe_object_result(
|
||||
Returns:
|
||||
Dict with all PipeObject fields for emission
|
||||
"""
|
||||
result = {
|
||||
result: Dict[str, Any] = {
|
||||
'source': source,
|
||||
'id': identifier,
|
||||
'path': file_path,
|
||||
@@ -1546,14 +1548,11 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
|
||||
extra = {k: v for k, v in value.items() if k not in known_keys}
|
||||
|
||||
# Extract URL: prefer direct url field, then url list
|
||||
url_val = value.get("url")
|
||||
if not url_val:
|
||||
url = value.get("url") or value.get("url") or []
|
||||
if url and isinstance(url, list) and len(url) > 0:
|
||||
url_val = url[0]
|
||||
# Preserve url in extra if multiple url exist
|
||||
if url and len(url) > 1:
|
||||
extra["url"] = url
|
||||
from metadata import normalize_urls
|
||||
url_list = normalize_urls(value.get("url"))
|
||||
url_val = url_list[0] if url_list else None
|
||||
if len(url_list) > 1:
|
||||
extra["url"] = url_list
|
||||
|
||||
# Extract relationships
|
||||
rels = value.get("relationships") or {}
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Optional, Sequence, Tuple, List, Union
|
||||
from typing import Any, Dict, Optional, Sequence, Tuple, List
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import shutil
|
||||
import tempfile
|
||||
|
||||
import models
|
||||
import pipeline as ctx
|
||||
from API import HydrusNetwork as hydrus_wrapper
|
||||
from SYS.logger import log, debug
|
||||
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
|
||||
from Store import Store
|
||||
from ._shared import (
|
||||
Cmdlet, CmdletArg, parse_cmdlet_args, SharedArgs,
|
||||
@@ -20,8 +22,8 @@ from API.folder import read_sidecar, find_sidecar, write_sidecar, API_folder_sto
|
||||
from SYS.utils import sha256_file, unique_path
|
||||
from metadata import write_metadata
|
||||
|
||||
# Use official Hydrus supported filetypes from hydrus_wrapper
|
||||
SUPPORTED_MEDIA_EXTENSIONS = hydrus_wrapper.ALL_SUPPORTED_EXTENSIONS
|
||||
# Canonical supported filetypes for all stores/cmdlets
|
||||
SUPPORTED_MEDIA_EXTENSIONS = ALL_SUPPORTED_EXTENSIONS
|
||||
|
||||
class Add_File(Cmdlet):
|
||||
"""Add file into the DB"""
|
||||
@@ -53,93 +55,210 @@ class Add_File(Cmdlet):
|
||||
|
||||
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
"""Main execution entry point."""
|
||||
# Parse arguments
|
||||
parsed = parse_cmdlet_args(args, self)
|
||||
|
||||
# Initialize state
|
||||
path_arg = parsed.get("path")
|
||||
location = parsed.get("store") # Fixed: was "storage", should be "store"
|
||||
location = parsed.get("store")
|
||||
provider_name = parsed.get("provider")
|
||||
delete_after = parsed.get("delete", False)
|
||||
|
||||
# Coerce result to PipeObject; if result is a list, prefer the first element
|
||||
effective_result = result
|
||||
if isinstance(result, list) and result:
|
||||
first_item = result[0]
|
||||
# Prefer first item if it's a dict or PipeObject
|
||||
if isinstance(first_item, (dict, )):
|
||||
effective_result = first_item
|
||||
pipe_obj = coerce_to_pipe_object(effective_result, path_arg)
|
||||
stage_ctx = ctx.get_stage_context()
|
||||
is_last_stage = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
|
||||
|
||||
# Decide which items to process.
|
||||
# - If user provided -path, treat this invocation as single-item.
|
||||
# - Otherwise, if piped input is a list, ingest each item.
|
||||
if path_arg:
|
||||
items_to_process: List[Any] = [result]
|
||||
elif isinstance(result, list) and result:
|
||||
items_to_process = list(result)
|
||||
else:
|
||||
items_to_process = [result]
|
||||
|
||||
# Debug: Log input result details
|
||||
debug(f"[add-file] INPUT result type={type(result).__name__}")
|
||||
if isinstance(result, list):
|
||||
debug(f"[add-file] INPUT result is list with {len(result)} items")
|
||||
if result and isinstance(result[0], dict):
|
||||
first = result[0]
|
||||
hash_val = first.get('hash')
|
||||
hash_str = hash_val[:12] + "..." if hash_val else "N/A"
|
||||
debug(f"[add-file] First item details: title={first.get('title')}, hash={hash_str}, store={first.get('store', 'N/A')}")
|
||||
elif isinstance(result, dict):
|
||||
hash_val = result.get('hash')
|
||||
hash_str = hash_val[:12] + "..." if hash_val else "N/A"
|
||||
debug(f"[add-file] INPUT result is dict: title={result.get('title')}, hash={hash_str}, store={result.get('store', 'N/A')}")
|
||||
|
||||
# Debug: Log parsed arguments
|
||||
debug(f"[add-file] PARSED args: location={location}, provider={provider_name}, delete={delete_after}")
|
||||
|
||||
# Resolve source - returns (media_path_or_url, file_hash)
|
||||
media_path_or_url, file_hash = self._resolve_source(result, path_arg, pipe_obj, config)
|
||||
debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...")
|
||||
if not media_path_or_url:
|
||||
debug(f"[add-file] ERROR: Could not resolve source file/URL")
|
||||
return 1
|
||||
|
||||
# Update pipe_obj with resolved path
|
||||
pipe_obj.path = str(media_path_or_url) if isinstance(media_path_or_url, (str, Path)) else str(media_path_or_url)
|
||||
|
||||
# Check if it's a URL before validating as file
|
||||
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
|
||||
debug(f"Detected URL target, delegating to download-data: {media_path_or_url}")
|
||||
return self._delegate_to_download_data(result, media_path_or_url, location, provider_name, args, config)
|
||||
collected_payloads: List[Dict[str, Any]] = []
|
||||
successes = 0
|
||||
failures = 0
|
||||
|
||||
# Convert to Path and validate
|
||||
media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url
|
||||
|
||||
# Validate source
|
||||
if not self._validate_source(media_path):
|
||||
debug(f"[add-file] ERROR: Source validation failed for {media_path}")
|
||||
return 1
|
||||
# Only run the search-store refresh when add-file is the last stage.
|
||||
# In the middle of a pipeline, downstream cmdlets should receive the emitted
|
||||
# storage payload directly (no need to re-search and risk duplicate emits).
|
||||
auto_search_store_after_add = bool(is_last_stage) and len(items_to_process) == 1
|
||||
|
||||
# Debug: Log execution path decision
|
||||
debug(f"[add-file] DECISION POINT: provider={provider_name}, location={location}")
|
||||
debug(f" media_path={media_path}, exists={media_path.exists()}")
|
||||
for item in items_to_process:
|
||||
pipe_obj = coerce_to_pipe_object(item, path_arg)
|
||||
|
||||
# Execute transfer based on destination (using Store registry)
|
||||
if provider_name:
|
||||
debug(f"[add-file] ROUTE: file provider upload")
|
||||
return self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after)
|
||||
elif location:
|
||||
# Check if location is a registered backend name
|
||||
temp_dir_to_cleanup: Optional[Path] = None
|
||||
delete_after_item = delete_after
|
||||
try:
|
||||
store = Store(config)
|
||||
backends = store.list_backends()
|
||||
|
||||
if location in backends:
|
||||
debug(f"[add-file] ROUTE: storage backend '{location}'")
|
||||
return self._handle_storage_backend(media_path, location, pipe_obj, config, delete_after)
|
||||
else:
|
||||
# Treat as local export path
|
||||
debug(f"[add-file] ROUTE: local export to path '{location}'")
|
||||
return self._handle_local_export(media_path, location, pipe_obj, config, delete_after)
|
||||
except Exception as exc:
|
||||
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
|
||||
log(f"Invalid location: {location}", file=sys.stderr)
|
||||
return 1
|
||||
else:
|
||||
debug(f"[add-file] ERROR: No location or provider specified")
|
||||
log(f"No storage location or provider specified", file=sys.stderr)
|
||||
return 1
|
||||
media_path_or_url, file_hash = self._resolve_source(item, path_arg, pipe_obj, config)
|
||||
debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...")
|
||||
if not media_path_or_url:
|
||||
failures += 1
|
||||
continue
|
||||
|
||||
# Update pipe_obj with resolved path
|
||||
pipe_obj.path = str(media_path_or_url)
|
||||
|
||||
# URL targets: prefer provider-aware download for OpenLibrary selections.
|
||||
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
|
||||
("http://", "https://", "magnet:", "torrent:")
|
||||
):
|
||||
table = None
|
||||
full_metadata = None
|
||||
if isinstance(pipe_obj.extra, dict):
|
||||
table = pipe_obj.extra.get("table")
|
||||
full_metadata = pipe_obj.extra.get("full_metadata")
|
||||
|
||||
is_openlibrary = (str(table or "").lower() == "openlibrary") or ("openlibrary.org/books/" in media_path_or_url.lower())
|
||||
if is_openlibrary:
|
||||
# Enrich tags from OpenLibrary metadata so the stored file has book tags (author/pages/etc).
|
||||
try:
|
||||
from Provider.openlibrary import OpenLibrary as _OpenLibrary
|
||||
|
||||
olid = None
|
||||
archive_id = None
|
||||
if isinstance(full_metadata, dict):
|
||||
olid = full_metadata.get("openlibrary_id") or full_metadata.get("openlibrary")
|
||||
archive_id = full_metadata.get("archive_id")
|
||||
|
||||
if not olid:
|
||||
import re
|
||||
m = re.search(r"/books/(OL\d+M)", str(media_path_or_url), flags=re.IGNORECASE)
|
||||
if m:
|
||||
olid = m.group(1)
|
||||
|
||||
scraped_tags: List[str] = []
|
||||
if olid:
|
||||
scraped_tags.extend(_OpenLibrary.scrape_openlibrary_metadata(str(olid)) or [])
|
||||
if archive_id:
|
||||
scraped_tags.append(f"internet_archive:{archive_id}")
|
||||
|
||||
if scraped_tags:
|
||||
existing = list(pipe_obj.tag or [])
|
||||
pipe_obj.tag = merge_sequences(existing, scraped_tags, case_sensitive=False)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
from ProviderCore.registry import get_search_provider
|
||||
from ProviderCore.base import SearchResult
|
||||
|
||||
provider = get_search_provider("openlibrary", config)
|
||||
if provider is None:
|
||||
log("[add-file] OpenLibrary provider not available", file=sys.stderr)
|
||||
failures += 1
|
||||
continue
|
||||
|
||||
temp_dir_to_cleanup = Path(tempfile.mkdtemp(prefix="medios_openlibrary_"))
|
||||
sr = SearchResult(
|
||||
table="openlibrary",
|
||||
title=str(getattr(pipe_obj, "title", None) or "Unknown"),
|
||||
path=str(media_path_or_url),
|
||||
full_metadata=full_metadata if isinstance(full_metadata, dict) else {},
|
||||
)
|
||||
downloaded = provider.download(sr, temp_dir_to_cleanup)
|
||||
if downloaded is None:
|
||||
log("[add-file] OpenLibrary download failed", file=sys.stderr)
|
||||
failures += 1
|
||||
continue
|
||||
|
||||
downloaded_path = Path(downloaded)
|
||||
if downloaded_path.exists() and downloaded_path.is_dir():
|
||||
log(
|
||||
"[add-file] OpenLibrary download produced a directory (missing img2pdf?). Cannot ingest.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
failures += 1
|
||||
continue
|
||||
|
||||
media_path_or_url = str(downloaded_path)
|
||||
pipe_obj.path = str(downloaded_path)
|
||||
delete_after_item = True
|
||||
|
||||
# For non-provider URLs, or if still a URL after provider attempt, delegate to download-media.
|
||||
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
|
||||
("http://", "https://", "magnet:", "torrent:")
|
||||
):
|
||||
code = self._delegate_to_download_data(item, media_path_or_url, location, provider_name, args, config)
|
||||
if code == 0:
|
||||
successes += 1
|
||||
else:
|
||||
failures += 1
|
||||
continue
|
||||
|
||||
media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url
|
||||
|
||||
if not self._validate_source(media_path):
|
||||
failures += 1
|
||||
continue
|
||||
|
||||
if provider_name:
|
||||
code = self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after_item)
|
||||
if code == 0:
|
||||
successes += 1
|
||||
else:
|
||||
failures += 1
|
||||
continue
|
||||
|
||||
if location:
|
||||
try:
|
||||
store = Store(config)
|
||||
backends = store.list_backends()
|
||||
if location in backends:
|
||||
code = self._handle_storage_backend(
|
||||
item,
|
||||
media_path,
|
||||
location,
|
||||
pipe_obj,
|
||||
config,
|
||||
delete_after_item,
|
||||
collect_payloads=collected_payloads,
|
||||
suppress_last_stage_overlay=is_last_stage and len(items_to_process) > 1,
|
||||
auto_search_store=auto_search_store_after_add,
|
||||
)
|
||||
else:
|
||||
code = self._handle_local_export(media_path, location, pipe_obj, config, delete_after_item)
|
||||
except Exception as exc:
|
||||
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
|
||||
log(f"Invalid location: {location}", file=sys.stderr)
|
||||
failures += 1
|
||||
continue
|
||||
|
||||
if code == 0:
|
||||
successes += 1
|
||||
else:
|
||||
failures += 1
|
||||
continue
|
||||
|
||||
log("No destination specified", file=sys.stderr)
|
||||
failures += 1
|
||||
finally:
|
||||
if temp_dir_to_cleanup is not None:
|
||||
try:
|
||||
shutil.rmtree(temp_dir_to_cleanup, ignore_errors=True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# If we processed multiple storage ingests, present a single consolidated overlay table.
|
||||
if is_last_stage and len(items_to_process) > 1 and collected_payloads:
|
||||
try:
|
||||
from result_table import ResultTable
|
||||
|
||||
table = ResultTable("Result")
|
||||
for payload in collected_payloads:
|
||||
table.add_result(payload)
|
||||
# Make this the active selectable table so @.. returns here (and playlist table is kept in history).
|
||||
ctx.set_last_result_table(table, collected_payloads, subject=collected_payloads)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if successes > 0:
|
||||
return 0
|
||||
return 1
|
||||
|
||||
@staticmethod
|
||||
def _resolve_source(
|
||||
@@ -149,10 +268,7 @@ class Add_File(Cmdlet):
|
||||
config: Dict[str, Any],
|
||||
) -> Tuple[Optional[Path | str], Optional[str]]:
|
||||
"""Resolve the source file path from args or pipeline result.
|
||||
|
||||
PRIORITY: hash+store pattern is preferred over path-based resolution.
|
||||
This ensures consistency when @N selections pass hash+store identifiers.
|
||||
|
||||
|
||||
Returns (media_path_or_url, file_hash)
|
||||
where media_path_or_url can be a Path object or a URL string.
|
||||
"""
|
||||
@@ -161,8 +277,9 @@ class Add_File(Cmdlet):
|
||||
result_hash = result.get("hash")
|
||||
result_store = result.get("store")
|
||||
if result_hash and result_store:
|
||||
debug(f"[add-file] Using hash+store from result: hash={result_hash[:12]}..., store={result_store}")
|
||||
# Use get_file to retrieve from the specific store
|
||||
debug(
|
||||
f"[add-file] Using hash+store from result: hash={str(result_hash)[:12]}..., store={result_store}"
|
||||
)
|
||||
try:
|
||||
store = Store(config)
|
||||
if result_store in store.list_backends():
|
||||
@@ -170,16 +287,15 @@ class Add_File(Cmdlet):
|
||||
media_path = backend.get_file(result_hash)
|
||||
if isinstance(media_path, Path) and media_path.exists():
|
||||
pipe_obj.path = str(media_path)
|
||||
debug(f"[add-file] Retrieved file from {result_store}: {media_path}")
|
||||
return media_path, result_hash
|
||||
|
||||
if isinstance(media_path, str) and media_path.lower().startswith(("http://", "https://")):
|
||||
return media_path, str(result_hash)
|
||||
if isinstance(media_path, str) and media_path.lower().startswith(
|
||||
("http://", "https://", "magnet:", "torrent:")
|
||||
):
|
||||
pipe_obj.path = media_path
|
||||
debug(f"[add-file] Retrieved URL from {result_store}: {media_path}")
|
||||
return media_path, result_hash
|
||||
return media_path, str(result_hash)
|
||||
except Exception as exc:
|
||||
debug(f"[add-file] Failed to retrieve via hash+store: {exc}")
|
||||
|
||||
|
||||
# PRIORITY 2: Try explicit path argument
|
||||
if path_arg:
|
||||
media_path = Path(path_arg)
|
||||
@@ -196,10 +312,9 @@ class Add_File(Cmdlet):
|
||||
file_hash = pipe_path_str.split(":", 1)[1]
|
||||
media_path, success = Add_File._fetch_hydrus_path(file_hash, config)
|
||||
return media_path, file_hash if success else None
|
||||
# Check if pipe_path is a URL - skip to URL handling below
|
||||
if not pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
|
||||
media_path = Path(pipe_path_str)
|
||||
return media_path, None
|
||||
if pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
|
||||
return pipe_path_str, None
|
||||
return Path(pipe_path_str), None
|
||||
|
||||
# PRIORITY 4: Try from pipe_obj.url (for streaming url without downloaded file)
|
||||
pipe_url = getattr(pipe_obj, "url", None)
|
||||
@@ -248,8 +363,9 @@ class Add_File(Cmdlet):
|
||||
# Look for path or path-like keys
|
||||
path_candidate = first_item.get("path") or first_item.get("filepath") or first_item.get("file")
|
||||
# If the dict includes a 'paths' list (multi-part/section download), prefer the first file
|
||||
if not path_candidate and isinstance(first_item.get("paths"), (list, tuple)) and first_item.get("paths"):
|
||||
path_candidate = first_item.get("paths")[0]
|
||||
paths_val = first_item.get("paths")
|
||||
if not path_candidate and isinstance(paths_val, (list, tuple)) and paths_val:
|
||||
path_candidate = paths_val[0]
|
||||
if path_candidate:
|
||||
debug(f"Resolved path from result dict: {path_candidate}")
|
||||
try:
|
||||
@@ -361,10 +477,12 @@ class Add_File(Cmdlet):
|
||||
selection_args = result["_selection_args"]
|
||||
if selection_args:
|
||||
dl_args.extend(selection_args)
|
||||
elif hasattr(result, 'extra') and isinstance(result.extra, dict) and "_selection_args" in result.extra:
|
||||
selection_args = result.extra["_selection_args"]
|
||||
if selection_args:
|
||||
dl_args.extend(selection_args)
|
||||
else:
|
||||
extra_val = getattr(result, "extra", None)
|
||||
if isinstance(extra_val, dict) and "_selection_args" in extra_val:
|
||||
selection_args = extra_val["_selection_args"]
|
||||
if selection_args:
|
||||
dl_args.extend(selection_args)
|
||||
|
||||
# download-media doesn't support -storage flag
|
||||
# It downloads to the configured directory, then add-file will handle storage
|
||||
@@ -375,18 +493,32 @@ class Add_File(Cmdlet):
|
||||
|
||||
@staticmethod
|
||||
def _get_url(result: Any, pipe_obj: models.PipeObject) -> List[str]:
|
||||
url: List[str] = []
|
||||
try:
|
||||
if isinstance(pipe_obj.extra, dict):
|
||||
url = list(pipe_obj.extra.get("url") or pipe_obj.extra.get("url") or [])
|
||||
except Exception:
|
||||
pass
|
||||
from metadata import normalize_urls
|
||||
|
||||
if not url and isinstance(result, dict):
|
||||
url = list(result.get("url") or result.get("url") or [])
|
||||
if not url:
|
||||
url = list(extract_url_from_result(result) or [])
|
||||
return url
|
||||
# Prefer explicit PipeObject.url if present
|
||||
urls: List[str] = []
|
||||
try:
|
||||
urls = normalize_urls(getattr(pipe_obj, "url", None))
|
||||
except Exception:
|
||||
urls = []
|
||||
|
||||
# Then check extra.url
|
||||
if not urls:
|
||||
try:
|
||||
if isinstance(pipe_obj.extra, dict):
|
||||
urls = normalize_urls(pipe_obj.extra.get("url"))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Then check result dict
|
||||
if not urls and isinstance(result, dict):
|
||||
urls = normalize_urls(result.get("url"))
|
||||
|
||||
# Finally, try extractor helper
|
||||
if not urls:
|
||||
urls = normalize_urls(extract_url_from_result(result))
|
||||
|
||||
return urls
|
||||
|
||||
@staticmethod
def _get_relationships(result: Any, pipe_obj: models.PipeObject) -> Optional[Dict[str, Any]]:
@@ -405,10 +537,36 @@ class Add_File(Cmdlet):

@staticmethod
def _get_duration(result: Any, pipe_obj: models.PipeObject) -> Optional[float]:
if getattr(pipe_obj, "duration", None) is not None:
return pipe_obj.duration
def _parse_duration(value: Any) -> Optional[float]:
if value is None:
return None
if isinstance(value, (int, float)):
return float(value) if value > 0 else None
if isinstance(value, str):
s = value.strip()
if not s:
return None
try:
candidate = float(s)
return candidate if candidate > 0 else None
except ValueError:
pass
if ":" in s:
parts = [p.strip() for p in s.split(":") if p.strip()]
if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
nums = [int(p) for p in parts]
if len(nums) == 2:
minutes, seconds = nums
return float(minutes * 60 + seconds)
hours, minutes, seconds = nums
return float(hours * 3600 + minutes * 60 + seconds)
return None

parsed = _parse_duration(getattr(pipe_obj, "duration", None))
if parsed is not None:
return parsed
try:
return extract_duration(result)
return _parse_duration(extract_duration(result))
except Exception:
return None
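The duration coercion above, restated standalone with worked examples: numbers pass through, numeric strings are parsed, and MM:SS / HH:MM:SS strings are converted to seconds.

def parse_duration(value):
    # Same contract as the nested helper in _get_duration.
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return float(value) if value > 0 else None
    if isinstance(value, str):
        s = value.strip()
        if not s:
            return None
        try:
            candidate = float(s)
            return candidate if candidate > 0 else None
        except ValueError:
            pass
        if ":" in s:
            parts = [p.strip() for p in s.split(":") if p.strip()]
            if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
                nums = [int(p) for p in parts]
                if len(nums) == 2:
                    return float(nums[0] * 60 + nums[1])
                return float(nums[0] * 3600 + nums[1] * 60 + nums[2])
    return None

assert parse_duration("3:25") == 205.0
assert parse_duration("01:02:03") == 3723.0
assert parse_duration("187.5") == 187.5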
@@ -442,19 +600,20 @@ class Add_File(Cmdlet):
ctx.set_current_stage_table(None)

@staticmethod
def _emit_storage_result(payload: Dict[str, Any]) -> None:
def _emit_storage_result(payload: Dict[str, Any], *, overlay: bool = True, emit: bool = True) -> None:
"""Emit a storage-style result payload.

- Always emits the dict downstream (when in a pipeline).
- If this is the last stage (or not in a pipeline), prints a search-store-like table
and sets an overlay table/items for @N selection.
"""
# Always emit for downstream commands (no-op if not in a pipeline)
ctx.emit(payload)
# Emit for downstream commands (no-op if not in a pipeline)
if emit:
ctx.emit(payload)

stage_ctx = ctx.get_stage_context()
is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
if not is_last:
if not is_last or not overlay:
return

try:
@@ -470,6 +629,53 @@ class Add_File(Cmdlet):
except Exception:
pass

@staticmethod
def _try_emit_search_store_by_hash(*, store: str, hash_value: str, config: Dict[str, Any]) -> bool:
"""Run search-store for a single hash so the final table/payload is consistent.

Important: `add-file` is treated as an action command by the CLI, so the CLI only
prints tables for it when a display overlay exists. After running search-store,
this copies the resulting table into the display overlay (when this is the last
stage) so the canonical store table is what the user sees and can select from.

Returns True if search-store ran successfully, else False.
"""
try:
from cmdlet.search_store import CMDLET as search_store_cmdlet

args = ["-store", str(store), f"hash:{str(hash_value)}"]
log(f"[add-file] Refresh: search-store -store {store} \"hash:{hash_value}\"", file=sys.stderr)

# Run search-store under a temporary stage context so its ctx.emit() calls
# don't interfere with the outer add-file pipeline stage.
prev_ctx = ctx.get_stage_context()
temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, worker_id=getattr(prev_ctx, "worker_id", None))
ctx.set_stage_context(temp_ctx)
try:
code = search_store_cmdlet.run(None, args, config)
finally:
ctx.set_stage_context(prev_ctx)
if code != 0:
return False

# Promote the search-store result to a display overlay so the CLI prints it
# for action commands like add-file.
stage_ctx = ctx.get_stage_context()
is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
if is_last:
try:
table = ctx.get_last_result_table()
items = ctx.get_last_result_items()
if table is not None and items:
ctx.set_last_result_table_overlay(table, items, subject={"store": store, "hash": hash_value})
except Exception:
pass

return True
except Exception as exc:
debug(f"[add-file] Failed to run search-store after add-file: {type(exc).__name__}: {exc}")
return False
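Design note, as a sketch: the prev/temp/finally dance above is the usual pattern for temporarily swapping module-level state; wrapping it in a contextmanager makes the restore guarantee explicit. get_ctx and set_ctx below are hypothetical stand-ins for the pipeline hooks.

from contextlib import contextmanager

_current_ctx = None

def get_ctx():
    return _current_ctx

def set_ctx(value) -> None:
    global _current_ctx
    _current_ctx = value

@contextmanager
def temporary_ctx(value):
    prev = get_ctx()
    set_ctx(value)
    try:
        yield value
    finally:
        set_ctx(prev)  # always restore the outer stage context

# with temporary_ctx(object()):
#     ...  # run the nested cmdlet here without leaking its emits into the outer stage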
@staticmethod
|
||||
def _prepare_metadata(
|
||||
result: Any,
|
||||
@@ -664,8 +870,9 @@ class Add_File(Cmdlet):
|
||||
|
||||
if not username or not filename:
|
||||
debug(f"[add-file] ERROR: Could not extract soulseek metadata from result (type={type(result).__name__})")
|
||||
if hasattr(result, "extra"):
|
||||
debug(f"[add-file] Result extra keys: {list(result.extra.keys())}")
|
||||
extra_val = getattr(result, "extra", None)
|
||||
if isinstance(extra_val, dict):
|
||||
debug(f"[add-file] Result extra keys: {list(extra_val.keys())}")
|
||||
return None
|
||||
|
||||
if not username or not filename:
|
||||
@@ -769,28 +976,55 @@ class Add_File(Cmdlet):
|
||||
|
||||
@staticmethod
|
||||
def _handle_storage_backend(
|
||||
result: Any,
|
||||
media_path: Path,
|
||||
backend_name: str,
|
||||
pipe_obj: models.PipeObject,
|
||||
config: Dict[str, Any],
|
||||
delete_after: bool,
|
||||
*,
|
||||
collect_payloads: Optional[List[Dict[str, Any]]] = None,
|
||||
suppress_last_stage_overlay: bool = False,
|
||||
auto_search_store: bool = True,
|
||||
) -> int:
|
||||
"""Handle uploading to a registered storage backend (e.g., 'test' folder store, 'hydrus', etc.)."""
|
||||
log(f"Adding file to storage backend '{backend_name}': {media_path.name}", file=sys.stderr)
|
||||
|
||||
delete_after_effective = bool(delete_after)
|
||||
if not delete_after_effective:
|
||||
# When download-media is piped into add-file, the downloaded artifact is a temp file.
|
||||
# After it is persisted to a storage backend, delete the temp copy to avoid duplicates.
|
||||
try:
|
||||
if (
|
||||
str(backend_name or "").strip().lower() != "temp"
|
||||
and getattr(pipe_obj, "is_temp", False)
|
||||
and getattr(pipe_obj, "action", None) == "cmdlet:download-media"
|
||||
):
|
||||
from config import resolve_output_dir
|
||||
temp_dir = resolve_output_dir(config)
|
||||
try:
|
||||
if media_path.resolve().is_relative_to(temp_dir.expanduser().resolve()):
|
||||
delete_after_effective = True
|
||||
debug(f"[add-file] Auto-delete temp source after ingest: {media_path}")
|
||||
except Exception:
|
||||
# If path resolution fails, fall back to non-destructive behavior
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
store = Store(config)
|
||||
backend = store[backend_name]
|
||||
|
||||
# Prepare metadata from pipe_obj and sidecars
|
||||
tags, url, title, f_hash = Add_File._prepare_metadata(None, media_path, pipe_obj, config)
|
||||
tags, url, title, f_hash = Add_File._prepare_metadata(result, media_path, pipe_obj, config)
|
||||
|
||||
# Call backend's add_file with full metadata
|
||||
# Backend returns hash as identifier
|
||||
file_identifier = backend.add_file(
|
||||
media_path,
|
||||
title=title,
|
||||
tags=tags,
|
||||
tag=tags,
|
||||
url=url
|
||||
)
|
||||
log(f"✓ File added to '{backend_name}': {file_identifier}", file=sys.stderr)
|
||||
@@ -822,6 +1056,14 @@ class Add_File(Cmdlet):
|
||||
# Keep hash/store for downstream commands (get-tag, get-file, etc.).
|
||||
resolved_hash = file_identifier if len(file_identifier) == 64 else (f_hash or file_identifier or "unknown")
|
||||
|
||||
# If we have url(s), ensure they get associated with the destination file.
|
||||
# This mirrors `add-url` behavior but avoids emitting extra pipeline noise.
|
||||
if url:
|
||||
try:
|
||||
backend.add_url(resolved_hash, list(url))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
meta: Dict[str, Any] = {}
|
||||
try:
|
||||
meta = backend.get_metadata(resolved_hash) or {}
|
||||
@@ -865,9 +1107,30 @@ class Add_File(Cmdlet):
|
||||
"tag": list(tags or []),
|
||||
"url": list(url or []),
|
||||
}
|
||||
Add_File._emit_storage_result(payload)
|
||||
if collect_payloads is not None:
|
||||
try:
|
||||
collect_payloads.append(payload)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Keep the add-file 1-row summary overlay (when last stage), then emit the
|
||||
# canonical search-store payload/table for piping/selection consistency.
|
||||
if auto_search_store and resolved_hash and resolved_hash != "unknown":
|
||||
# Show the add-file summary (overlay only) but let search-store provide the downstream payload.
|
||||
Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=False)
|
||||
|
||||
ok = Add_File._try_emit_search_store_by_hash(
|
||||
store=backend_name,
|
||||
hash_value=resolved_hash,
|
||||
config=config,
|
||||
)
|
||||
if not ok:
|
||||
# Fall back to emitting the add-file payload so downstream stages still receive an item.
|
||||
ctx.emit(payload)
|
||||
else:
|
||||
Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=True)
|
||||
|
||||
Add_File._cleanup_after_success(media_path, delete_source=delete_after)
|
||||
Add_File._cleanup_after_success(media_path, delete_source=delete_after_effective)
|
||||
return 0
|
||||
|
||||
except Exception as exc:
|
||||
|
||||
@@ -3,7 +3,6 @@ from __future__ import annotations
|
||||
from typing import Any, Dict, Sequence
|
||||
import sys
|
||||
|
||||
from . import register
|
||||
import pipeline as ctx
|
||||
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
|
||||
from SYS.logger import log
|
||||
@@ -12,19 +11,24 @@ from Store import Store
|
||||
|
||||
class Add_Url(Cmdlet):
|
||||
"""Add URL associations to files via hash+store."""
|
||||
|
||||
NAME = "add-url"
|
||||
SUMMARY = "Associate a URL with a file"
|
||||
USAGE = "@1 | add-url <url>"
|
||||
ARGS = [
|
||||
SharedArgs.HASH,
|
||||
SharedArgs.STORE,
|
||||
CmdletArg("url", required=True, description="URL to associate"),
|
||||
]
|
||||
DETAIL = [
|
||||
"- Associates URL with file identified by hash+store",
|
||||
"- Multiple url can be comma-separated",
|
||||
]
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__(
|
||||
name="add-url",
|
||||
summary="Associate a URL with a file",
|
||||
usage="@1 | add-url <url>",
|
||||
arg=[
|
||||
SharedArgs.HASH,
|
||||
SharedArgs.STORE,
|
||||
CmdletArg("url", required=True, description="URL to associate"),
|
||||
],
|
||||
detail=[
|
||||
"- Associates URL with file identified by hash+store",
|
||||
"- Multiple url can be comma-separated",
|
||||
],
|
||||
exec=self.run,
|
||||
)
|
||||
self.register()
|
||||
|
||||
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
"""Add URL to file via hash+store backend."""
|
||||
@@ -78,8 +82,7 @@ class Add_Url(Cmdlet):
|
||||
return 1
|
||||
|
||||
|
||||
# Register cmdlet
|
||||
register(["add-url", "add_url"])(Add_Url)
|
||||
CMDLET = Add_Url()
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ from __future__ import annotations
|
||||
from typing import Any, Dict, Sequence
|
||||
import sys
|
||||
|
||||
from . import register
|
||||
import pipeline as ctx
|
||||
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
|
||||
from SYS.logger import log
|
||||
@@ -12,19 +11,24 @@ from Store import Store
|
||||
|
||||
class Delete_Url(Cmdlet):
|
||||
"""Delete URL associations from files via hash+store."""
|
||||
|
||||
NAME = "delete-url"
|
||||
SUMMARY = "Remove a URL association from a file"
|
||||
USAGE = "@1 | delete-url <url>"
|
||||
ARGS = [
|
||||
SharedArgs.HASH,
|
||||
SharedArgs.STORE,
|
||||
CmdletArg("url", required=True, description="URL to remove"),
|
||||
]
|
||||
DETAIL = [
|
||||
"- Removes URL association from file identified by hash+store",
|
||||
"- Multiple url can be comma-separated",
|
||||
]
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__(
|
||||
name="delete-url",
|
||||
summary="Remove a URL association from a file",
|
||||
usage="@1 | delete-url <url>",
|
||||
arg=[
|
||||
SharedArgs.HASH,
|
||||
SharedArgs.STORE,
|
||||
CmdletArg("url", required=True, description="URL to remove"),
|
||||
],
|
||||
detail=[
|
||||
"- Removes URL association from file identified by hash+store",
|
||||
"- Multiple url can be comma-separated",
|
||||
],
|
||||
exec=self.run,
|
||||
)
|
||||
self.register()
|
||||
|
||||
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
"""Delete URL from file via hash+store backend."""
|
||||
@@ -78,5 +82,4 @@ class Delete_Url(Cmdlet):
|
||||
return 1
|
||||
|
||||
|
||||
# Register cmdlet
|
||||
register(["delete-url", "del-url", "delete_url"])(Delete_Url)
|
||||
CMDLET = Delete_Url()
|
||||
|
||||
@@ -190,9 +190,11 @@ class Download_File(Cmdlet):
|
||||
|
||||
# If this looks like a provider item and providers are available, prefer provider.download()
|
||||
downloaded_path: Optional[Path] = None
|
||||
attempted_provider_download = False
|
||||
if table and get_search_provider and SearchResult:
|
||||
provider = get_search_provider(str(table), config)
|
||||
if provider is not None:
|
||||
attempted_provider_download = True
|
||||
sr = SearchResult(
|
||||
table=str(table),
|
||||
title=str(title or "Unknown"),
|
||||
@@ -202,6 +204,19 @@ class Download_File(Cmdlet):
|
||||
debug(f"[download-file] Downloading provider item via {table}: {sr.title}")
|
||||
downloaded_path = provider.download(sr, final_output_dir)
|
||||
|
||||
# OpenLibrary: if provider download failed, do NOT try to download the OpenLibrary page HTML.
|
||||
if downloaded_path is None and attempted_provider_download and str(table or "").lower() == "openlibrary":
|
||||
availability = None
|
||||
reason = None
|
||||
if isinstance(full_metadata, dict):
|
||||
availability = full_metadata.get("availability")
|
||||
reason = full_metadata.get("availability_reason")
|
||||
msg = "[download-file] OpenLibrary item not downloadable"
|
||||
if availability or reason:
|
||||
msg += f" (availability={availability or ''} reason={reason or ''})"
|
||||
log(msg, file=sys.stderr)
|
||||
continue
|
||||
|
||||
# Fallback: if we have a direct HTTP URL, download it directly
|
||||
if downloaded_path is None and isinstance(target, str) and target.startswith("http"):
|
||||
debug(f"[download-file] Provider item looks like direct URL, downloading: {target}")
|
||||
|
||||
@@ -693,6 +693,7 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
|
||||
return
|
||||
|
||||
# Extract relevant fields
|
||||
webpage_url = info.get("webpage_url") or info.get("original_url") or info.get("url")
|
||||
result_container[0] = {
|
||||
"extractor": info.get("extractor", ""),
|
||||
"title": info.get("title", ""),
|
||||
@@ -700,7 +701,9 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
|
||||
"duration": info.get("duration"),
|
||||
"uploader": info.get("uploader"),
|
||||
"description": info.get("description"),
|
||||
"url": url,
|
||||
# Keep both the requested and canonical URL forms; callers should prefer webpage_url.
|
||||
"requested_url": url,
|
||||
"webpage_url": webpage_url,
|
||||
}
|
||||
except Exception as exc:
|
||||
log(f"Probe error for {url}: {exc}")
|
||||
@@ -1220,9 +1223,359 @@ class Download_Media(Cmdlet):
|
||||
log(f"Invalid clip format: {clip_spec}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False
|
||||
|
||||
storage = None
|
||||
hydrus_available = True
|
||||
try:
|
||||
from Store import Store
|
||||
storage = Store(config=config or {}, suppress_debug=True)
|
||||
from API.HydrusNetwork import is_hydrus_available
|
||||
hydrus_available = bool(is_hydrus_available(config or {}))
|
||||
except Exception:
|
||||
storage = None
|
||||
|
||||
def _preflight_url_duplicate(candidate_url: str, extra_urls: Optional[Sequence[str]] = None) -> bool:
|
||||
# NOTE: download-media sets _quiet_background_output=True when running in a pipeline to
|
||||
# reduce background noise. URL de-dup is interactive and must still run in pipelines.
|
||||
if storage is None:
|
||||
debug("Preflight URL check skipped: storage unavailable")
|
||||
return True
|
||||
|
||||
debug(f"Preflight URL check: candidate={candidate_url}")
|
||||
|
||||
try:
|
||||
from metadata import normalize_urls
|
||||
except Exception:
|
||||
normalize_urls = None # type: ignore[assignment]
|
||||
|
||||
needles: List[str] = []
|
||||
if normalize_urls is not None:
|
||||
for raw in [candidate_url, *(list(extra_urls) if extra_urls else [])]:
|
||||
try:
|
||||
needles.extend(normalize_urls(raw))
|
||||
except Exception:
|
||||
continue
|
||||
# Fallback: always have at least one needle
|
||||
if not needles:
|
||||
needles = [str(candidate_url)]
|
||||
|
||||
# Deduplicate needles (preserve order)
|
||||
seen_needles: List[str] = []
|
||||
for needle in needles:
|
||||
if needle and needle not in seen_needles:
|
||||
seen_needles.append(needle)
|
||||
needles = seen_needles
|
||||
|
||||
try:
|
||||
debug(f"Preflight URL needles: {needles}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
url_matches: List[Dict[str, Any]] = []
|
||||
try:
|
||||
from Store.HydrusNetwork import HydrusNetwork
|
||||
|
||||
# Avoid searching the temp/download directory backend during dedup.
|
||||
# We only want to warn about duplicates in real stores.
|
||||
backend_names_all = storage.list_searchable_backends()
|
||||
backend_names: List[str] = []
|
||||
skipped: List[str] = []
|
||||
for backend_name in backend_names_all:
|
||||
try:
|
||||
backend = storage[backend_name]
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
try:
|
||||
if str(backend_name).strip().lower() == "temp":
|
||||
skipped.append(backend_name)
|
||||
continue
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Heuristic: if a Folder backend points at the configured temp output dir, skip it.
|
||||
try:
|
||||
backend_location = getattr(backend, "_location", None)
|
||||
if backend_location and final_output_dir:
|
||||
backend_path = Path(str(backend_location)).expanduser().resolve()
|
||||
temp_path = Path(str(final_output_dir)).expanduser().resolve()
|
||||
if backend_path == temp_path:
|
||||
skipped.append(backend_name)
|
||||
continue
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
backend_names.append(backend_name)
|
||||
|
||||
try:
|
||||
if skipped:
|
||||
debug(f"Preflight backends: {backend_names} (skipped temp: {skipped})")
|
||||
else:
|
||||
debug(f"Preflight backends: {backend_names}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for backend_name in backend_names:
|
||||
backend = storage[backend_name]
|
||||
if isinstance(backend, HydrusNetwork) and not hydrus_available:
|
||||
continue
|
||||
|
||||
backend_hits: List[Dict[str, Any]] = []
|
||||
for needle in needles:
|
||||
try:
|
||||
backend_hits = backend.search(f"url:{needle}", limit=25) or []
|
||||
if backend_hits:
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
if backend_hits:
|
||||
url_matches.extend([dict(x) if isinstance(x, dict) else {"title": str(x)} for x in backend_hits])
|
||||
|
||||
if len(url_matches) >= 25:
|
||||
url_matches = url_matches[:25]
|
||||
break
|
||||
except Exception:
|
||||
url_matches = []
|
||||
|
||||
if not url_matches:
|
||||
debug("Preflight URL check: no matches")
|
||||
return True
|
||||
|
||||
table = ResultTable(f"URL already exists ({len(url_matches)} match(es))")
|
||||
results_list: List[Dict[str, Any]] = []
|
||||
for item in url_matches:
|
||||
if "title" not in item:
|
||||
item["title"] = item.get("name") or item.get("target") or item.get("path") or "Result"
|
||||
table.add_result(item)
|
||||
results_list.append(item)
|
||||
|
||||
pipeline_context.set_current_stage_table(table)
|
||||
pipeline_context.set_last_result_table(table, results_list)
|
||||
|
||||
print(f"\n{table}")
|
||||
response = input("Continue anyway? (y/n): ").strip().lower()
|
||||
if response not in {"y", "yes"}:
|
||||
return False
|
||||
return True
|
||||
|
||||
def _canonicalize_url_for_storage(requested_url: str) -> str:
|
||||
# Prefer yt-dlp's canonical webpage URL (e.g. strips timestamps/redirects).
|
||||
# Fall back to the requested URL if probing fails.
|
||||
# Important: when playlist item selection is used, avoid probing (can hang on large playlists).
|
||||
if playlist_items:
|
||||
return str(requested_url)
|
||||
try:
|
||||
pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15)
|
||||
if isinstance(pr, dict):
|
||||
for key in ("webpage_url", "original_url", "url", "requested_url"):
|
||||
value = pr.get(key)
|
||||
if isinstance(value, str) and value.strip():
|
||||
return value.strip()
|
||||
except Exception:
|
||||
pass
|
||||
return str(requested_url)
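# A rough sketch of the intended behavior of the canonicalizer above (hypothetical URLs;
# the exact canonical form depends on what yt-dlp reports as webpage_url for the page):
#
#   _canonicalize_url_for_storage("https://youtu.be/VIDEOID?t=42")
#       -> "https://www.youtube.com/watch?v=VIDEOID"    (probe succeeded)
#   _canonicalize_url_for_storage("https://example.com/not-media")
#       -> "https://example.com/not-media"              (probe failed; returned unchanged)
#   _canonicalize_url_for_storage(url)   # while playlist item selection is active
#       -> url                                          (probing skipped to avoid hangs)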
|
||||
|
||||
# Check if we need to show format selection
|
||||
playlist_items = str(parsed.get("item")) if parsed.get("item") else None
|
||||
ytdl_format = parsed.get("format")
|
||||
playlist_selection_handled = False
|
||||
|
||||
def _parse_at_selection(choice: str, *, max_index: int) -> Optional[List[int]]:
"""Parse @ selection syntax (@2, @2-5, @{1,3,5}, @2,5,7) into 1-based indices."""
|
||||
raw = str(choice or "").strip()
|
||||
if not raw:
|
||||
return None
|
||||
|
||||
if raw.lower() in {"q", "quit", "cancel"}:
|
||||
return None
|
||||
|
||||
if raw == "@*" or raw == "*":
|
||||
return list(range(1, max_index + 1))
|
||||
|
||||
if raw.startswith("@"):
|
||||
raw = raw[1:].strip()
|
||||
|
||||
if raw.startswith("{") and raw.endswith("}"):
|
||||
raw = raw[1:-1].strip()
|
||||
|
||||
if not raw:
|
||||
return None
|
||||
|
||||
indices: set[int] = set()
|
||||
for part in raw.split(","):
|
||||
part = part.strip()
|
||||
if not part:
|
||||
continue
|
||||
if "-" in part:
|
||||
left, right = [p.strip() for p in part.split("-", 1)]
|
||||
if not left or not right:
|
||||
return None
|
||||
try:
|
||||
start = int(left)
|
||||
end = int(right)
|
||||
except ValueError:
|
||||
return None
|
||||
if start < 1 or end < 1:
|
||||
return None
|
||||
if end < start:
|
||||
start, end = end, start
|
||||
for i in range(start, end + 1):
|
||||
if 1 <= i <= max_index:
|
||||
indices.add(i)
|
||||
else:
|
||||
try:
|
||||
i = int(part)
|
||||
except ValueError:
|
||||
return None
|
||||
if 1 <= i <= max_index:
|
||||
indices.add(i)
|
||||
if not indices:
|
||||
return None
|
||||
return sorted(indices)
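# A quick sanity sketch of the parser above (assuming the helper were reachable at module
# scope); the expected index lists follow directly from the branches it implements:
#
#   _parse_at_selection("@3",       max_index=10) -> [3]
#   _parse_at_selection("@2-5",     max_index=10) -> [2, 3, 4, 5]
#   _parse_at_selection("@{1,3,5}", max_index=10) -> [1, 3, 5]
#   _parse_at_selection("7-3",      max_index=10) -> [3, 4, 5, 6, 7]   (range is reordered)
#   _parse_at_selection("@*",       max_index=4)  -> [1, 2, 3, 4]
#   _parse_at_selection("@12",      max_index=10) -> None               (out of range)
#   _parse_at_selection("q",        max_index=10) -> None               (cancel)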
|
||||
|
||||
def _maybe_prompt_playlist_items(url: str) -> Optional[Dict[str, Any]]:
"""If URL appears to be a playlist/channel/collection, prompt user for @ selection.

Returns:
- None if URL is not a playlist-like multi-entry page (or probe fails)
- Dict with keys:
- cancel: bool
- playlist_items: Optional[str] (None means download all)
- selected_urls: Optional[List[str]] (expanded per-entry urls when available)
"""
||||
try:
|
||||
pr = probe_url(url, no_playlist=False, timeout_seconds=15)
|
||||
except Exception:
|
||||
pr = None
|
||||
if not isinstance(pr, dict):
|
||||
return None
|
||||
entries = pr.get("entries")
|
||||
if not isinstance(entries, list) or len(entries) <= 1:
|
||||
return None
|
||||
|
||||
# Display table (limit rows to keep output reasonable)
|
||||
max_rows = 200
|
||||
display_entries = entries[:max_rows]
|
||||
total = len(entries)
|
||||
|
||||
def _entry_to_url(entry: Any) -> Optional[str]:
|
||||
if not isinstance(entry, dict):
|
||||
return None
|
||||
# Prefer explicit absolute URLs when present
|
||||
for key in ("webpage_url", "original_url", "url"):
|
||||
v = entry.get(key)
|
||||
if isinstance(v, str) and v.strip():
|
||||
s = v.strip()
|
||||
try:
|
||||
if urlparse(s).scheme in {"http", "https"}:
|
||||
return s
|
||||
except Exception:
|
||||
return s
|
||||
|
||||
# Best-effort YouTube fallback from id
|
||||
entry_id = entry.get("id")
|
||||
if isinstance(entry_id, str) and entry_id.strip():
|
||||
extractor_name = str(pr.get("extractor") or pr.get("extractor_key") or "").lower()
|
||||
if "youtube" in extractor_name:
|
||||
return f"https://www.youtube.com/watch?v={entry_id.strip()}"
|
||||
return None
|
||||
|
||||
table = ResultTable()
|
||||
table.title = f"Playlist items ({total}{' shown ' + str(len(display_entries)) if total > max_rows else ''})"
|
||||
table.set_source_command("download-media", [url])
|
||||
try:
|
||||
table.set_preserve_order(True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
results_list: List[Dict[str, Any]] = []
|
||||
for idx, entry in enumerate(display_entries, 1):
|
||||
title = None
|
||||
uploader = None
|
||||
duration = None
|
||||
try:
|
||||
if isinstance(entry, dict):
|
||||
title = entry.get("title")
|
||||
uploader = entry.get("uploader") or pr.get("uploader")
|
||||
duration = entry.get("duration")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
row: Dict[str, Any] = {
|
||||
"table": "download-media",
|
||||
"title": str(title or f"Item {idx}"),
|
||||
"detail": str(uploader or ""),
|
||||
"media_kind": "playlist-item",
|
||||
"playlist_index": idx,
|
||||
"columns": [
|
||||
("#", str(idx)),
|
||||
("Title", str(title or "")),
|
||||
("Duration", str(duration or "")),
|
||||
("Uploader", str(uploader or "")),
|
||||
],
|
||||
}
|
||||
results_list.append(row)
|
||||
table.add_result(row)
|
||||
|
||||
pipeline_context.set_current_stage_table(table)
|
||||
pipeline_context.set_last_result_table(table, results_list)
|
||||
|
||||
print(f"\n{table}")
|
||||
choice = input("Select items to download (@N, @2-5, @{1,3}, @*, or 'q' to cancel): ").strip()
|
||||
if not choice or choice.lower() in {"q", "quit", "cancel"}:
|
||||
return {"cancel": True, "playlist_items": None, "selected_urls": []}
|
||||
if choice.strip() == "@*" or choice.strip() == "*":
|
||||
# @* means all entries, not just displayed rows.
|
||||
selected_urls: List[str] = []
|
||||
for entry in entries:
|
||||
u = _entry_to_url(entry)
|
||||
if u and u not in selected_urls:
|
||||
selected_urls.append(u)
|
||||
# Only expand when we can derive URLs for all entries; otherwise fall back to yt-dlp playlist handling.
|
||||
if len(selected_urls) == len(entries):
|
||||
return {"cancel": False, "playlist_items": None, "selected_urls": selected_urls}
|
||||
return {"cancel": False, "playlist_items": None, "selected_urls": []}
|
||||
|
||||
parsed_indices = _parse_at_selection(choice, max_index=len(display_entries))
|
||||
if not parsed_indices:
|
||||
log("Invalid selection. Use @N, @2-5, @{1,3}, or @*", file=sys.stderr)
|
||||
return {"cancel": True, "playlist_items": None, "selected_urls": []}
|
||||
|
||||
selected_urls: List[str] = []
|
||||
for i in parsed_indices:
|
||||
try:
|
||||
entry = display_entries[i - 1]
|
||||
except Exception:
|
||||
continue
|
||||
u = _entry_to_url(entry)
|
||||
if u and u not in selected_urls:
|
||||
selected_urls.append(u)
|
||||
|
||||
# If we can expand per-entry URLs, return them.
|
||||
if selected_urls and len(selected_urls) == len(parsed_indices):
|
||||
return {"cancel": False, "playlist_items": None, "selected_urls": selected_urls}
|
||||
|
||||
# yt-dlp accepts comma-separated 1-based indices for playlist_items
|
||||
return {"cancel": False, "playlist_items": ",".join(str(i) for i in parsed_indices), "selected_urls": []}
|
||||
|
||||
# Playlist/multi-entry detection: if the URL has multiple items and the user didn't
|
||||
# specify -item, prompt for @ selection (supports @* for all).
|
||||
if len(supported_url) == 1 and not playlist_items and not ytdl_format:
|
||||
candidate_url = supported_url[0]
|
||||
selection_info = _maybe_prompt_playlist_items(candidate_url)
|
||||
if selection_info is not None:
|
||||
playlist_selection_handled = True
|
||||
if bool(selection_info.get("cancel")):
|
||||
return 0
|
||||
selected_urls = selection_info.get("selected_urls")
|
||||
if isinstance(selected_urls, list) and selected_urls:
|
||||
# Expand playlist/channel URL into per-entry URLs so that de-dup preflight
|
||||
# and downloads operate per file.
|
||||
supported_url = selected_urls
|
||||
playlist_items = None
|
||||
else:
|
||||
playlist_items = selection_info.get("playlist_items")
|
||||
|
||||
# If no -item, no explicit -format specified, and single URL, show the format table.
|
||||
# Do NOT stop to show formats when -audio is used (auto-pick) or when -clip is used.
|
||||
@@ -1232,8 +1585,15 @@ class Download_Media(Cmdlet):
|
||||
and not playlist_items
|
||||
and not ytdl_format
|
||||
and len(supported_url) == 1
|
||||
and not playlist_selection_handled
|
||||
):
|
||||
url = supported_url[0]
|
||||
|
||||
canonical_url = _canonicalize_url_for_storage(url)
|
||||
if not _preflight_url_duplicate(canonical_url, extra_urls=[url]):
|
||||
log(f"Skipping download: {url}", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
formats = list_formats(url, no_playlist=False)
|
||||
|
||||
if formats and len(formats) > 1:
|
||||
@@ -1379,12 +1739,18 @@ class Download_Media(Cmdlet):
|
||||
# Download each URL
|
||||
downloaded_count = 0
|
||||
clip_sections_spec = self._build_clip_sections_spec(clip_range)
|
||||
quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False
|
||||
|
||||
for url in supported_url:
|
||||
try:
|
||||
debug(f"Processing: {url}")
|
||||
|
||||
canonical_url = _canonicalize_url_for_storage(url)
|
||||
|
||||
# Preflight: warn if URL already exists in storage backends.
|
||||
if not _preflight_url_duplicate(canonical_url, extra_urls=[url]):
|
||||
log(f"Skipping download: {url}", file=sys.stderr)
|
||||
continue
|
||||
|
||||
# If playlist_items is specified but looks like a format ID (e.g. from table selection),
|
||||
# treat it as a format selector instead of playlist items.
|
||||
# This handles the case where @N selection passes -item <format_id>
|
||||
@@ -1532,24 +1898,17 @@ class Download_Media(Cmdlet):
|
||||
if title and f"title:{title}" not in tag:
|
||||
tag.insert(0, f"title:{title}")
|
||||
|
||||
# Build a single canonical URL field; prefer yt-dlp provided webpage_url or info.url,
|
||||
# but fall back to the original requested URL. If multiple unique urls are available,
|
||||
# join them into a comma-separated string.
|
||||
urls_to_consider: List[str] = []
|
||||
# Store the canonical URL for de-dup/search purposes.
|
||||
# Prefer yt-dlp's webpage_url, and do not mix in the raw requested URL (which may contain timestamps).
|
||||
final_url = None
|
||||
try:
|
||||
page_url = info.get("webpage_url") or info.get("url")
|
||||
page_url = info.get("webpage_url") or info.get("original_url") or info.get("url")
|
||||
if page_url:
|
||||
urls_to_consider.append(str(page_url))
|
||||
final_url = str(page_url)
|
||||
except Exception:
|
||||
pass
|
||||
if url:
|
||||
urls_to_consider.append(str(url))
|
||||
|
||||
seen_urls: List[str] = []
|
||||
for u in urls_to_consider:
|
||||
if u and u not in seen_urls:
|
||||
seen_urls.append(u)
|
||||
final_url = ",".join(seen_urls) if seen_urls else None
|
||||
final_url = None
|
||||
if not final_url and url:
|
||||
final_url = str(url)
|
||||
|
||||
# Construct canonical PipeObject dict: hash, store, path, url, title, tags
|
||||
# Prefer explicit backend names (storage_name/storage_location). If none, default to PATH
|
||||
@@ -1561,6 +1920,7 @@ class Download_Media(Cmdlet):
|
||||
"url": final_url,
|
||||
"tag": tag,
|
||||
"action": "cmdlet:download-media",
|
||||
"is_temp": True,
|
||||
# download_mode removed (deprecated), keep media_kind
|
||||
"store": getattr(opts, "storage_name", None) or getattr(opts, "storage_location", None) or "PATH",
|
||||
"media_kind": "video" if opts.mode == "video" else "audio",
|
||||
|
||||
@@ -184,6 +184,32 @@ class Get_Metadata(Cmdlet):
|
||||
mime_type = metadata.get("mime") or metadata.get("ext", "")
|
||||
file_size = metadata.get("size")
|
||||
duration_seconds = metadata.get("duration")
|
||||
if duration_seconds is None:
|
||||
duration_seconds = metadata.get("duration_seconds")
|
||||
if duration_seconds is None:
|
||||
duration_seconds = metadata.get("length")
|
||||
if duration_seconds is None and isinstance(metadata.get("duration_ms"), (int, float)):
|
||||
try:
|
||||
duration_seconds = float(metadata["duration_ms"]) / 1000.0
|
||||
except Exception:
|
||||
duration_seconds = None
|
||||
|
||||
if isinstance(duration_seconds, str):
|
||||
s = duration_seconds.strip()
|
||||
if s:
|
||||
try:
|
||||
duration_seconds = float(s)
|
||||
except ValueError:
|
||||
if ":" in s:
|
||||
parts = [p.strip() for p in s.split(":") if p.strip()]
|
||||
if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
|
||||
nums = [int(p) for p in parts]
|
||||
if len(nums) == 2:
|
||||
duration_seconds = float(nums[0] * 60 + nums[1])
|
||||
else:
|
||||
duration_seconds = float(nums[0] * 3600 + nums[1] * 60 + nums[2])
|
||||
else:
|
||||
duration_seconds = None
|
||||
pages = metadata.get("pages")
|
||||
url = metadata.get("url") or []
|
||||
imported_ts = self._extract_imported_ts(metadata)
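# For reference, a condensed self-contained sketch of the duration coercion above
# (same accepted shapes; the helper name is illustrative, not an existing function):
def _coerce_duration_seconds(metadata: dict) -> float | None:
    # Prefer explicit seconds fields, then fall back to duration_ms.
    value = metadata.get("duration")
    if value is None:
        value = metadata.get("duration_seconds")
    if value is None:
        value = metadata.get("length")
    if value is None and isinstance(metadata.get("duration_ms"), (int, float)):
        return float(metadata["duration_ms"]) / 1000.0
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str):
        s = value.strip()
        try:
            return float(s)  # plain numeric string, e.g. "123.5"
        except ValueError:
            # "MM:SS" or "HH:MM:SS"
            parts = [p.strip() for p in s.split(":") if p.strip()]
            if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
                nums = [int(p) for p in parts]
                if len(nums) == 2:
                    return float(nums[0] * 60 + nums[1])
                return float(nums[0] * 3600 + nums[1] * 60 + nums[2])
    return None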
|
||||
|
||||
@@ -12,7 +12,13 @@ from __future__ import annotations
|
||||
|
||||
import sys
|
||||
|
||||
from SYS.logger import log, debug
|
||||
try:
|
||||
from Provider.openlibrary import OpenLibrary
|
||||
_ol_scrape_isbn_metadata = OpenLibrary.scrape_isbn_metadata
|
||||
_ol_scrape_openlibrary_metadata = OpenLibrary.scrape_openlibrary_metadata
|
||||
except Exception:
|
||||
_ol_scrape_isbn_metadata = None # type: ignore[assignment]
|
||||
_ol_scrape_openlibrary_metadata = None # type: ignore[assignment]
|
||||
from Provider.metadata_provider import get_metadata_provider, list_metadata_providers
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
@@ -31,6 +37,10 @@ except ImportError:
|
||||
extract_title = None
|
||||
|
||||
|
||||
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
|
||||
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -691,249 +701,22 @@ def _extract_url_formats(formats: list) -> List[Tuple[str, str]]:
|
||||
|
||||
|
||||
def _scrape_isbn_metadata(isbn: str) -> List[str]:
|
||||
"""Scrape metadata for an ISBN using Open Library API."""
|
||||
new_tags = []
|
||||
if _ol_scrape_isbn_metadata is None:
|
||||
log("OpenLibrary scraper unavailable", file=sys.stderr)
|
||||
return []
|
||||
try:
|
||||
from ..API.HTTP import HTTPClient
|
||||
import json as json_module
|
||||
|
||||
isbn_clean = isbn.replace('-', '').strip()
|
||||
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
|
||||
|
||||
try:
|
||||
with HTTPClient() as client:
|
||||
response = client.get(url)
|
||||
response.raise_for_status()
|
||||
data = json_module.loads(response.content.decode('utf-8'))
|
||||
except Exception as e:
|
||||
log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
if not data:
|
||||
log(f"No ISBN metadata found for: {isbn}")
|
||||
return []
|
||||
|
||||
book_data = next(iter(data.values()), None)
|
||||
if not book_data:
|
||||
return []
|
||||
|
||||
if 'title' in book_data:
|
||||
new_tags.append(f"title:{book_data['title']}")
|
||||
|
||||
if 'authors' in book_data and isinstance(book_data['authors'], list):
|
||||
for author in book_data['authors'][:3]:
|
||||
if 'name' in author:
|
||||
new_tags.append(f"author:{author['name']}")
|
||||
|
||||
if 'publish_date' in book_data:
|
||||
new_tags.append(f"publish_date:{book_data['publish_date']}")
|
||||
|
||||
if 'publishers' in book_data and isinstance(book_data['publishers'], list):
|
||||
for pub in book_data['publishers'][:1]:
|
||||
if 'name' in pub:
|
||||
new_tags.append(f"publisher:{pub['name']}")
|
||||
|
||||
if 'description' in book_data:
|
||||
desc = book_data['description']
|
||||
if isinstance(desc, dict) and 'value' in desc:
|
||||
desc = desc['value']
|
||||
if desc:
|
||||
desc_str = str(desc).strip()
|
||||
# Include description if available (limit to 200 chars to keep it manageable)
|
||||
if len(desc_str) > 0:
|
||||
new_tags.append(f"description:{desc_str[:200]}")
|
||||
|
||||
if 'number_of_pages' in book_data:
|
||||
page_count = book_data['number_of_pages']
|
||||
if page_count and isinstance(page_count, int) and page_count > 0:
|
||||
new_tags.append(f"pages:{page_count}")
|
||||
|
||||
if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict):
|
||||
identifiers = book_data['identifiers']
|
||||
|
||||
if 'openlibrary' in identifiers:
|
||||
ol_ids = identifiers['openlibrary']
|
||||
if isinstance(ol_ids, list) and ol_ids:
|
||||
new_tags.append(f"openlibrary:{ol_ids[0]}")
|
||||
elif isinstance(ol_ids, str):
|
||||
new_tags.append(f"openlibrary:{ol_ids}")
|
||||
|
||||
if 'lccn' in identifiers:
|
||||
lccn_list = identifiers['lccn']
|
||||
if isinstance(lccn_list, list) and lccn_list:
|
||||
new_tags.append(f"lccn:{lccn_list[0]}")
|
||||
elif isinstance(lccn_list, str):
|
||||
new_tags.append(f"lccn:{lccn_list}")
|
||||
|
||||
if 'oclc' in identifiers:
|
||||
oclc_list = identifiers['oclc']
|
||||
if isinstance(oclc_list, list) and oclc_list:
|
||||
new_tags.append(f"oclc:{oclc_list[0]}")
|
||||
elif isinstance(oclc_list, str):
|
||||
new_tags.append(f"oclc:{oclc_list}")
|
||||
|
||||
if 'goodreads' in identifiers:
|
||||
goodreads_list = identifiers['goodreads']
|
||||
if isinstance(goodreads_list, list) and goodreads_list:
|
||||
new_tags.append(f"goodreads:{goodreads_list[0]}")
|
||||
elif isinstance(goodreads_list, str):
|
||||
new_tags.append(f"goodreads:{goodreads_list}")
|
||||
|
||||
if 'librarything' in identifiers:
|
||||
lt_list = identifiers['librarything']
|
||||
if isinstance(lt_list, list) and lt_list:
|
||||
new_tags.append(f"librarything:{lt_list[0]}")
|
||||
elif isinstance(lt_list, str):
|
||||
new_tags.append(f"librarything:{lt_list}")
|
||||
|
||||
if 'doi' in identifiers:
|
||||
doi_list = identifiers['doi']
|
||||
if isinstance(doi_list, list) and doi_list:
|
||||
new_tags.append(f"doi:{doi_list[0]}")
|
||||
elif isinstance(doi_list, str):
|
||||
new_tags.append(f"doi:{doi_list}")
|
||||
|
||||
if 'internet_archive' in identifiers:
|
||||
ia_list = identifiers['internet_archive']
|
||||
if isinstance(ia_list, list) and ia_list:
|
||||
new_tags.append(f"internet_archive:{ia_list[0]}")
|
||||
elif isinstance(ia_list, str):
|
||||
new_tags.append(f"internet_archive:{ia_list}")
|
||||
|
||||
log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
|
||||
return new_tags
|
||||
return list(_ol_scrape_isbn_metadata(isbn))
|
||||
except Exception as e:
|
||||
log(f"ISBN scraping error: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
|
||||
def _scrape_openlibrary_metadata(olid: str) -> List[str]:
|
||||
"""Scrape metadata for an OpenLibrary ID using the .json API endpoint.
|
||||
|
||||
Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
|
||||
- Title, authors, publish date, publishers
|
||||
- Description
|
||||
- Subjects as freeform tags (without namespace prefix)
|
||||
- Identifiers (ISBN, LCCN, OCLC, etc.)
|
||||
"""
|
||||
new_tags = []
|
||||
if _ol_scrape_openlibrary_metadata is None:
|
||||
log("OpenLibrary scraper unavailable", file=sys.stderr)
|
||||
return []
|
||||
try:
|
||||
from ..API.HTTP import HTTPClient
|
||||
import json as json_module
|
||||
|
||||
# Format: OL9674499M or just 9674499M
|
||||
olid_clean = olid.replace('OL', '').replace('M', '')
|
||||
if not olid_clean.isdigit():
|
||||
olid_clean = olid
|
||||
|
||||
# Ensure we have the full OLID format for the URL
|
||||
if not olid.startswith('OL'):
|
||||
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
|
||||
else:
|
||||
url = f"https://openlibrary.org/books/{olid}.json"
|
||||
|
||||
try:
|
||||
with HTTPClient() as client:
|
||||
response = client.get(url)
|
||||
response.raise_for_status()
|
||||
data = json_module.loads(response.content.decode('utf-8'))
|
||||
except Exception as e:
|
||||
log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
if not data:
|
||||
log(f"No OpenLibrary metadata found for: {olid}")
|
||||
return []
|
||||
|
||||
# Add title
|
||||
if 'title' in data:
|
||||
new_tags.append(f"title:{data['title']}")
|
||||
|
||||
# Add authors
|
||||
if 'authors' in data and isinstance(data['authors'], list):
|
||||
for author in data['authors'][:3]:
|
||||
if isinstance(author, dict) and 'name' in author:
|
||||
new_tags.append(f"author:{author['name']}")
|
||||
elif isinstance(author, str):
|
||||
new_tags.append(f"author:{author}")
|
||||
|
||||
# Add publish date
|
||||
if 'publish_date' in data:
|
||||
new_tags.append(f"publish_date:{data['publish_date']}")
|
||||
|
||||
# Add publishers
|
||||
if 'publishers' in data and isinstance(data['publishers'], list):
|
||||
for pub in data['publishers'][:1]:
|
||||
if isinstance(pub, dict) and 'name' in pub:
|
||||
new_tags.append(f"publisher:{pub['name']}")
|
||||
elif isinstance(pub, str):
|
||||
new_tags.append(f"publisher:{pub}")
|
||||
|
||||
# Add description
|
||||
if 'description' in data:
|
||||
desc = data['description']
|
||||
if isinstance(desc, dict) and 'value' in desc:
|
||||
desc = desc['value']
|
||||
if desc:
|
||||
desc_str = str(desc).strip()
|
||||
if len(desc_str) > 0:
|
||||
new_tags.append(f"description:{desc_str[:200]}")
|
||||
|
||||
# Add number of pages
|
||||
if 'number_of_pages' in data:
|
||||
page_count = data['number_of_pages']
|
||||
if page_count and isinstance(page_count, int) and page_count > 0:
|
||||
new_tags.append(f"pages:{page_count}")
|
||||
|
||||
# Add subjects as FREEFORM tags (no namespace prefix)
|
||||
if 'subjects' in data and isinstance(data['subjects'], list):
|
||||
for subject in data['subjects'][:10]:
|
||||
if subject and isinstance(subject, str):
|
||||
subject_clean = str(subject).strip()
|
||||
if subject_clean and subject_clean not in new_tags:
|
||||
new_tags.append(subject_clean)
|
||||
|
||||
# Add identifiers
|
||||
if 'identifiers' in data and isinstance(data['identifiers'], dict):
|
||||
identifiers = data['identifiers']
|
||||
|
||||
if 'isbn_10' in identifiers:
|
||||
isbn_10_list = identifiers['isbn_10']
|
||||
if isinstance(isbn_10_list, list) and isbn_10_list:
|
||||
new_tags.append(f"isbn_10:{isbn_10_list[0]}")
|
||||
elif isinstance(isbn_10_list, str):
|
||||
new_tags.append(f"isbn_10:{isbn_10_list}")
|
||||
|
||||
if 'isbn_13' in identifiers:
|
||||
isbn_13_list = identifiers['isbn_13']
|
||||
if isinstance(isbn_13_list, list) and isbn_13_list:
|
||||
new_tags.append(f"isbn_13:{isbn_13_list[0]}")
|
||||
elif isinstance(isbn_13_list, str):
|
||||
new_tags.append(f"isbn_13:{isbn_13_list}")
|
||||
|
||||
if 'lccn' in identifiers:
|
||||
lccn_list = identifiers['lccn']
|
||||
if isinstance(lccn_list, list) and lccn_list:
|
||||
new_tags.append(f"lccn:{lccn_list[0]}")
|
||||
elif isinstance(lccn_list, str):
|
||||
new_tags.append(f"lccn:{lccn_list}")
|
||||
|
||||
if 'oclc_numbers' in identifiers:
|
||||
oclc_list = identifiers['oclc_numbers']
|
||||
if isinstance(oclc_list, list) and oclc_list:
|
||||
new_tags.append(f"oclc:{oclc_list[0]}")
|
||||
elif isinstance(oclc_list, str):
|
||||
new_tags.append(f"oclc:{oclc_list}")
|
||||
|
||||
if 'goodreads' in identifiers:
|
||||
goodreads_list = identifiers['goodreads']
|
||||
if isinstance(goodreads_list, list) and goodreads_list:
|
||||
new_tags.append(f"goodreads:{goodreads_list[0]}")
|
||||
elif isinstance(goodreads_list, str):
|
||||
new_tags.append(f"goodreads:{goodreads_list}")
|
||||
|
||||
log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
|
||||
return new_tags
|
||||
return list(_ol_scrape_openlibrary_metadata(olid))
|
||||
except Exception as e:
|
||||
log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
@@ -1,28 +1,40 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Sequence
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Sequence
|
||||
import sys
|
||||
|
||||
from . import register
|
||||
import pipeline as ctx
|
||||
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
|
||||
from ._shared import Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
|
||||
from SYS.logger import log
|
||||
from Store import Store
|
||||
|
||||
|
||||
@dataclass
|
||||
class UrlItem:
|
||||
url: str
|
||||
hash: str
|
||||
store: str
|
||||
|
||||
|
||||
class Get_Url(Cmdlet):
|
||||
"""Get url associated with files via hash+store."""
|
||||
|
||||
NAME = "get-url"
|
||||
SUMMARY = "List url associated with a file"
|
||||
USAGE = "@1 | get-url"
|
||||
ARGS = [
|
||||
SharedArgs.HASH,
|
||||
SharedArgs.STORE,
|
||||
]
|
||||
DETAIL = [
|
||||
"- Lists all url associated with file identified by hash+store",
|
||||
]
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__(
|
||||
name="get-url",
|
||||
summary="List url associated with a file",
|
||||
usage="@1 | get-url",
|
||||
arg=[
|
||||
SharedArgs.HASH,
|
||||
SharedArgs.STORE,
|
||||
],
|
||||
detail=[
|
||||
"- Lists all url associated with file identified by hash+store",
|
||||
],
|
||||
exec=self.run,
|
||||
)
|
||||
self.register()
|
||||
|
||||
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
"""Get url for file via hash+store backend."""
|
||||
@@ -53,18 +65,34 @@ class Get_Url(Cmdlet):
|
||||
|
||||
urls = backend.get_url(file_hash)
|
||||
|
||||
if urls:
|
||||
for u in urls:
|
||||
# Emit rich object for pipeline compatibility
|
||||
ctx.emit({
|
||||
"url": u,
|
||||
"hash": file_hash,
|
||||
"store": store_name,
|
||||
})
|
||||
return 0
|
||||
else:
|
||||
ctx.emit("No url found")
|
||||
return 0
|
||||
from result_table import ResultTable
|
||||
|
||||
title = str(get_field(result, "title") or "").strip()
|
||||
table_title = "Title"
|
||||
if title:
|
||||
table_title = f"Title: {title}"
|
||||
|
||||
table = ResultTable(table_title, max_columns=1).set_preserve_order(True)
|
||||
table.set_source_command("get-url", [])
|
||||
|
||||
items: List[UrlItem] = []
|
||||
for u in list(urls or []):
|
||||
u = str(u or "").strip()
|
||||
if not u:
|
||||
continue
|
||||
row = table.add_row()
|
||||
row.add_column("Url", u)
|
||||
item = UrlItem(url=u, hash=file_hash, store=str(store_name))
|
||||
items.append(item)
|
||||
ctx.emit(item)
|
||||
|
||||
# Make this a real result table so @.. / @,, can navigate it
|
||||
ctx.set_last_result_table(table if items else None, items, subject=result)
|
||||
|
||||
if not items:
|
||||
log("No url found", file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
except KeyError:
|
||||
log(f"Error: Storage backend '{store_name}' not configured")
|
||||
@@ -74,7 +102,6 @@ class Get_Url(Cmdlet):
|
||||
return 1
|
||||
|
||||
|
||||
# Register cmdlet
|
||||
register(["get-url", "get_url"])(Get_Url)
|
||||
CMDLET = Get_Url()
|
||||
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Sequence, List, Optional, Tuple
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, field
|
||||
from collections import OrderedDict
|
||||
import re
|
||||
import json
|
||||
@@ -11,57 +10,9 @@ import sys
|
||||
|
||||
from SYS.logger import log, debug
|
||||
|
||||
from ._shared import Cmdlet, CmdletArg, get_field, should_show_help
|
||||
from ._shared import Cmdlet, CmdletArg, get_field, should_show_help, normalize_hash, first_title_tag
|
||||
import pipeline as ctx
|
||||
|
||||
# Optional dependencies
|
||||
try:
|
||||
import mutagen # type: ignore
|
||||
except ImportError: # pragma: no cover
|
||||
mutagen = None # type: ignore
|
||||
|
||||
try:
|
||||
from config import get_hydrus_url, resolve_output_dir
|
||||
except Exception: # pragma: no cover
|
||||
get_hydrus_url = None # type: ignore
|
||||
resolve_output_dir = None # type: ignore
|
||||
|
||||
try:
|
||||
from API.HydrusNetwork import HydrusNetwork, HydrusRequestError
|
||||
except ImportError: # pragma: no cover
|
||||
HydrusNetwork = None # type: ignore
|
||||
HydrusRequestError = RuntimeError # type: ignore
|
||||
|
||||
try:
|
||||
from SYS.utils import sha256_file
|
||||
except ImportError: # pragma: no cover
|
||||
sha256_file = None # type: ignore
|
||||
|
||||
try:
|
||||
from SYS.utils_constant import mime_maps
|
||||
except ImportError: # pragma: no cover
|
||||
mime_maps = {} # type: ignore
|
||||
|
||||
@dataclass(slots=True)
|
||||
class SearchRecord:
|
||||
path: str
|
||||
size_bytes: int | None = None
|
||||
duration_seconds: str | None = None
|
||||
tag: str | None = None
|
||||
hash: str | None = None
|
||||
|
||||
def as_dict(self) -> dict[str, str]:
|
||||
payload: dict[str, str] = {"path": self.path}
|
||||
if self.size_bytes is not None:
|
||||
payload["size"] = str(self.size_bytes)
|
||||
if self.duration_seconds:
|
||||
payload["duration"] = self.duration_seconds
|
||||
if self.tag:
|
||||
payload["tag"] = self.tag
|
||||
if self.hash:
|
||||
payload["hash"] = self.hash
|
||||
return payload
|
||||
|
||||
|
||||
STORAGE_ORIGINS = {"local", "hydrus", "folder"}
|
||||
|
||||
@@ -86,12 +37,15 @@ class Search_Store(Cmdlet):
|
||||
detail=[
|
||||
"Search across storage backends: Folder stores and Hydrus instances",
|
||||
"Use -store to search a specific backend by name",
|
||||
"URL search: url:* (any URL) or url:<value> (URL substring)",
|
||||
"Filter results by: tag, size, type, duration",
|
||||
"Results include hash for downstream commands (get-file, add-tag, etc.)",
|
||||
"Examples:",
|
||||
"search-store foo # Search all storage backends",
|
||||
"search-store -store home '*' # Search 'home' Hydrus instance",
|
||||
"search-store -store test 'video' # Search 'test' folder store",
|
||||
"search-store 'url:*' # Files that have any URL",
|
||||
"search-store 'url:youtube.com' # Files whose URL contains substring",
|
||||
"search-store song -type audio # Search for audio files",
|
||||
"search-store movie -tag action # Search with tag filter",
|
||||
],
|
||||
@@ -100,6 +54,40 @@ class Search_Store(Cmdlet):
|
||||
self.register()
|
||||
|
||||
# --- Helper methods -------------------------------------------------
@staticmethod
def _parse_hash_query(query: str) -> List[str]:
"""Parse a `hash:` query into a list of normalized 64-hex SHA256 hashes.

Supported examples:
- hash:<h1>,<h2>,<h3>
- Hash: <h1> <h2> <h3>
- hash:{<h1>, <h2>}
"""
|
||||
q = str(query or "").strip()
|
||||
if not q:
|
||||
return []
|
||||
|
||||
m = re.match(r"^hash(?:es)?\s*:\s*(.+)$", q, flags=re.IGNORECASE)
|
||||
if not m:
|
||||
return []
|
||||
|
||||
rest = (m.group(1) or "").strip()
|
||||
if rest.startswith("{") and rest.endswith("}"):
|
||||
rest = rest[1:-1].strip()
|
||||
if rest.startswith("[") and rest.endswith("]"):
|
||||
rest = rest[1:-1].strip()
|
||||
|
||||
# Split on commas and whitespace.
|
||||
raw_parts = [p.strip() for p in re.split(r"[\s,]+", rest) if p.strip()]
|
||||
out: List[str] = []
|
||||
for part in raw_parts:
|
||||
h = normalize_hash(part)
|
||||
if not h:
|
||||
continue
|
||||
if h not in out:
|
||||
out.append(h)
|
||||
return out
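# Illustrative behavior of the parser above (assuming normalize_hash accepts plain
# lowercase 64-hex digests; digests shortened to "a"*64 / "b"*64 for readability):
#
#   h1, h2 = "a" * 64, "b" * 64
#   Search_Store._parse_hash_query(f"hash:{h1},{h2}")    -> [h1, h2]
#   Search_Store._parse_hash_query(f"Hash: {h1} {h2}")   -> [h1, h2]
#   Search_Store._parse_hash_query("hash:{" + h1 + "}")  -> [h1]
#   Search_Store._parse_hash_query("title:foo")          -> []   (not a hash query)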
|
||||
|
||||
@staticmethod
|
||||
def _normalize_extension(ext_value: Any) -> str:
|
||||
"""Sanitize extension strings to alphanumerics and cap at 5 chars."""
|
||||
@@ -150,10 +138,10 @@ class Search_Store(Cmdlet):
|
||||
|
||||
# Parse arguments
|
||||
query = ""
|
||||
tag_filters: List[str] = []
|
||||
size_filter: Optional[Tuple[str, int]] = None
|
||||
duration_filter: Optional[Tuple[str, float]] = None
|
||||
type_filter: Optional[str] = None
|
||||
_tag_filters: List[str] = []
|
||||
_size_filter: Optional[Tuple[str, int]] = None
|
||||
_duration_filter: Optional[Tuple[str, float]] = None
|
||||
_type_filter: Optional[str] = None
|
||||
storage_backend: Optional[str] = None
|
||||
limit = 100
|
||||
searched_backends: List[str] = []
|
||||
@@ -166,7 +154,7 @@ class Search_Store(Cmdlet):
|
||||
storage_backend = args_list[i + 1]
|
||||
i += 2
|
||||
elif low in {"-tag", "--tag"} and i + 1 < len(args_list):
|
||||
tag_filters.append(args_list[i + 1])
|
||||
_tag_filters.append(args_list[i + 1])
|
||||
i += 2
|
||||
elif low in {"-limit", "--limit"} and i + 1 < len(args_list):
|
||||
try:
|
||||
@@ -175,7 +163,7 @@ class Search_Store(Cmdlet):
|
||||
limit = 100
|
||||
i += 2
|
||||
elif low in {"-type", "--type"} and i + 1 < len(args_list):
|
||||
type_filter = args_list[i + 1].lower()
|
||||
_type_filter = args_list[i + 1].lower()
|
||||
i += 2
|
||||
elif not arg.startswith("-"):
|
||||
query = f"{query} {arg}".strip() if query else arg
|
||||
@@ -195,6 +183,8 @@ class Search_Store(Cmdlet):
|
||||
if store_filter and not storage_backend:
|
||||
storage_backend = store_filter
|
||||
|
||||
hash_query = self._parse_hash_query(query)
|
||||
|
||||
if not query:
|
||||
log("Provide a search query", file=sys.stderr)
|
||||
return 1
|
||||
@@ -230,12 +220,136 @@ class Search_Store(Cmdlet):
|
||||
table_title += f" [{storage_backend}]"
|
||||
|
||||
table = ResultTable(table_title)
|
||||
if hash_query:
|
||||
try:
|
||||
table.set_preserve_order(True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
from Store import Store
|
||||
storage = Store(config=config or {})
|
||||
from Store._base import Store as BaseStore
|
||||
|
||||
backend_to_search = storage_backend or None
|
||||
if hash_query:
|
||||
# Explicit hash list search: build rows from backend metadata.
|
||||
backends_to_try: List[str] = []
|
||||
if backend_to_search:
|
||||
backends_to_try = [backend_to_search]
|
||||
else:
|
||||
backends_to_try = list(storage.list_backends())
|
||||
|
||||
found_any = False
|
||||
for h in hash_query:
|
||||
resolved_backend_name: Optional[str] = None
|
||||
resolved_backend = None
|
||||
|
||||
for backend_name in backends_to_try:
|
||||
try:
|
||||
backend = storage[backend_name]
|
||||
except Exception:
|
||||
continue
|
||||
try:
|
||||
# If get_metadata works, consider it a hit; get_file can be optional (e.g. remote URL).
|
||||
meta = backend.get_metadata(h)
|
||||
if meta is None:
|
||||
continue
|
||||
resolved_backend_name = backend_name
|
||||
resolved_backend = backend
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if resolved_backend_name is None or resolved_backend is None:
|
||||
continue
|
||||
|
||||
found_any = True
|
||||
searched_backends.append(resolved_backend_name)
|
||||
|
||||
# Resolve a path/URL string if possible
|
||||
path_str: Optional[str] = None
|
||||
try:
|
||||
maybe_path = resolved_backend.get_file(h)
|
||||
if isinstance(maybe_path, Path):
|
||||
path_str = str(maybe_path)
|
||||
elif isinstance(maybe_path, str) and maybe_path:
|
||||
path_str = maybe_path
|
||||
except Exception:
|
||||
path_str = None
|
||||
|
||||
meta_obj: Dict[str, Any] = {}
|
||||
try:
|
||||
meta_obj = resolved_backend.get_metadata(h) or {}
|
||||
except Exception:
|
||||
meta_obj = {}
|
||||
|
||||
tags_list: List[str] = []
|
||||
try:
|
||||
tag_result = resolved_backend.get_tag(h)
|
||||
if isinstance(tag_result, tuple) and tag_result:
|
||||
maybe_tags = tag_result[0]
|
||||
else:
|
||||
maybe_tags = tag_result
|
||||
if isinstance(maybe_tags, list):
|
||||
tags_list = [str(t).strip() for t in maybe_tags if isinstance(t, str) and str(t).strip()]
|
||||
except Exception:
|
||||
tags_list = []
|
||||
|
||||
title_from_tag: Optional[str] = None
|
||||
try:
|
||||
title_tag = first_title_tag(tags_list)
|
||||
if title_tag and ":" in title_tag:
|
||||
title_from_tag = title_tag.split(":", 1)[1].strip()
|
||||
except Exception:
|
||||
title_from_tag = None
|
||||
|
||||
title = title_from_tag or meta_obj.get("title") or meta_obj.get("name")
|
||||
if not title and path_str:
|
||||
try:
|
||||
title = Path(path_str).stem
|
||||
except Exception:
|
||||
title = path_str
|
||||
|
||||
ext_val = meta_obj.get("ext") or meta_obj.get("extension")
|
||||
if not ext_val and path_str:
|
||||
try:
|
||||
ext_val = Path(path_str).suffix
|
||||
except Exception:
|
||||
ext_val = None
|
||||
|
||||
size_bytes = meta_obj.get("size")
|
||||
if size_bytes is None:
|
||||
size_bytes = meta_obj.get("size_bytes")
|
||||
try:
|
||||
size_bytes_int: Optional[int] = int(size_bytes) if size_bytes is not None else None
|
||||
except Exception:
|
||||
size_bytes_int = None
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"title": str(title or h),
|
||||
"hash": h,
|
||||
"store": resolved_backend_name,
|
||||
"path": path_str,
|
||||
"ext": self._normalize_extension(ext_val),
|
||||
"size_bytes": size_bytes_int,
|
||||
"tag": tags_list,
|
||||
}
|
||||
|
||||
table.add_result(payload)
|
||||
results_list.append(payload)
|
||||
ctx.emit(payload)
|
||||
|
||||
if found_any:
|
||||
ctx.set_last_result_table(table, results_list)
|
||||
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
|
||||
db.update_worker_status(worker_id, 'completed')
|
||||
return 0
|
||||
|
||||
log("No results found", file=sys.stderr)
|
||||
db.append_worker_stdout(worker_id, json.dumps([], indent=2))
|
||||
db.update_worker_status(worker_id, 'completed')
|
||||
return 0
|
||||
|
||||
if backend_to_search:
|
||||
searched_backends.append(backend_to_search)
|
||||
target_backend = storage[backend_to_search]
|
||||
@@ -243,7 +357,9 @@ class Search_Store(Cmdlet):
|
||||
log(f"Backend '{backend_to_search}' does not support searching", file=sys.stderr)
|
||||
db.update_worker_status(worker_id, 'error')
|
||||
return 1
|
||||
debug(f"[search-store] Searching '{backend_to_search}'")
|
||||
results = target_backend.search(query, limit=limit)
|
||||
debug(f"[search-store] '{backend_to_search}' -> {len(results or [])} result(s)")
|
||||
else:
|
||||
from API.HydrusNetwork import is_hydrus_available
|
||||
hydrus_available = is_hydrus_available(config or {})
|
||||
@@ -257,7 +373,9 @@ class Search_Store(Cmdlet):
|
||||
continue
|
||||
searched_backends.append(backend_name)
|
||||
|
||||
debug(f"[search-store] Searching '{backend_name}'")
|
||||
backend_results = backend.search(query, limit=limit - len(all_results))
|
||||
debug(f"[search-store] '{backend_name}' -> {len(backend_results or [])} result(s)")
|
||||
if backend_results:
|
||||
all_results.extend(backend_results)
|
||||
if len(all_results) >= limit:
|
||||
@@ -317,11 +435,6 @@ class Search_Store(Cmdlet):
|
||||
results_list.append(normalized)
|
||||
ctx.emit(normalized)
|
||||
|
||||
# Debug: Verify table rows match items list
|
||||
debug(f"[search-store] Added {len(table.rows)} rows to table, {len(results_list)} items to results_list")
|
||||
if len(table.rows) != len(results_list):
|
||||
debug(f"[search-store] WARNING: Table/items mismatch! rows={len(table.rows)} items={len(results_list)}", file=sys.stderr)
|
||||
|
||||
ctx.set_last_result_table(table, results_list)
|
||||
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
|
||||
else:
|
||||
|
||||
metadata.py
@@ -3,14 +3,12 @@ import re
|
||||
import subprocess
|
||||
import sys
|
||||
import shutil
|
||||
import sqlite3
|
||||
import requests
|
||||
from SYS.logger import log, debug
|
||||
from urllib.parse import urlsplit, urlunsplit, unquote
|
||||
from collections import deque
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
|
||||
from models import PipeObject, FileRelationshipTracker, _get_file_hash
|
||||
from models import FileRelationshipTracker
|
||||
try:
|
||||
import musicbrainzngs # type: ignore
|
||||
except ImportError: # pragma: no cover
|
||||
@@ -332,6 +330,112 @@ def _generate_hydrus_url_variants(url: str) -> List[str]:
|
||||
return variants
|
||||
|
||||
|
||||
def normalize_urls(value: Any) -> List[str]:
"""Normalize a URL field into a stable, deduplicated list.

Accepts:
- None
- a single URL string (optionally containing multiple URLs)
- a list/tuple/set of URL strings

This helper is used by cmdlets/stores/pipeline to keep `url` consistent.
"""
|
||||
|
||||
def _iter_raw_urls(raw: Any) -> Iterable[str]:
|
||||
if raw is None:
|
||||
return
|
||||
|
||||
if isinstance(raw, str):
|
||||
text = raw.strip()
|
||||
if not text:
|
||||
return
|
||||
# Support legacy prefixes like "url:https://...".
|
||||
if text.lower().startswith("url:"):
|
||||
text = text.split(":", 1)[1].strip()
|
||||
|
||||
# Prefer extracting obvious URLs to avoid splitting inside query strings.
|
||||
matches = re.findall(r"https?://[^\s,]+", text, flags=re.IGNORECASE)
|
||||
if matches:
|
||||
for m in matches:
|
||||
yield m
|
||||
return
|
||||
|
||||
# Fallback: split on commas/whitespace.
|
||||
for token in text.replace("\n", " ").replace("\r", " ").replace(",", " ").split():
|
||||
if token:
|
||||
yield token
|
||||
return
|
||||
|
||||
if isinstance(raw, (list, tuple, set)):
|
||||
for item in raw:
|
||||
if item is None:
|
||||
continue
|
||||
if isinstance(item, str):
|
||||
if item.strip():
|
||||
yield item
|
||||
else:
|
||||
text = str(item).strip()
|
||||
if text:
|
||||
yield text
|
||||
return
|
||||
|
||||
# Last resort: string-coerce.
|
||||
text = str(raw).strip()
|
||||
if text:
|
||||
yield text
|
||||
|
||||
def _canonicalize(url_text: str) -> Optional[str]:
|
||||
u = str(url_text or "").strip()
|
||||
if not u:
|
||||
return None
|
||||
|
||||
# Trim common wrappers and trailing punctuation.
|
||||
u = u.strip("<>\"' ")
|
||||
u = u.rstrip(")].,;\"")
|
||||
if not u:
|
||||
return None
|
||||
|
||||
lower = u.lower()
|
||||
if not (lower.startswith("http://") or lower.startswith("https://")):
|
||||
return u
|
||||
|
||||
try:
|
||||
parsed = urlsplit(u)
|
||||
except Exception:
|
||||
return u
|
||||
|
||||
scheme = (parsed.scheme or "").lower()
|
||||
netloc = (parsed.netloc or "").lower()
|
||||
path = unquote(parsed.path or "")
|
||||
query = parsed.query or ""
|
||||
|
||||
# Normalize default ports.
|
||||
if scheme == "http" and netloc.endswith(":80"):
|
||||
netloc = netloc[:-3]
|
||||
elif scheme == "https" and netloc.endswith(":443"):
|
||||
netloc = netloc[:-4]
|
||||
|
||||
# Prefer no trailing slash except root.
|
||||
if path and path != "/":
|
||||
path = path.rstrip("/")
|
||||
|
||||
# Fragments are not part of the resource.
|
||||
return urlunsplit((scheme, netloc, path, query, ""))
|
||||
|
||||
seen: Set[str] = set()
|
||||
out: List[str] = []
|
||||
for raw_url in _iter_raw_urls(value):
|
||||
canonical = _canonicalize(raw_url)
|
||||
if not canonical:
|
||||
continue
|
||||
if canonical in seen:
|
||||
continue
|
||||
seen.add(canonical)
|
||||
out.append(canonical)
|
||||
|
||||
return out
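# Example inputs and the normalized output of this helper (hypothetical URLs):
#
#   normalize_urls("url:https://Example.com:443/watch/")
#       -> ["https://example.com/watch"]
#   normalize_urls("see https://example.com/a, https://example.com/a/ and http://example.com:80/b")
#       -> ["https://example.com/a", "http://example.com/b"]
#   normalize_urls(["https://example.com/a#frag", None, "https://example.com/a"])
#       -> ["https://example.com/a"]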
|
||||
|
||||
|
||||
def value_normalize(value: str) -> str:
|
||||
"""Normalize whitespace: collapse internal spaces, strip, remove newlines."""
|
||||
value = value.replace("\n", " ").replace("\r", " ")
|
||||
@@ -358,6 +462,7 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
|
||||
continue
|
||||
|
||||
# Ensure file entry exists
|
||||
file_id: Optional[int] = None
|
||||
try:
|
||||
cursor = db.connection.cursor() if db.connection else None
|
||||
if cursor:
|
||||
@@ -394,10 +499,16 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
|
||||
try:
|
||||
cursor = db.connection.cursor() if db.connection else None
|
||||
if cursor:
|
||||
file_hash_value: Optional[str] = None
|
||||
if hasattr(db, 'get_file_hash'):
|
||||
try:
|
||||
file_hash_value = db.get_file_hash(file_id)
|
||||
except Exception:
|
||||
file_hash_value = None
|
||||
for tag in tags:
|
||||
cursor.execute(
|
||||
'INSERT OR IGNORE INTO tags (hash, tag) VALUES (?, ?)',
|
||||
(file_hash_value, tag) if hasattr(db, 'get_file_hash') else (None, tag)
|
||||
(file_hash_value, tag)
|
||||
)
|
||||
db.connection.commit()
|
||||
except Exception:
|
||||
@@ -663,128 +774,6 @@ def fetch_musicbrainz_tags(mbid: str, entity: str) -> Dict[str, object]:
|
||||
return {"source": "musicbrainz", "id": mbid, "tag": tags, "entity": entity}
|
||||
|
||||
|
||||
def fetch_openlibrary_tags(ol_id: str) -> Dict[str, object]:
|
||||
"""Fetch metadata tags from OpenLibrary.
|
||||
|
||||
Args:
|
||||
ol_id: OpenLibrary ID (e.g., 'OL123456M' for a book)
|
||||
|
||||
Returns:
|
||||
Dictionary with 'tag' key containing list of extracted tags
|
||||
"""
|
||||
import urllib.request
|
||||
|
||||
# Normalize OL ID
|
||||
ol_id = ol_id.strip().upper()
|
||||
if not ol_id.startswith('OL'):
|
||||
ol_id = f'OL{ol_id}'
|
||||
|
||||
# Fetch from OpenLibrary API
|
||||
url = f"https://openlibrary.org/books/{ol_id}.json"
|
||||
tags: List[str] = []
|
||||
|
||||
try:
|
||||
with urllib.request.urlopen(url, timeout=10) as response:
|
||||
data = json.loads(response.read().decode('utf-8'))
|
||||
except Exception as e:
|
||||
raise ValueError(f"Failed to fetch OpenLibrary data for {ol_id}: {e}")
|
||||
|
||||
# Add OpenLibrary ID tag
|
||||
_add_tag(tags, "openlibrary", ol_id)
|
||||
|
||||
# Extract title
|
||||
_add_tag(tags, "title", data.get("title"))
|
||||
|
||||
# Extract subtitle if present
|
||||
if data.get("subtitle"):
|
||||
_add_tag(tags, "subtitle", data["subtitle"])
|
||||
|
||||
# Extract authors
|
||||
authors = data.get("authors", [])
|
||||
author_names: List[str] = []
|
||||
for author in authors:
|
||||
if isinstance(author, dict):
|
||||
name = author.get("name")
|
||||
else:
|
||||
name = str(author)
|
||||
if name:
|
||||
author_names.append(name)
|
||||
if author_names:
|
||||
_extend_tags(tags, "author", author_names)
|
||||
|
||||
# Extract publication details
|
||||
if data.get("publish_date"):
|
||||
_add_tag(tags, "publish_date", data["publish_date"])
|
||||
# Extract year if present
|
||||
year_match = re.search(r'\b(\d{4})\b', str(data.get("publish_date", "")))
|
||||
if year_match:
|
||||
_add_tag(tags, "year", year_match.group(1))
|
||||
|
||||
# Extract publishers
|
||||
publishers = data.get("publishers", [])
|
||||
if publishers:
|
||||
publisher_names = []
|
||||
for pub in publishers:
|
||||
if isinstance(pub, dict):
|
||||
name = pub.get("name")
|
||||
else:
|
||||
name = str(pub)
|
||||
if name:
|
||||
publisher_names.append(name)
|
||||
if publisher_names:
|
||||
_extend_tags(tags, "publisher", publisher_names)
|
||||
|
||||
# Extract languages
|
||||
languages = data.get("languages", [])
|
||||
if languages:
|
||||
lang_codes = []
|
||||
for lang in languages:
|
||||
if isinstance(lang, dict):
|
||||
code = lang.get("key", "").split("/")[-1]
|
||||
else:
|
||||
code = str(lang).split("/")[-1]
|
||||
if code and code != "":
|
||||
lang_codes.append(code)
|
||||
if lang_codes:
|
||||
_extend_tags(tags, "language", lang_codes)
|
||||
|
||||
# Extract ISBN
|
||||
isbns = data.get("isbn_10", []) + data.get("isbn_13", [])
|
||||
if isbns:
|
||||
for isbn in isbns[:1]: # Just take first one
|
||||
if len(str(isbn)) == 10:
|
||||
_add_tag(tags, "isbn_10", isbn)
|
||||
elif len(str(isbn)) == 13:
|
||||
_add_tag(tags, "isbn_13", isbn)
|
||||
|
||||
# Extract page count
|
||||
_add_tag(tags, "pages", data.get("number_of_pages"))
|
||||
|
||||
# Extract genres/subjects (OpenLibrary calls them subjects)
|
||||
# Subjects are added as plain freeform tags (no namespace prefix)
|
||||
subjects = data.get("subjects", [])
|
||||
if subjects:
|
||||
for subject in subjects[:10]: # Limit to 10 subjects
|
||||
if isinstance(subject, dict):
|
||||
name = subject.get("name")
|
||||
else:
|
||||
name = str(subject)
|
||||
if name:
|
||||
# Add subject as plain tag without "subject:" prefix
|
||||
normalized = value_normalize(str(name))
|
||||
if normalized:
|
||||
tags.append(normalized)
|
||||
|
||||
# Extract OpenLibrary description
|
||||
description = data.get("description")
|
||||
if description:
|
||||
if isinstance(description, dict):
|
||||
description = description.get("value")
|
||||
_add_tag(tags, "summary", description)
|
||||
|
||||
return {"source": "openlibrary", "id": ol_id, "tag": tags}
|
||||
|
||||
|
||||
def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> None:
|
||||
"""Append a single value if not already in seen set (deduplication)."""
|
||||
if value is None:
|
||||
@@ -1545,7 +1534,7 @@ def _derive_sidecar_path(media_path: Path) -> Path:
|
||||
return preferred
|
||||
|
||||
|
||||
def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]:
|
||||
def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]: # pyright: ignore[reportUnusedFunction]
|
||||
"""Read hash, tags, and url from sidecar file.
|
||||
|
||||
Consolidated with read_tags_from_file - this extracts extra metadata (hash, url).
|
||||
@@ -1559,7 +1548,7 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]
|
||||
|
||||
hash_value: Optional[str] = None
|
||||
tags: List[str] = []
|
||||
url: List[str] = []
|
||||
urls: List[str] = []
|
||||
|
||||
for raw_line in raw.splitlines():
|
||||
line = raw_line.strip()
|
||||
@@ -1574,15 +1563,15 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]
|
||||
url_part = line.split(':', 1)[1].strip() if ':' in line else ''
|
||||
if url_part:
|
||||
for url_segment in url_part.split(','):
|
||||
for url in url_segment.split():
|
||||
url_clean = url.strip()
|
||||
if url_clean and url_clean not in url:
|
||||
url.append(url_clean)
|
||||
for url_token in url_segment.split():
|
||||
url_clean = url_token.strip()
|
||||
if url_clean and url_clean not in urls:
|
||||
urls.append(url_clean)
|
||||
else:
|
||||
# Everything else is a tag (including relationship: lines)
|
||||
tags.append(line)
|
||||
|
||||
return hash_value, tags, url
|
||||
return hash_value, tags, urls
|
||||
|
||||
|
||||
|
||||
@@ -1827,63 +1816,6 @@ def apply_title_to_path(media_path: Path, tags: Iterable[str]) -> Path:
|
||||
return destination
|
||||
|
||||
|
||||
def _collect_search_roots(payload: Dict[str, Any]) -> List[Path]:
|
||||
roots: List[Path] = []
|
||||
for key in ('paths', 'search_paths', 'roots', 'directories'):
|
||||
raw = payload.get(key)
|
||||
if not raw:
|
||||
continue
|
||||
entries = raw if isinstance(raw, (list, tuple, set)) else [raw]
|
||||
for entry in entries:
|
||||
if not entry:
|
||||
continue
|
||||
try:
|
||||
candidate = Path(str(entry)).expanduser()
|
||||
except Exception:
|
||||
continue
|
||||
roots.append(candidate)
|
||||
if load_config is not None and resolve_output_dir is not None:
|
||||
try:
|
||||
config = load_config()
|
||||
except Exception:
|
||||
config = None
|
||||
if isinstance(config, dict) and config:
|
||||
try:
|
||||
default_root = resolve_output_dir(config)
|
||||
except Exception:
|
||||
default_root = None
|
||||
if default_root is not None:
|
||||
roots.append(default_root)
|
||||
return roots
|
||||
|
||||
|
||||
def _locate_sidecar_by_hash(hash_value: str, roots: Iterable[Path]) -> Optional[Path]:
|
||||
target = f'hash:{hash_value.strip().lower()}'
|
||||
for root in roots:
|
||||
try:
|
||||
root_path = root.expanduser()
|
||||
except Exception:
|
||||
continue
|
||||
if not root_path.exists() or not root_path.is_dir():
|
||||
continue
|
||||
for pattern in ('*.tag',):
|
||||
try:
|
||||
iterator = root_path.rglob(pattern)
|
||||
except OSError:
|
||||
continue
|
||||
for candidate in iterator:
|
||||
if not candidate.is_file():
|
||||
continue
|
||||
try:
|
||||
with candidate.open('r', encoding='utf-8', errors='ignore') as handle:
|
||||
for line in handle:
|
||||
if line.strip().lower() == target:
|
||||
return candidate
|
||||
except OSError:
|
||||
continue
|
||||
return None
|
||||
|
||||
|
||||
def sync_sidecar(payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
path_value = payload.get('path')
|
||||
if not path_value:
|
||||
@@ -2506,8 +2438,8 @@ def write_tags_to_file(
|
||||
|
||||
# Add known url if provided - each on separate line to prevent corruption
|
||||
if url:
|
||||
for url in url:
|
||||
content_lines.append(f"url:{url}")
|
||||
for url_item in url:
|
||||
content_lines.append(f"url:{url_item}")
|
||||
|
||||
# Add tags
|
||||
if tags:
|
||||
@@ -2642,10 +2574,10 @@ def detect_metadata_request(tag: str) -> Optional[Dict[str, str]]:
|
||||
def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
tag = payload.get('tag')
|
||||
if not isinstance(tag, str):
|
||||
return {'tags': []}
|
||||
return {'tag': []}
|
||||
trimmed = value_normalize(tag)
|
||||
if not trimmed:
|
||||
return {'tags': []}
|
||||
return {'tag': []}
|
||||
request = detect_metadata_request(trimmed)
|
||||
tags: List[str] = []
|
||||
seen: Set[str] = set()
|
||||
@@ -2653,7 +2585,7 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
_append_unique(tags, seen, request['base'])
|
||||
else:
|
||||
_append_unique(tags, seen, trimmed)
|
||||
return {'tags': tags}
|
||||
return {'tag': tags}
|
||||
try:
|
||||
if request['source'] == 'imdb':
|
||||
data = imdb_tag(request['id'])
|
||||
@@ -2662,8 +2594,15 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
except Exception as exc: # pragma: no cover - network/service errors
|
||||
return {'tag': tags, 'error': str(exc)}
|
||||
# Add tags from fetched data (no namespace, just unique append)
|
||||
for tag in (data.get('tag') or []):
|
||||
_append_unique(tags, seen, tag)
|
||||
raw_tags = data.get('tag') if isinstance(data, dict) else None
|
||||
if isinstance(raw_tags, str):
|
||||
tag_iter: Iterable[str] = [raw_tags]
|
||||
elif isinstance(raw_tags, (list, tuple, set)):
|
||||
tag_iter = [t for t in raw_tags if isinstance(t, str)]
|
||||
else:
|
||||
tag_iter = []
|
||||
for tag_value in tag_iter:
|
||||
_append_unique(tags, seen, tag_value)
|
||||
result = {
|
||||
'tag': tags,
|
||||
'source': request['source'],
|
||||
@@ -3082,14 +3021,14 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
|
||||
# Load adjective.json from workspace root
|
||||
adjective_path = Path(__file__).parent / "adjective.json"
|
||||
if not adjective_path.exists():
|
||||
log.debug(f"adjective.json not found at {adjective_path}")
|
||||
debug(f"adjective.json not found at {adjective_path}")
|
||||
return tags_set
|
||||
|
||||
try:
|
||||
with open(adjective_path, 'r') as f:
|
||||
adjective_lists = json.load(f)
|
||||
except Exception as e:
|
||||
log.error(f"Error loading adjective.json: {e}")
|
||||
debug(f"Error loading adjective.json: {e}")
|
||||
return tags_set
|
||||
|
||||
expanded_tags = set()
|
||||
@@ -3108,10 +3047,10 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
|
||||
if matched_list:
|
||||
# Add all tags from the list
|
||||
expanded_tags.update(matched_list)
|
||||
log.info(f"Expanded {tag} to {len(matched_list)} tags")
|
||||
debug(f"Expanded {tag} to {len(matched_list)} tags")
|
||||
else:
|
||||
# List not found, log warning but don't add the reference
|
||||
log.warning(f"Tag list '{list_name}' not found in adjective.json")
|
||||
debug(f"Tag list '{list_name}' not found in adjective.json")
|
||||
else:
|
||||
# Regular tag, keep as is
|
||||
expanded_tags.add(tag)
|
||||
@@ -3194,98 +3133,6 @@ def build_book_tags(
|
||||
return deduped
|
||||
|
||||
|
||||
def fetch_openlibrary_metadata_tags(isbn: Optional[str] = None, olid: Optional[str] = None) -> List[str]:
|
||||
"""Fetch book metadata from OpenLibrary and return as tags.
|
||||
|
||||
Args:
|
||||
isbn: ISBN number (with or without isbn: prefix)
|
||||
olid: OpenLibrary ID
|
||||
|
||||
Returns:
|
||||
List of tags extracted from OpenLibrary metadata
|
||||
"""
|
||||
metadata_tags = []
|
||||
|
||||
# Try OLID first (preferred), then ISBN
|
||||
url = None
|
||||
|
||||
if olid:
|
||||
# Clean up OLID format
|
||||
olid_clean = str(olid).replace('OL', '').replace('M', '').replace('W', '')
|
||||
if olid_clean.isdigit():
|
||||
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
|
||||
else:
|
||||
url = f"https://openlibrary.org/books/{olid}.json"
|
||||
elif isbn:
|
||||
# Clean up ISBN
|
||||
isbn_clean = str(isbn).replace('isbn:', '').strip()
|
||||
url = f"https://openlibrary.org/isbn/{isbn_clean}.json"
|
||||
|
||||
if not url:
|
||||
return metadata_tags
|
||||
|
||||
try:
|
||||
response = requests.get(url, timeout=10)
|
||||
if response.status_code != 200:
|
||||
return metadata_tags
|
||||
|
||||
data = response.json()
|
||||
if not data:
|
||||
return metadata_tags
|
||||
|
||||
# Extract title
|
||||
if 'title' in data:
|
||||
metadata_tags.append(f"title:{data['title']}")
|
||||
|
||||
# Extract authors
|
||||
if 'authors' in data and isinstance(data['authors'], list):
|
||||
for author in data['authors'][:3]:
|
||||
if isinstance(author, dict) and 'name' in author:
|
||||
metadata_tags.append(f"author:{author['name']}")
|
||||
elif isinstance(author, str):
|
||||
metadata_tags.append(f"author:{author}")
|
||||
|
||||
# Extract publish date
|
||||
if 'publish_date' in data:
|
||||
metadata_tags.append(f"publish_date:{data['publish_date']}")
|
||||
|
||||
# Extract publishers
|
||||
if 'publishers' in data and isinstance(data['publishers'], list):
|
||||
for pub in data['publishers'][:1]:
|
||||
if isinstance(pub, dict) and 'name' in pub:
|
||||
metadata_tags.append(f"publisher:{pub['name']}")
|
||||
elif isinstance(pub, str):
|
||||
metadata_tags.append(f"publisher:{pub}")
|
||||
|
||||
# Extract number of pages
|
||||
if 'number_of_pages' in data:
|
||||
page_count = data['number_of_pages']
|
||||
if page_count and isinstance(page_count, int) and page_count > 0:
|
||||
metadata_tags.append(f"pages:{page_count}")
|
||||
|
||||
# Extract language
|
||||
if 'languages' in data and isinstance(data['languages'], list) and data['languages']:
|
||||
lang = data['languages'][0]
|
||||
if isinstance(lang, dict) and 'key' in lang:
|
||||
lang_code = lang['key'].split('/')[-1]
|
||||
metadata_tags.append(f"language:{lang_code}")
|
||||
elif isinstance(lang, str):
|
||||
metadata_tags.append(f"language:{lang}")
|
||||
|
||||
# Extract subjects as freeform tags (limit to 5)
|
||||
if 'subjects' in data and isinstance(data['subjects'], list):
|
||||
for subject in data['subjects'][:5]:
|
||||
if subject and isinstance(subject, str):
|
||||
subject_clean = str(subject).strip()
|
||||
if subject_clean:
|
||||
metadata_tags.append(subject_clean)
|
||||
|
||||
except Exception as e:
|
||||
debug(f"⚠ Failed to fetch OpenLibrary metadata: {e}")
|
||||
|
||||
return metadata_tags
|
||||
|
||||
|
||||
def enrich_playlist_entries(entries: list, extractor: str) -> list:
|
||||
"""Enrich playlist entries with full metadata by fetching individual entry info.
|
||||
|
||||
@@ -3312,7 +3159,7 @@ def enrich_playlist_entries(entries: list, extractor: str) -> list:
|
||||
if entry_url and is_url_supported_by_ytdlp(entry_url):
|
||||
try:
|
||||
import yt_dlp
|
||||
ydl_opts = {
|
||||
ydl_opts: Any = {
|
||||
"quiet": True,
|
||||
"no_warnings": True,
|
||||
"skip_download": True,
|
||||
@@ -3690,294 +3537,3 @@ def extract_url_formats(formats: list) -> List[Tuple[str, str]]:
|
||||
return []
|
||||
|
||||
|
||||
def scrape_isbn_metadata(isbn: str) -> List[str]:
|
||||
"""Scrape metadata for an ISBN using Open Library API."""
|
||||
new_tags = []
|
||||
try:
|
||||
from API.HTTP import HTTPClient
|
||||
import json as json_module
|
||||
|
||||
isbn_clean = isbn.replace('-', '').strip()
|
||||
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
|
||||
|
||||
try:
|
||||
with HTTPClient() as client:
|
||||
response = client.get(url)
|
||||
response.raise_for_status()
|
||||
data = json_module.loads(response.content.decode('utf-8'))
|
||||
except Exception as e:
|
||||
log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
if not data:
|
||||
log(f"No ISBN metadata found for: {isbn}")
|
||||
return []
|
||||
|
||||
book_data = next(iter(data.values()), None)
|
||||
if not book_data:
|
||||
return []
|
||||
|
||||
if 'title' in book_data:
|
||||
new_tags.append(f"title:{book_data['title']}")
|
||||
|
||||
if 'authors' in book_data and isinstance(book_data['authors'], list):
|
||||
for author in book_data['authors'][:3]:
|
||||
if 'name' in author:
|
||||
new_tags.append(f"author:{author['name']}")
|
||||
|
||||
if 'publish_date' in book_data:
|
||||
new_tags.append(f"publish_date:{book_data['publish_date']}")
|
||||
|
||||
if 'publishers' in book_data and isinstance(book_data['publishers'], list):
|
||||
for pub in book_data['publishers'][:1]:
|
||||
if 'name' in pub:
|
||||
new_tags.append(f"publisher:{pub['name']}")
|
||||
|
||||
if 'description' in book_data:
|
||||
desc = book_data['description']
|
||||
if isinstance(desc, dict) and 'value' in desc:
|
||||
desc = desc['value']
|
||||
if desc:
|
||||
desc_str = str(desc).strip()
|
||||
# Include description if available (limit to 200 chars to keep it manageable)
|
||||
if len(desc_str) > 0:
|
||||
new_tags.append(f"description:{desc_str[:200]}")
|
||||
|
||||
if 'number_of_pages' in book_data:
|
||||
page_count = book_data['number_of_pages']
|
||||
if page_count and isinstance(page_count, int) and page_count > 0:
|
||||
new_tags.append(f"pages:{page_count}")
|
||||
|
||||
if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict):
|
||||
identifiers = book_data['identifiers']
|
||||
|
||||
if 'openlibrary' in identifiers:
|
||||
ol_ids = identifiers['openlibrary']
|
||||
if isinstance(ol_ids, list) and ol_ids:
|
||||
new_tags.append(f"openlibrary:{ol_ids[0]}")
|
||||
elif isinstance(ol_ids, str):
|
||||
new_tags.append(f"openlibrary:{ol_ids}")
|
||||
|
||||
if 'lccn' in identifiers:
|
||||
lccn_list = identifiers['lccn']
|
||||
if isinstance(lccn_list, list) and lccn_list:
|
||||
new_tags.append(f"lccn:{lccn_list[0]}")
|
||||
elif isinstance(lccn_list, str):
|
||||
new_tags.append(f"lccn:{lccn_list}")
|
||||
|
||||
if 'oclc' in identifiers:
|
||||
oclc_list = identifiers['oclc']
|
||||
if isinstance(oclc_list, list) and oclc_list:
|
||||
new_tags.append(f"oclc:{oclc_list[0]}")
|
||||
elif isinstance(oclc_list, str):
|
||||
new_tags.append(f"oclc:{oclc_list}")
|
||||
|
||||
if 'goodreads' in identifiers:
|
||||
goodreads_list = identifiers['goodreads']
|
||||
if isinstance(goodreads_list, list) and goodreads_list:
|
||||
new_tags.append(f"goodreads:{goodreads_list[0]}")
|
||||
elif isinstance(goodreads_list, str):
|
||||
new_tags.append(f"goodreads:{goodreads_list}")
|
||||
|
||||
if 'librarything' in identifiers:
|
||||
lt_list = identifiers['librarything']
|
||||
if isinstance(lt_list, list) and lt_list:
|
||||
new_tags.append(f"librarything:{lt_list[0]}")
|
||||
elif isinstance(lt_list, str):
|
||||
new_tags.append(f"librarything:{lt_list}")
|
||||
|
||||
if 'doi' in identifiers:
|
||||
doi_list = identifiers['doi']
|
||||
if isinstance(doi_list, list) and doi_list:
|
||||
new_tags.append(f"doi:{doi_list[0]}")
|
||||
elif isinstance(doi_list, str):
|
||||
new_tags.append(f"doi:{doi_list}")
|
||||
|
||||
if 'internet_archive' in identifiers:
|
||||
ia_list = identifiers['internet_archive']
|
||||
if isinstance(ia_list, list) and ia_list:
|
||||
new_tags.append(f"internet_archive:{ia_list[0]}")
|
||||
elif isinstance(ia_list, str):
|
||||
new_tags.append(f"internet_archive:{ia_list}")
|
||||
|
||||
log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
|
||||
return new_tags
|
||||
except Exception as e:
|
||||
log(f"ISBN scraping error: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
|
||||
def scrape_openlibrary_metadata(olid: str) -> List[str]:
|
||||
"""Scrape metadata for an OpenLibrary ID using the .json API endpoint.
|
||||
|
||||
Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
|
||||
- Title, authors, publish date, publishers
|
||||
- Description
|
||||
- Subjects as freeform tags (without namespace prefix)
|
||||
- Identifiers (ISBN, LCCN, OCLC, etc.)
|
||||
"""
|
||||
new_tags = []
|
||||
try:
|
||||
from API.HTTP import HTTPClient
|
||||
import json as json_module
|
||||
|
||||
# Format: OL9674499M or just 9674499M
|
||||
olid_clean = olid.replace('OL', '').replace('M', '')
|
||||
if not olid_clean.isdigit():
|
||||
olid_clean = olid
|
||||
|
||||
# Ensure we have the full OLID format for the URL
|
||||
if not olid.startswith('OL'):
|
||||
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
|
||||
else:
|
||||
url = f"https://openlibrary.org/books/{olid}.json"
|
||||
|
||||
try:
|
||||
with HTTPClient() as client:
|
||||
response = client.get(url)
|
||||
response.raise_for_status()
|
||||
data = json_module.loads(response.content.decode('utf-8'))
|
||||
except Exception as e:
|
||||
log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
if not data:
|
||||
log(f"No OpenLibrary metadata found for: {olid}")
|
||||
return []
|
||||
|
||||
# Add title
|
||||
if 'title' in data:
|
||||
new_tags.append(f"title:{data['title']}")
|
||||
|
||||
# Add authors
|
||||
if 'authors' in data and isinstance(data['authors'], list):
|
||||
for author in data['authors'][:3]:
|
||||
if isinstance(author, dict) and 'name' in author:
|
||||
new_tags.append(f"author:{author['name']}")
|
||||
elif isinstance(author, str):
|
||||
new_tags.append(f"author:{author}")
|
||||
|
||||
# Add publish date
|
||||
if 'publish_date' in data:
|
||||
new_tags.append(f"publish_date:{data['publish_date']}")
|
||||
|
||||
# Add publishers
|
||||
if 'publishers' in data and isinstance(data['publishers'], list):
|
||||
for pub in data['publishers'][:1]:
|
||||
if isinstance(pub, dict) and 'name' in pub:
|
||||
new_tags.append(f"publisher:{pub['name']}")
|
||||
elif isinstance(pub, str):
|
||||
new_tags.append(f"publisher:{pub}")
|
||||
|
||||
# Add description
|
||||
if 'description' in data:
|
||||
desc = data['description']
|
||||
if isinstance(desc, dict) and 'value' in desc:
|
||||
desc = desc['value']
|
||||
if desc:
|
||||
desc_str = str(desc).strip()
|
||||
if len(desc_str) > 0:
|
||||
new_tags.append(f"description:{desc_str[:200]}")
|
||||
|
||||
# Add number of pages
|
||||
if 'number_of_pages' in data:
|
||||
page_count = data['number_of_pages']
|
||||
if page_count and isinstance(page_count, int) and page_count > 0:
|
||||
new_tags.append(f"pages:{page_count}")
|
||||
|
||||
# Add subjects as FREEFORM tags (no namespace prefix)
|
||||
if 'subjects' in data and isinstance(data['subjects'], list):
|
||||
for subject in data['subjects'][:10]:
|
||||
if subject and isinstance(subject, str):
|
||||
subject_clean = str(subject).strip()
|
||||
if subject_clean and subject_clean not in new_tags:
|
||||
new_tags.append(subject_clean)
|
||||
|
||||
# Add identifiers
|
||||
if 'identifiers' in data and isinstance(data['identifiers'], dict):
|
||||
identifiers = data['identifiers']
|
||||
|
||||
if 'isbn_10' in identifiers:
|
||||
isbn_10_list = identifiers['isbn_10']
|
||||
if isinstance(isbn_10_list, list) and isbn_10_list:
|
||||
new_tags.append(f"isbn_10:{isbn_10_list[0]}")
|
||||
elif isinstance(isbn_10_list, str):
|
||||
new_tags.append(f"isbn_10:{isbn_10_list}")
|
||||
|
||||
if 'isbn_13' in identifiers:
|
||||
isbn_13_list = identifiers['isbn_13']
|
||||
if isinstance(isbn_13_list, list) and isbn_13_list:
|
||||
new_tags.append(f"isbn_13:{isbn_13_list[0]}")
|
||||
elif isinstance(isbn_13_list, str):
|
||||
new_tags.append(f"isbn_13:{isbn_13_list}")
|
||||
|
||||
if 'lccn' in identifiers:
|
||||
lccn_list = identifiers['lccn']
|
||||
if isinstance(lccn_list, list) and lccn_list:
|
||||
new_tags.append(f"lccn:{lccn_list[0]}")
|
||||
elif isinstance(lccn_list, str):
|
||||
new_tags.append(f"lccn:{lccn_list}")
|
||||
|
||||
if 'oclc_numbers' in identifiers:
|
||||
oclc_list = identifiers['oclc_numbers']
|
||||
if isinstance(oclc_list, list) and oclc_list:
|
||||
new_tags.append(f"oclc:{oclc_list[0]}")
|
||||
elif isinstance(oclc_list, str):
|
||||
new_tags.append(f"oclc:{oclc_list}")
|
||||
|
||||
if 'goodreads' in identifiers:
|
||||
goodreads_list = identifiers['goodreads']
|
||||
if isinstance(goodreads_list, list) and goodreads_list:
|
||||
new_tags.append(f"goodreads:{goodreads_list[0]}")
|
||||
elif isinstance(goodreads_list, str):
|
||||
new_tags.append(f"goodreads:{goodreads_list}")
|
||||
|
||||
log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
|
||||
return new_tags
|
||||
except Exception as e:
|
||||
log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
|
||||
def perform_metadata_scraping(tags_list: List[str]) -> List[str]:
|
||||
"""Perform scraping based on identifiers in tags.
|
||||
|
||||
Priority order:
|
||||
1. openlibrary: (preferred - more complete metadata)
|
||||
2. isbn_10 or isbn (fallback)
|
||||
"""
|
||||
identifiers = extract_scrapable_identifiers(tags_list)
|
||||
|
||||
if not identifiers:
|
||||
log("No scrapable identifiers found (openlibrary, ISBN, musicbrainz, imdb)")
|
||||
return []
|
||||
|
||||
log(f"Found scrapable identifiers: {', '.join(identifiers.keys())}")
|
||||
|
||||
new_tags = []
|
||||
|
||||
# Prefer OpenLibrary over ISBN (more complete metadata)
|
||||
if 'openlibrary' in identifiers:
|
||||
olid = identifiers['openlibrary']
|
||||
if olid:
|
||||
log(f"Scraping OpenLibrary: {olid}")
|
||||
new_tags.extend(scrape_openlibrary_metadata(olid))
|
||||
elif 'isbn_13' in identifiers or 'isbn_10' in identifiers or 'isbn' in identifiers:
|
||||
isbn = identifiers.get('isbn_13') or identifiers.get('isbn_10') or identifiers.get('isbn')
|
||||
if isbn:
|
||||
log(f"Scraping ISBN: {isbn}")
|
||||
new_tags.extend(scrape_isbn_metadata(isbn))
|
||||
|
||||
existing_tags_lower = {tag.lower() for tag in tags_list}
|
||||
scraped_unique = []
|
||||
seen = set()
|
||||
for tag in new_tags:
|
||||
tag_lower = tag.lower()
|
||||
if tag_lower not in existing_tags_lower and tag_lower not in seen:
|
||||
scraped_unique.append(tag)
|
||||
seen.add(tag_lower)
|
||||
|
||||
if scraped_unique:
|
||||
log(f"Added {len(scraped_unique)} new tag(s) from scraping")
|
||||
|
||||
return scraped_unique
|
||||
|
||||
29
models.py
29
models.py
@@ -150,6 +150,35 @@ class PipeObject:
|
||||
# Truncate key if needed
|
||||
key_display = key if len(key) <= 15 else key[:12] + "..."
|
||||
debug(f"│ {key_display:<15}: {val_display:<42}│")
|
||||
|
||||
# If we have structured provider metadata, expand it for debugging.
|
||||
full_md = self.extra.get("full_metadata")
|
||||
if isinstance(full_md, dict) and full_md:
|
||||
debug("├─────────────────────────────────────────────────────────────┤")
|
||||
debug("│ full_metadata: │")
|
||||
for md_key in sorted(full_md.keys(), key=lambda x: str(x)):
|
||||
md_val = full_md.get(md_key)
|
||||
if isinstance(md_val, (str, int, float)) or md_val is None or isinstance(md_val, bool):
|
||||
md_display = str(md_val)
|
||||
elif isinstance(md_val, list):
|
||||
if len(md_val) <= 6 and all(isinstance(x, (str, int, float, bool)) or x is None for x in md_val):
|
||||
md_display = "[" + ", ".join(str(x) for x in md_val) + "]"
|
||||
else:
|
||||
md_display = f"list({len(md_val)})"
|
||||
elif isinstance(md_val, dict):
|
||||
# Avoid dumping huge nested dicts (like raw provider docs).
|
||||
keys = list(md_val.keys())
|
||||
preview = ",".join(str(k) for k in keys[:6])
|
||||
md_display = f"dict({len(keys)})[{preview}{',...' if len(keys) > 6 else ''}]"
|
||||
else:
|
||||
md_str = str(md_val)
|
||||
md_display = md_str if len(md_str) <= 40 else md_str[:37] + "..."
|
||||
|
||||
md_key_display = str(md_key)
|
||||
md_key_display = md_key_display if len(md_key_display) <= 15 else md_key_display[:12] + "..."
|
||||
if len(md_display) > 42:
|
||||
md_display = md_display[:39] + "..."
|
||||
debug(f"│ {md_key_display:<15}: {md_display:<42}│")
|
||||
|
||||
if self.action:
|
||||
debug("├─────────────────────────────────────────────────────────────┤")
|
||||
|
||||
14
pipeline.py
14
pipeline.py
@@ -575,7 +575,12 @@ def restore_previous_result_table() -> bool:
|
||||
_DISPLAY_ITEMS = []
|
||||
_DISPLAY_TABLE = None
|
||||
_DISPLAY_SUBJECT = None
|
||||
return True
|
||||
# If an underlying table exists, we're done.
|
||||
# Otherwise, fall through to history restore so @.. actually returns to the last table.
|
||||
if _LAST_RESULT_TABLE is not None:
|
||||
return True
|
||||
if not _RESULT_TABLE_HISTORY:
|
||||
return True
|
||||
|
||||
if not _RESULT_TABLE_HISTORY:
|
||||
return False
|
||||
@@ -613,7 +618,12 @@ def restore_next_result_table() -> bool:
|
||||
_DISPLAY_ITEMS = []
|
||||
_DISPLAY_TABLE = None
|
||||
_DISPLAY_SUBJECT = None
|
||||
return True
|
||||
# If an underlying table exists, we're done.
|
||||
# Otherwise, fall through to forward restore when available.
|
||||
if _LAST_RESULT_TABLE is not None:
|
||||
return True
|
||||
if not _RESULT_TABLE_FORWARD:
|
||||
return True
|
||||
|
||||
if not _RESULT_TABLE_FORWARD:
|
||||
return False
|
||||
|
||||
336
test-login.py
Normal file
336
test-login.py
Normal file
@@ -0,0 +1,336 @@
|
||||
import requests
|
||||
import random, string
|
||||
from concurrent import futures
|
||||
from tqdm import tqdm
|
||||
import time
|
||||
from datetime import datetime
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import json
|
||||
import re
|
||||
import base64
|
||||
import hashlib
|
||||
from Crypto.Cipher import AES
|
||||
from Crypto.Util import Counter
|
||||
|
||||
def display_error(response, message):
|
||||
print(message)
|
||||
print(response)
|
||||
print(response.text)
|
||||
exit()
|
||||
|
||||
def get_book_infos(session, url):
|
||||
r = session.get(url).text
|
||||
infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&")
|
||||
response = session.get(infos_url)
|
||||
data = response.json()['data']
|
||||
title = data['brOptions']['bookTitle'].strip().replace(" ", "_")
|
||||
title = ''.join( c for c in title if c not in '<>:"/\\|?*' ) # Filter forbidden chars in directory names (Windows & Linux)
|
||||
title = title[:150] # Trim the title to avoid long file names
|
||||
metadata = data['metadata']
|
||||
links = []
|
||||
for item in data['brOptions']['data']:
|
||||
for page in item:
|
||||
links.append(page['uri'])
|
||||
|
||||
if len(links) > 1:
|
||||
print(f"[+] Found {len(links)} pages")
|
||||
return title, links, metadata
|
||||
else:
|
||||
print(f"[-] Error while getting image links")
|
||||
exit()
|
||||
|
||||
def login(email, password):
|
||||
session = requests.Session()
|
||||
response = session.get("https://archive.org/services/account/login/")
|
||||
login_data = response.json()
|
||||
if not login_data['success']:
|
||||
display_error(response, "[-] Error while getting login token:")
|
||||
|
||||
login_token = login_data["value"]["token"]
|
||||
|
||||
headers = {"Content-Type": "application/x-www-form-urlencoded"}
|
||||
data = {"username":email, "password":password, "t": login_token}
|
||||
|
||||
response = session.post("https://archive.org/services/account/login/", headers=headers, data=json.dumps(data))
|
||||
try:
|
||||
response_json = response.json()
|
||||
except:
|
||||
display_error(response, "[-] Error while login:")
|
||||
|
||||
if response_json["success"] == False:
|
||||
if response_json["value"] == "bad_login":
|
||||
print("[-] Invalid credentials!")
|
||||
exit()
|
||||
display_error(response, "[-] Error while login:")
|
||||
else:
|
||||
print("[+] Successful login")
|
||||
return session
|
||||
|
||||
def loan(session, book_id, verbose=True):
|
||||
data = {
|
||||
"action": "grant_access",
|
||||
"identifier": book_id
|
||||
}
|
||||
response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data)
|
||||
data['action'] = "browse_book"
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data)
|
||||
|
||||
if response.status_code == 400 :
|
||||
try:
|
||||
if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
|
||||
print("This book doesn't need to be borrowed")
|
||||
return session
|
||||
else :
|
||||
display_error(response, "Something went wrong when trying to borrow the book.")
|
||||
except: # The response is not in JSON format
|
||||
display_error(response, "The book cannot be borrowed")
|
||||
|
||||
data['action'] = "create_token"
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data)
|
||||
|
||||
if "token" in response.text:
|
||||
if verbose:
|
||||
print("[+] Successful loan")
|
||||
return session
|
||||
else:
|
||||
display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
|
||||
|
||||
def return_loan(session, book_id):
|
||||
data = {
|
||||
"action": "return_loan",
|
||||
"identifier": book_id
|
||||
}
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data)
|
||||
if response.status_code == 200 and response.json()["success"]:
|
||||
print("[+] Book returned")
|
||||
else:
|
||||
display_error(response, "Something went wrong when trying to return the book")
|
||||
|
||||
def image_name(pages, page, directory):
|
||||
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
|
||||
|
||||
def deobfuscate_image(image_data, link, obf_header):
|
||||
"""
|
||||
@Author: https://github.com/justimm
|
||||
Decrypts the first 1024 bytes of image_data using AES-CTR.
|
||||
The obfuscation_header is expected in the form "1|<base64encoded_counter>"
|
||||
where the base64-decoded counter is 16 bytes.
|
||||
We derive the AES key by taking the SHA-1 digest of the image URL (with protocol/host removed)
|
||||
and using the first 16 bytes.
|
||||
For AES-CTR, we use a 16-byte counter block. The first 8 bytes are used as a fixed prefix,
|
||||
and the remaining 8 bytes (interpreted as a big-endian integer) are used as the initial counter value.
|
||||
"""
|
||||
try:
|
||||
version, counter_b64 = obf_header.split('|')
|
||||
except Exception as e:
|
||||
raise ValueError("Invalid X-Obfuscate header format") from e
|
||||
|
||||
if version != '1':
|
||||
raise ValueError("Unsupported obfuscation version: " + version)
|
||||
|
||||
# Derive AES key: replace protocol/host in link with '/'
|
||||
aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link)
|
||||
sha1_digest = hashlib.sha1(aesKey.encode('utf-8')).digest()
|
||||
key = sha1_digest[:16]
|
||||
|
||||
# Decode the counter (should be 16 bytes)
|
||||
counter_bytes = base64.b64decode(counter_b64)
|
||||
if len(counter_bytes) != 16:
|
||||
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
|
||||
|
||||
prefix = counter_bytes[:8]
|
||||
initial_value = int.from_bytes(counter_bytes[8:], byteorder='big')
|
||||
|
||||
# Create AES-CTR cipher with a 64-bit counter length.
|
||||
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False)
|
||||
cipher = AES.new(key, AES.MODE_CTR, counter=ctr)
|
||||
|
||||
decrypted_part = cipher.decrypt(image_data[:1024])
|
||||
new_data = decrypted_part + image_data[1024:]
|
||||
return new_data
|
||||
|
||||
def download_one_image(session, link, i, directory, book_id, pages):
|
||||
headers = {
|
||||
"Referer": "https://archive.org/",
|
||||
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
|
||||
"Sec-Fetch-Site": "same-site",
|
||||
"Sec-Fetch-Mode": "no-cors",
|
||||
"Sec-Fetch-Dest": "image",
|
||||
}
|
||||
retry = True
|
||||
response = None
|
||||
while retry:
|
||||
try:
|
||||
response = session.get(link, headers=headers)
|
||||
if response.status_code == 403:
|
||||
session = loan(session, book_id, verbose=False)
|
||||
raise Exception("Borrow again")
|
||||
elif response.status_code == 200:
|
||||
retry = False
|
||||
except:
|
||||
time.sleep(1) # Wait 1 second before retrying
|
||||
|
||||
image = image_name(pages, i, directory)
|
||||
|
||||
obf_header = response.headers.get("X-Obfuscate")
|
||||
image_content = None
|
||||
if obf_header:
|
||||
try:
|
||||
image_content = deobfuscate_image(response.content, link, obf_header)
|
||||
except Exception as e:
|
||||
print(f"[ERROR] Deobfuscation failed: {e}")
|
||||
return
|
||||
else:
|
||||
image_content = response.content
|
||||
|
||||
with open(image, "wb") as f:
|
||||
f.write(image_content)
|
||||
|
||||
def download(session, n_threads, directory, links, scale, book_id):
|
||||
print("Downloading pages...")
|
||||
links = [f"{link}&rotate=0&scale={scale}" for link in links]
|
||||
pages = len(links)
|
||||
|
||||
tasks = []
|
||||
with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
|
||||
for link in links:
|
||||
i = links.index(link)
|
||||
tasks.append(executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages))
|
||||
for task in tqdm(futures.as_completed(tasks), total=len(tasks)):
|
||||
pass
|
||||
|
||||
images = [image_name(pages, i, directory) for i in range(len(links))]
|
||||
return images
|
||||
|
||||
def make_pdf(pdf, title, directory):
|
||||
file = title+".pdf"
|
||||
# Handle the case where multiple books with the same name are downloaded
|
||||
i = 1
|
||||
while os.path.isfile(os.path.join(directory, file)):
|
||||
file = f"{title}({i}).pdf"
|
||||
i += 1
|
||||
|
||||
with open(os.path.join(directory, file),"wb") as f:
|
||||
f.write(pdf)
|
||||
print(f"[+] PDF saved as \"{file}\"")
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
my_parser = argparse.ArgumentParser()
|
||||
my_parser.add_argument('-e', '--email', help='Your archive.org email', type=str, required=True)
|
||||
my_parser.add_argument('-p', '--password', help='Your archive.org password', type=str, required=True)
|
||||
my_parser.add_argument('-u', '--url', help='Link to the book (https://archive.org/details/XXXX). You can use this argument several times to download multiple books', action='append', type=str)
|
||||
my_parser.add_argument('-d', '--dir', help='Output directory', type=str)
|
||||
my_parser.add_argument('-f', '--file', help='File where are stored the URLs of the books to download', type=str)
|
||||
my_parser.add_argument('-r', '--resolution', help='Image resolution (10 to 0, 0 is the highest), [default 3]', type=int, default=3)
|
||||
my_parser.add_argument('-t', '--threads', help="Maximum number of threads, [default 50]", type=int, default=50)
|
||||
my_parser.add_argument('-j', '--jpg', help="Output to individual JPG's rather than a PDF", action='store_true')
|
||||
my_parser.add_argument('-m', '--meta', help="Output the metadata of the book to a json file (-j option required)", action='store_true')
|
||||
|
||||
if len(sys.argv) == 1:
|
||||
my_parser.print_help(sys.stderr)
|
||||
sys.exit(1)
|
||||
args = my_parser.parse_args()
|
||||
|
||||
if args.url is None and args.file is None:
|
||||
my_parser.error("At least one of --url and --file required")
|
||||
|
||||
email = args.email
|
||||
password = args.password
|
||||
scale = args.resolution
|
||||
n_threads = args.threads
|
||||
d = args.dir
|
||||
|
||||
if d == None:
|
||||
d = os.getcwd()
|
||||
elif not os.path.isdir(d):
|
||||
print(f"Output directory does not exist!")
|
||||
exit()
|
||||
|
||||
if args.url is not None:
|
||||
urls = args.url
|
||||
else:
|
||||
if os.path.exists(args.file):
|
||||
with open(args.file) as f:
|
||||
urls = f.read().strip().split("\n")
|
||||
else:
|
||||
print(f"{args.file} does not exist!")
|
||||
exit()
|
||||
|
||||
# Check the urls format
|
||||
for url in urls:
|
||||
if not url.startswith("https://archive.org/details/"):
|
||||
print(f"{url} --> Invalid url. URL must starts with \"https://archive.org/details/\"")
|
||||
exit()
|
||||
|
||||
print(f"{len(urls)} Book(s) to download")
|
||||
session = login(email, password)
|
||||
|
||||
for url in urls:
|
||||
book_id = list(filter(None, url.split("/")))[3]
|
||||
print("="*40)
|
||||
print(f"Current book: https://archive.org/details/{book_id}")
|
||||
session = loan(session, book_id)
|
||||
title, links, metadata = get_book_infos(session, url)
|
||||
|
||||
directory = os.path.join(d, title)
|
||||
# Handle the case where multiple books with the same name are downloaded
|
||||
i = 1
|
||||
_directory = directory
|
||||
while os.path.isdir(directory):
|
||||
directory = f"{_directory}({i})"
|
||||
i += 1
|
||||
os.makedirs(directory)
|
||||
|
||||
if args.meta:
|
||||
print("Writing metadata.json...")
|
||||
with open(f"{directory}/metadata.json",'w') as f:
|
||||
json.dump(metadata,f)
|
||||
|
||||
images = download(session, n_threads, directory, links, scale, book_id)
|
||||
|
||||
if not args.jpg: # Create pdf with images and remove the images folder
|
||||
import img2pdf
|
||||
|
||||
# prepare PDF metadata
|
||||
# sometimes archive metadata is missing
|
||||
pdfmeta = { }
|
||||
# ensure metadata are str
|
||||
for key in ["title", "creator", "associated-names"]:
|
||||
if key in metadata:
|
||||
if isinstance(metadata[key], str):
|
||||
pass
|
||||
elif isinstance(metadata[key], list):
|
||||
metadata[key] = "; ".join(metadata[key])
|
||||
else:
|
||||
raise Exception("unsupported metadata type")
|
||||
# title
|
||||
if 'title' in metadata:
|
||||
pdfmeta['title'] = metadata['title']
|
||||
# author
|
||||
if 'creator' in metadata and 'associated-names' in metadata:
|
||||
pdfmeta['author'] = metadata['creator'] + "; " + metadata['associated-names']
|
||||
elif 'creator' in metadata:
|
||||
pdfmeta['author'] = metadata['creator']
|
||||
elif 'associated-names' in metadata:
|
||||
pdfmeta['author'] = metadata['associated-names']
|
||||
# date
|
||||
if 'date' in metadata:
|
||||
try:
|
||||
pdfmeta['creationdate'] = datetime.strptime(metadata['date'][0:4], '%Y')
|
||||
except:
|
||||
pass
|
||||
# keywords
|
||||
pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"]
|
||||
|
||||
pdf = img2pdf.convert(images, **pdfmeta)
|
||||
make_pdf(pdf, title, args.dir if args.dir != None else "")
|
||||
try:
|
||||
shutil.rmtree(directory)
|
||||
except OSError as e:
|
||||
print ("Error: %s - %s." % (e.filename, e.strerror))
|
||||
|
||||
return_loan(session, book_id)
|
||||
Reference in New Issue
Block a user