
"""File storage abstraction layer for uploading files to different services.
Supports multiple backend storage services (0x0.st, local directories, Hydrus, etc.)
with a unified interface.
Example:
storage = FileStorage()
# Upload to 0x0.st
url = storage["0x0"].upload(Path("file.mp3"))
# Copy to local directory
path = storage["local"].upload(Path("file.mp3"), location="/home/user/files")
# Upload to Hydrus
hash_result = storage["hydrus"].upload(file_path, config=config)
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, Optional
import sys
import shutil
import requests
import re
from helper.logger import log, debug
from helper.utils_constant import mime_maps
from helper.utils import sha256_file
HEX_DIGITS = set("0123456789abcdef")
def _normalize_hex_hash(value: Optional[str]) -> Optional[str]:
"""Return a normalized 64-character lowercase hash or None."""
if value is None:
return None
try:
cleaned = ''.join(ch for ch in str(value).strip().lower() if ch in HEX_DIGITS)
except Exception:
return None
if len(cleaned) == 64:
return cleaned
return None
def _resolve_file_hash(candidate: Optional[str], path: Path) -> Optional[str]:
"""Return the given hash if valid, otherwise compute sha256 from disk."""
normalized = _normalize_hex_hash(candidate)
if normalized is not None:
return normalized
if not path.exists():
return None
try:
return sha256_file(path)
except Exception as exc:
debug(f"Failed to compute hash for {path}: {exc}")
return None
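# Illustrative behavior of the two helpers above (a sketch, not executed here):
#   _normalize_hex_hash(" ABC123... ")        -> 64-char lowercase hash or None
#   _resolve_file_hash(None, Path("a.mp3"))   -> sha256 computed from disk
#   _resolve_file_hash(valid_hash, path)      -> valid_hash (no disk access)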
class StorageBackend(ABC):
"""Abstract base class for file storage backends.
Backends can optionally support searching by implementing the search() method.
"""
@abstractmethod
def upload(self, file_path: Path, **kwargs: Any) -> str:
"""Upload a file and return a result identifier (URL, hash, path, etc.).
Args:
file_path: Path to the file to upload
**kwargs: Backend-specific options
Returns:
            Result identifier (e.g., hash for Hydrus, path for local, matrix:// URI for Matrix)
Raises:
Exception: If upload fails
"""
@abstractmethod
def get_name(self) -> str:
"""Get the unique name of this backend."""
def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]:
"""Search for files in backends that support it.
        This method is optional and only implemented by searchable backends
        (e.g., local, Hydrus, remote).
Args:
query: Search query string
**kwargs: Backend-specific search options
Returns:
List of search results, each as a dict with backend-specific fields.
Common fields: 'name', 'size', 'hash', 'url', 'id', etc.
Raises:
NotImplementedError: If backend doesn't support searching
Exception: If search fails
Example:
results = storage["hydrus"].search("music artist:john")
for result in results:
print(result['name'], result['hash'])
"""
raise NotImplementedError(f"{self.get_name()} backend does not support searching")
def supports_search(self) -> bool:
"""Check if this backend supports searching.
Returns:
True if search() is implemented, False otherwise
"""
return self.search.__func__ is not StorageBackend.search
class LocalStorageBackend(StorageBackend):
"""File storage backend for local file system copy."""
def __init__(self, location: Optional[str] = None) -> None:
"""Initialize local storage backend.
Args:
location: Default directory path for storage operations
"""
self._location = location
def get_name(self) -> str:
return "local"
def upload(self, file_path: Path, **kwargs: Any) -> str:
"""Copy or move file to a local directory.
Args:
file_path: Path to the file to upload
location: Destination directory path (uses default if not provided)
move: When True, move the file instead of copying (default: False)
Returns:
Absolute path to the copied/moved file
Raises:
ValueError: If location not provided and no default configured
Exception: If copy fails or duplicate detected
"""
from helper.utils import unique_path as utils_unique_path
from helper.utils import sha256_file
from helper.local_library import LocalLibraryDB
location = kwargs.get("location") or self._location
move_file = bool(kwargs.get("move"))
if not location:
raise ValueError("'location' parameter required for local storage (not configured)")
try:
# Compute file hash
file_hash = sha256_file(file_path)
debug(f"File hash: {file_hash}", file=sys.stderr)
dest_dir = Path(location).expanduser()
dest_dir.mkdir(parents=True, exist_ok=True)
# Check for duplicate files using LocalLibraryDB (fast - uses index)
try:
with LocalLibraryDB(dest_dir) as db:
existing_path = db.search_by_hash(file_hash)
if existing_path and existing_path.exists():
log(
f"✓ File already in local storage: {existing_path}",
file=sys.stderr,
)
return str(existing_path)
except Exception as exc:
log(f"⚠️ Could not check for duplicates in DB: {exc}", file=sys.stderr)
dest_file = dest_dir / file_path.name
dest_file = utils_unique_path(dest_file)
if move_file:
shutil.move(str(file_path), dest_file)
debug(f"Local move: {dest_file}", file=sys.stderr)
else:
shutil.copy2(file_path, dest_file)
debug(f"Local copy: {dest_file}", file=sys.stderr)
return str(dest_file)
except Exception as exc:
debug(f"Local copy failed: {exc}", file=sys.stderr)
raise
def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]:
"""Search local database for files by title tag or filename.
Args:
query: Search string supporting:
- Title tag search: "title:document" or just searches DB for matching title tags
- Tag namespace search: "creator:Mac*" matches tags in database
- Filename fallback: if query not in DB, searches filesystem
- "*" means "match all files"
location: Directory to search in (uses default if not provided)
recursive: Search subdirectories (default: True)
Returns:
List of dicts with 'name', 'path', 'size' fields
"""
from fnmatch import fnmatch
from helper.local_library import LocalLibraryDB
location = kwargs.get("location") or self._location
if not location:
raise ValueError("'location' parameter required for local search (not configured)")
limit = kwargs.get("limit")
try:
limit = int(limit) if limit is not None else None
except (TypeError, ValueError):
limit = None
if isinstance(limit, int) and limit <= 0:
limit = None
query_lower = query.lower()
match_all = query_lower == "*"
results = []
search_dir = Path(location).expanduser()
debug(f"Searching local storage at: {search_dir}")
# Support comma-separated AND queries (token1,token2,...). Each token must match.
tokens = [t.strip() for t in query.split(',') if t.strip()]
        # Require an explicit 'hash:' namespace for hash lookups to avoid
        # accidental filename matches (this single guard also covers the
        # single-token case, so no separate check is needed).
        if not match_all and _normalize_hex_hash(query_lower):
            debug("Hash queries require 'hash:' prefix for local search")
            return results
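        # Example: a bare "e3b0c442...<64 hex>" returns nothing here, while
        # "hash:e3b0c442...<64 hex>" is looked up against the file_hash column.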
def _create_entry(file_path: Path, tags: list[str], size_bytes: int | None, db_hash: Optional[str]) -> dict[str, Any]:
path_str = str(file_path)
entry = {
"name": file_path.stem,
"title": next((t.split(':', 1)[1] for t in tags if t.lower().startswith('title:')), file_path.stem),
"ext": file_path.suffix.lstrip('.'),
"path": path_str,
"target": path_str,
"origin": "local",
"size": size_bytes,
"size_bytes": size_bytes,
"tags": tags,
}
hash_value = _resolve_file_hash(db_hash, file_path)
if hash_value:
entry["hash"] = hash_value
entry["hash_hex"] = hash_value
entry["file_hash"] = hash_value
return entry
try:
if not search_dir.exists():
debug(f"Search directory does not exist: {search_dir}")
return results
# Try database search first (much faster than filesystem scan)
try:
with LocalLibraryDB(search_dir) as db:
cursor = db.connection.cursor()
# Check if query is a tag namespace search (format: "namespace:pattern")
if tokens and len(tokens) > 1:
# AND mode across comma-separated tokens
def _like_pattern(term: str) -> str:
return term.replace('*', '%').replace('?', '_')
def _ids_for_token(token: str, cursor) -> set[int]:
token = token.strip()
if not token:
return set()
# Namespaced token
if ':' in token and not token.startswith(':'):
namespace, pattern = token.split(':', 1)
namespace = namespace.strip().lower()
pattern = pattern.strip().lower()
if namespace == 'hash':
normalized_hash = _normalize_hex_hash(pattern)
if not normalized_hash:
return set()
cursor.execute(
"""
SELECT id FROM files
WHERE LOWER(file_hash) = ?
""",
(normalized_hash,)
)
return {row[0] for row in cursor.fetchall()}
if namespace == 'store':
# Local backend only serves local store
if pattern not in {'local', 'file', 'filesystem'}:
return set()
cursor.execute("SELECT id FROM files")
return {row[0] for row in cursor.fetchall()}
# Generic namespace match on tags
query_pattern = f"{namespace}:%"
cursor.execute(
"""
SELECT DISTINCT f.id, t.tag
FROM files f
JOIN tags t ON f.id = t.file_id
WHERE LOWER(t.tag) LIKE ?
""",
(query_pattern,)
)
matched: set[int] = set()
for file_id, tag_val in cursor.fetchall():
if not tag_val:
continue
tag_lower = str(tag_val).lower()
if not tag_lower.startswith(f"{namespace}:"):
continue
value = tag_lower[len(namespace)+1:]
if fnmatch(value, pattern):
matched.add(int(file_id))
return matched
# Bare token: match filename OR any tag (including title)
term = token.lower()
like_pattern = f"%{_like_pattern(term)}%"
ids: set[int] = set()
# Filename match
cursor.execute(
"""
SELECT DISTINCT id FROM files
WHERE LOWER(file_path) LIKE ?
""",
(like_pattern,)
)
ids.update(int(row[0]) for row in cursor.fetchall())
# Tag match (any namespace, including title)
cursor.execute(
"""
SELECT DISTINCT f.id
FROM files f
JOIN tags t ON f.id = t.file_id
WHERE LOWER(t.tag) LIKE ?
""",
(like_pattern,)
)
ids.update(int(row[0]) for row in cursor.fetchall())
return ids
try:
with LocalLibraryDB(search_dir) as db:
cursor = db.connection.cursor()
matching_ids: set[int] | None = None
for token in tokens:
ids = _ids_for_token(token, cursor)
matching_ids = ids if matching_ids is None else matching_ids & ids
if not matching_ids:
return results
if not matching_ids:
return results
# Fetch rows for matching IDs
placeholders = ",".join(["?"] * len(matching_ids))
fetch_sql = f"""
SELECT id, file_path, file_size, file_hash
FROM files
WHERE id IN ({placeholders})
ORDER BY file_path
LIMIT ?
"""
cursor.execute(fetch_sql, (*matching_ids, limit or len(matching_ids)))
rows = cursor.fetchall()
for file_id, file_path_str, size_bytes, file_hash in rows:
if not file_path_str:
continue
file_path = Path(file_path_str)
if not file_path.exists():
continue
if size_bytes is None:
try:
size_bytes = file_path.stat().st_size
except OSError:
size_bytes = None
cursor.execute(
"""
SELECT tag FROM tags WHERE file_id = ?
""",
(file_id,),
)
tags = [row[0] for row in cursor.fetchall()]
entry = _create_entry(file_path, tags, size_bytes, file_hash)
results.append(entry)
if limit is not None and len(results) >= limit:
return results
return results
except Exception as exc:
log(f"⚠️ AND search failed: {exc}", file=sys.stderr)
debug(f"AND search exception details: {exc}")
return []
if ":" in query and not query.startswith(":"):
namespace, pattern = query.split(":", 1)
namespace = namespace.strip().lower()
pattern = pattern.strip().lower()
debug(f"Performing namespace search: {namespace}:{pattern}")
# Special-case hash: lookups against file_hash column
if namespace == "hash":
normalized_hash = _normalize_hex_hash(pattern)
if not normalized_hash:
return results
cursor.execute(
"""
SELECT id, file_path, file_size, file_hash
FROM files
WHERE LOWER(file_hash) = ?
ORDER BY file_path
LIMIT ?
""",
(normalized_hash, limit or 1000),
)
for file_id, file_path_str, size_bytes, file_hash in cursor.fetchall():
if not file_path_str:
continue
file_path = Path(file_path_str)
if not file_path.exists():
continue
if size_bytes is None:
try:
size_bytes = file_path.stat().st_size
except OSError:
size_bytes = None
cursor.execute(
"""
SELECT tag FROM tags WHERE file_id = ?
""",
(file_id,),
)
all_tags = [row[0] for row in cursor.fetchall()]
entry = _create_entry(file_path, all_tags, size_bytes, file_hash)
results.append(entry)
if limit is not None and len(results) >= limit:
return results
return results
# Search for tags matching the namespace and pattern
query_pattern = f"{namespace}:%"
cursor.execute("""
SELECT DISTINCT f.id, f.file_path, f.file_size, f.file_hash
FROM files f
JOIN tags t ON f.id = t.file_id
WHERE LOWER(t.tag) LIKE ?
ORDER BY f.file_path
LIMIT ?
""", (query_pattern, limit or 1000))
rows = cursor.fetchall()
debug(f"Found {len(rows)} potential matches in DB")
# Filter results by pattern match
for file_id, file_path_str, size_bytes, file_hash in rows:
if not file_path_str:
continue
# Get the file's tags and check if any match the pattern
cursor.execute("""
SELECT DISTINCT tag FROM tags
WHERE file_id = ?
AND LOWER(tag) LIKE ?
""", (file_id, query_pattern))
tags = [row[0] for row in cursor.fetchall()]
# Check if any tag matches the pattern (case-insensitive wildcard)
for tag in tags:
tag_lower = tag.lower()
# Extract the value part after "namespace:"
if tag_lower.startswith(f"{namespace}:"):
value = tag_lower[len(namespace)+1:]
# Use fnmatch for wildcard matching
if fnmatch(value, pattern):
file_path = Path(file_path_str)
if file_path.exists():
if size_bytes is None:
size_bytes = file_path.stat().st_size
cursor.execute("""
SELECT tag FROM tags WHERE file_id = ?
""", (file_id,))
all_tags = [row[0] for row in cursor.fetchall()]
entry = _create_entry(file_path, all_tags, size_bytes, file_hash)
results.append(entry)
else:
debug(f"File missing on disk: {file_path}")
break # Don't add same file multiple times
if limit is not None and len(results) >= limit:
return results
elif not match_all:
# Search by filename or simple tags (namespace-agnostic for plain text)
# For plain text search, match:
# 1. Filenames containing the query
# 2. Simple tags (without namespace) containing the query
# NOTE: Does NOT match namespaced tags (e.g., "joe" won't match "channel:Joe Mullan")
# Use explicit namespace search for that (e.g., "channel:joe*")
# Split query into terms for AND logic
terms = [t.strip() for t in query_lower.replace(',', ' ').split() if t.strip()]
if not terms:
terms = [query_lower]
debug(f"Performing filename/tag search for terms: {terms}")
# Fetch more results than requested to allow for filtering
fetch_limit = (limit or 45) * 50
# 1. Filename search (AND logic)
conditions = ["LOWER(f.file_path) LIKE ?" for _ in terms]
params = [f"%{t}%" for t in terms]
where_clause = " AND ".join(conditions)
cursor.execute(f"""
SELECT DISTINCT f.id, f.file_path, f.file_size, f.file_hash
FROM files f
WHERE {where_clause}
ORDER BY f.file_path
LIMIT ?
""", (*params, fetch_limit))
rows = cursor.fetchall()
debug(f"Found {len(rows)} filename matches in DB (before whole-word filter)")
# Compile regex for whole word matching (only if single term, otherwise skip)
word_regex = None
if len(terms) == 1:
term = terms[0]
# Check if term contains wildcard characters
has_wildcard = '*' in term or '?' in term
if has_wildcard:
# Use fnmatch for wildcard patterns (e.g., "sie*" matches "SiebeliebenWohl...")
try:
from fnmatch import translate
word_regex = re.compile(translate(term), re.IGNORECASE)
except Exception:
word_regex = None
else:
# Use custom boundary that treats underscores as separators
# \b treats _ as a word character, so "foo_bar" wouldn't match "bar" with \b
try:
# Match if not preceded or followed by alphanumeric chars
pattern = r'(?<![a-zA-Z0-9])' + re.escape(term) + r'(?![a-zA-Z0-9])'
word_regex = re.compile(pattern, re.IGNORECASE)
except Exception:
word_regex = None
seen_files = set()
for file_id, file_path_str, size_bytes, file_hash in rows:
if not file_path_str or file_path_str in seen_files:
continue
# Apply whole word filter on filename if single term
if word_regex:
p = Path(file_path_str)
if not word_regex.search(p.name):
continue
seen_files.add(file_path_str)
file_path = Path(file_path_str)
if file_path.exists():
path_str = str(file_path)
if size_bytes is None:
size_bytes = file_path.stat().st_size
cursor.execute("""
SELECT tag FROM tags WHERE file_id = ?
""", (file_id,))
tags = [row[0] for row in cursor.fetchall()]
entry = _create_entry(file_path, tags, size_bytes, file_hash)
results.append(entry)
if limit is not None and len(results) >= limit:
return results
# Title-tag search: treat freeform terms as title namespace queries (AND across terms)
if terms:
title_hits: dict[int, dict[str, Any]] = {}
for term in terms:
cursor.execute(
"""
SELECT DISTINCT f.id, f.file_path, f.file_size, f.file_hash
FROM files f
JOIN tags t ON f.id = t.file_id
WHERE LOWER(t.tag) LIKE ?
ORDER BY f.file_path
LIMIT ?
""",
(f"title:%{term}%", fetch_limit),
)
for file_id, file_path_str, size_bytes, file_hash in cursor.fetchall():
if not file_path_str:
continue
entry = title_hits.get(file_id)
if entry:
entry["count"] += 1
if size_bytes is not None:
entry["size"] = size_bytes
else:
title_hits[file_id] = {
"path": file_path_str,
"size": size_bytes,
"hash": file_hash,
"count": 1,
}
if title_hits:
required = len(terms)
for file_id, info in title_hits.items():
if info.get("count") != required:
continue
file_path_str = info.get("path")
if not file_path_str or file_path_str in seen_files:
continue
file_path = Path(file_path_str)
if not file_path.exists():
continue
seen_files.add(file_path_str)
size_bytes = info.get("size")
if size_bytes is None:
try:
size_bytes = file_path.stat().st_size
except OSError:
size_bytes = None
cursor.execute(
"""
SELECT tag FROM tags WHERE file_id = ?
""",
(file_id,),
)
tags = [row[0] for row in cursor.fetchall()]
entry = _create_entry(file_path, tags, size_bytes, info.get("hash"))
results.append(entry)
if limit is not None and len(results) >= limit:
return results
# Also search for simple tags (without namespace) containing the query
# Only perform tag search if single term, or if we want to support multi-term tag search
# For now, fallback to single pattern search for tags if multiple terms
# (searching for a tag that contains "term1 term2" or "term1,term2")
# This is less useful for AND logic across multiple tags, but consistent with previous behavior
query_pattern = f"%{query_lower}%"
cursor.execute("""
SELECT DISTINCT f.id, f.file_path, f.file_size, f.file_hash
FROM files f
JOIN tags t ON f.id = t.file_id
WHERE LOWER(t.tag) LIKE ? AND LOWER(t.tag) NOT LIKE '%:%'
ORDER BY f.file_path
LIMIT ?
""", (query_pattern, limit or 1000))
tag_rows = cursor.fetchall()
for file_id, file_path_str, size_bytes, file_hash in tag_rows:
if not file_path_str or file_path_str in seen_files:
continue
seen_files.add(file_path_str)
file_path = Path(file_path_str)
if file_path.exists():
path_str = str(file_path)
if size_bytes is None:
size_bytes = file_path.stat().st_size
# Fetch tags for this file
cursor.execute("""
SELECT tag FROM tags WHERE file_id = ?
""", (file_id,))
tags = [row[0] for row in cursor.fetchall()]
entry = _create_entry(file_path, tags, size_bytes, file_hash)
results.append(entry)
if limit is not None and len(results) >= limit:
return results
else:
# Match all - get all files from database
cursor.execute("""
SELECT id, file_path, file_size, file_hash
FROM files
ORDER BY file_path
LIMIT ?
""", (limit or 1000,))
rows = cursor.fetchall()
for file_id, file_path_str, size_bytes, file_hash in rows:
if file_path_str:
file_path = Path(file_path_str)
if file_path.exists():
path_str = str(file_path)
if size_bytes is None:
size_bytes = file_path.stat().st_size
# Fetch tags for this file
cursor.execute("""
SELECT tag FROM tags WHERE file_id = ?
""", (file_id,))
tags = [row[0] for row in cursor.fetchall()]
entry = _create_entry(file_path, tags, size_bytes, file_hash)
results.append(entry)
if results:
debug(f"Returning {len(results)} results from DB")
else:
debug("No results found in DB")
return results
except Exception as e:
log(f"⚠️ Database search failed: {e}", file=sys.stderr)
debug(f"DB search exception details: {e}")
return []
except Exception as exc:
log(f"❌ Local search failed: {exc}", file=sys.stderr)
raise
class HydrusStorageBackend(StorageBackend):
"""File storage backend for Hydrus client."""
def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
"""Initialize Hydrus storage backend.
Args:
config: Configuration dict with Hydrus settings (HydrusNetwork section)
"""
self._config = config or {}
def get_name(self) -> str:
return "hydrus"
def upload(self, file_path: Path, **kwargs: Any) -> str:
"""Upload file to Hydrus.
Args:
file_path: Path to the file to upload
tags: Optional list of tags to add (uses default config if not provided)
config: Optional override for config (uses default if not provided)
Returns:
File hash from Hydrus
Raises:
Exception: If upload fails
"""
from helper import hydrus as hydrus_wrapper
from helper.utils import sha256_file
config = kwargs.get("config") or self._config
if not config:
raise ValueError("'config' parameter required for Hydrus storage (not configured)")
tags = kwargs.get("tags", [])
try:
# Compute file hash
file_hash = sha256_file(file_path)
debug(f"File hash: {file_hash}")
# Build Hydrus client
client = hydrus_wrapper.get_client(config)
if client is None:
raise Exception("Hydrus client unavailable")
# Check if file already exists in Hydrus
try:
metadata = client.fetch_file_metadata(hashes=[file_hash])
if metadata and isinstance(metadata, dict):
files = metadata.get("file_metadata", [])
if files:
log(
f" Duplicate detected - file already in Hydrus with hash: {file_hash}",
file=sys.stderr,
)
# Even if duplicate, we should add tags if provided
if tags:
try:
service_name = hydrus_wrapper.get_tag_service_name(config)
except Exception:
service_name = "my tags"
try:
debug(f"Adding {len(tags)} tag(s) to existing file in Hydrus: {tags}")
client.add_tags(file_hash, tags, service_name)
log(f"✅ Tags added to existing file via '{service_name}'", file=sys.stderr)
except Exception as exc:
log(f"⚠️ Failed to add tags to existing file: {exc}", file=sys.stderr)
return file_hash
except Exception:
pass
# Upload file to Hydrus
log(f"Uploading to Hydrus: {file_path.name}", file=sys.stderr)
response = client.add_file(file_path)
# Extract hash from response
hydrus_hash: Optional[str] = None
if isinstance(response, dict):
hydrus_hash = response.get("hash") or response.get("file_hash")
if not hydrus_hash:
hashes = response.get("hashes")
if isinstance(hashes, list) and hashes:
hydrus_hash = hashes[0]
if not hydrus_hash:
raise Exception(f"Hydrus response missing file hash: {response}")
file_hash = hydrus_hash
log(f"Hydrus: {file_hash}", file=sys.stderr)
# Add tags if provided
if tags:
try:
service_name = hydrus_wrapper.get_tag_service_name(config)
except Exception:
service_name = "my tags"
try:
debug(f"Adding {len(tags)} tag(s) to Hydrus: {tags}")
client.add_tags(file_hash, tags, service_name)
log(f"✅ Tags added via '{service_name}'", file=sys.stderr)
except Exception as exc:
log(f"⚠️ Failed to add tags: {exc}", file=sys.stderr)
return file_hash
except Exception as exc:
log(f"❌ Hydrus upload failed: {exc}", file=sys.stderr)
raise
def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]:
"""Search Hydrus database for files matching query.
Args:
query: Search query (tags, filenames, hashes, etc.)
limit: Maximum number of results to return (default: 100)
config: Optional override for config (uses default if not provided)
Returns:
List of dicts with 'name', 'hash', 'size', 'tags' fields
Example:
results = storage["hydrus"].search("artist:john_doe music")
results = storage["hydrus"].search("Simple Man")
"""
from helper import hydrus as hydrus_wrapper
config = kwargs.get("config") or self._config
if not config:
raise ValueError("'config' parameter required for Hydrus search (not configured)")
limit = kwargs.get("limit", 100)
try:
client = hydrus_wrapper.get_client(config)
if client is None:
raise Exception("Hydrus client unavailable")
debug(f"Searching Hydrus for: {query}")
# Parse the query into tags
# Handle both simple tags and complex queries
# "*" means "match all" - use system:everything tag in Hydrus
if query.strip() == "*":
# Use system:everything to match all files in Hydrus
tags = ["system:everything"]
else:
query_lower = query.lower().strip()
# If query doesn't have a namespace (no ':'), search all files and filter by title/tags
# If query has explicit namespace, use it as a tag search
if ':' not in query_lower:
# No namespace provided: search all files, then filter by title/tags containing the query
tags = ["system:everything"]
else:
# User provided explicit namespace (e.g., "creator:john" or "system:has_audio")
# Use it as a tag search
tags = [query_lower]
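            # Examples: "*" -> ["system:everything"]; "creator:john" -> ["creator:john"];
            # "simple man" -> ["system:everything"], then term-filtered against titles/tags below.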
if not tags:
debug(f"Found 0 result(s)")
return []
# Search files with the tags
search_result = client.search_files(
tags=tags,
return_hashes=True,
return_file_ids=True
)
# Extract file IDs from search result
file_ids = search_result.get("file_ids", [])
hashes = search_result.get("hashes", [])
if not file_ids and not hashes:
debug(f"Found 0 result(s)")
return []
# Fetch metadata for the found files
results = []
query_lower = query.lower().strip()
# Split by comma or space for AND logic
search_terms = set(query_lower.replace(',', ' ').split()) # For substring matching
if file_ids:
metadata = client.fetch_file_metadata(file_ids=file_ids)
metadata_list = metadata.get("metadata", [])
for meta in metadata_list:
if len(results) >= limit:
break
file_id = meta.get("file_id")
hash_hex = meta.get("hash")
size = meta.get("size", 0)
# Get tags for this file and extract title
tags_set = meta.get("tags", {})
all_tags = []
title = f"Hydrus File {file_id}" # Default fallback
all_tags_str = "" # For substring matching
# debug(f"[HydrusBackend.search] Processing file_id={file_id}, tags type={type(tags_set)}")
if isinstance(tags_set, dict):
# Collect both storage_tags and display_tags to capture siblings/parents and ensure title: is seen
def _collect(tag_list: Any) -> None:
nonlocal title, all_tags_str
if not isinstance(tag_list, list):
return
for tag in tag_list:
tag_text = str(tag) if tag else ""
if not tag_text:
continue
all_tags.append(tag_text)
all_tags_str += " " + tag_text.lower()
if tag_text.lower().startswith("title:") and title == f"Hydrus File {file_id}":
title = tag_text.split(":", 1)[1].strip()
for service_name, service_tags in tags_set.items():
if not isinstance(service_tags, dict):
continue
storage_tags = service_tags.get("storage_tags", {})
if isinstance(storage_tags, dict):
for tag_list in storage_tags.values():
_collect(tag_list)
display_tags = service_tags.get("display_tags", [])
_collect(display_tags)
# Also consider top-level flattened tags payload if provided (Hydrus API sometimes includes it)
top_level_tags = meta.get("tags_flat", []) or meta.get("tags", [])
_collect(top_level_tags)
# Resolve extension from MIME type
mime_type = meta.get("mime")
ext = ""
if mime_type:
for category in mime_maps.values():
for ext_key, info in category.items():
if mime_type in info.get("mimes", []):
ext = info.get("ext", "").lstrip('.')
break
if ext:
break
# Filter results based on query type
# If user provided explicit namespace (has ':'), don't do substring filtering
# Just include what the tag search returned
has_namespace = ':' in query_lower
if has_namespace:
# Explicit namespace search - already filtered by Hydrus tag search
# Include this result as-is
results.append({
"hash": hash_hex,
"hash_hex": hash_hex,
"target": hash_hex,
"name": title,
"title": title,
"size": size,
"size_bytes": size,
"origin": "hydrus",
"tags": all_tags,
"file_id": file_id,
"mime": mime_type,
"ext": ext,
})
else:
# Free-form search: check if search terms match the title or tags
# Match if ALL search terms are found in title or tags (AND logic)
# AND use whole word matching
# Combine title and tags for searching
searchable_text = (title + " " + all_tags_str).lower()
match = True
if query_lower != "*":
for term in search_terms:
# Regex for whole word: \bterm\b
# Escape term to handle special chars
pattern = r'\b' + re.escape(term) + r'\b'
if not re.search(pattern, searchable_text):
match = False
break
if match:
results.append({
"hash": hash_hex,
"hash_hex": hash_hex,
"target": hash_hex,
"name": title,
"title": title,
"size": size,
"size_bytes": size,
"origin": "hydrus",
"tags": all_tags,
"file_id": file_id,
"mime": mime_type,
"ext": ext,
})
debug(f"Found {len(results)} result(s)")
return results[:limit]
except Exception as exc:
log(f"❌ Hydrus search failed: {exc}", file=sys.stderr)
import traceback
traceback.print_exc(file=sys.stderr)
raise
class MatrixStorageBackend(StorageBackend):
"""File storage backend for Matrix (Element) chat rooms."""
def get_name(self) -> str:
return "matrix"
    def list_rooms(self, config: Dict[str, Any]) -> list[Dict[str, Any]]:
"""List joined rooms with their names."""
matrix_conf = config.get('storage', {}).get('matrix', {})
homeserver = matrix_conf.get('homeserver')
access_token = matrix_conf.get('access_token')
if not homeserver or not access_token:
return []
if not homeserver.startswith('http'):
homeserver = f"https://{homeserver}"
headers = {"Authorization": f"Bearer {access_token}"}
try:
# Get joined rooms
resp = requests.get(f"{homeserver}/_matrix/client/v3/joined_rooms", headers=headers, timeout=10)
if resp.status_code != 200:
return []
room_ids = resp.json().get('joined_rooms', [])
rooms = []
for rid in room_ids:
# Try to get room name
name = "Unknown Room"
try:
# Get state event for name
name_resp = requests.get(
f"{homeserver}/_matrix/client/v3/rooms/{rid}/state/m.room.name",
headers=headers,
timeout=2
)
if name_resp.status_code == 200:
name = name_resp.json().get('name', name)
else:
# Try canonical alias
alias_resp = requests.get(
f"{homeserver}/_matrix/client/v3/rooms/{rid}/state/m.room.canonical_alias",
headers=headers,
timeout=2
)
if alias_resp.status_code == 200:
name = alias_resp.json().get('alias', name)
except Exception:
pass
rooms.append({'id': rid, 'name': name})
return rooms
except Exception as e:
log(f"Error listing Matrix rooms: {e}", file=sys.stderr)
return []
def upload(self, file_path: Path, **kwargs: Any) -> str:
"""Upload file to Matrix room.
Requires 'config' in kwargs with 'storage.matrix' settings:
- homeserver: URL of homeserver (e.g. https://matrix.org)
- user_id: User ID (e.g. @user:matrix.org)
- access_token: Access token (preferred) OR password
- room_id: Room ID to upload to (e.g. !roomid:matrix.org)
"""
config = kwargs.get('config', {})
if not config:
raise ValueError("Config required for Matrix upload")
matrix_conf = config.get('storage', {}).get('matrix', {})
if not matrix_conf:
raise ValueError("Matrix storage not configured in config.json")
homeserver = matrix_conf.get('homeserver')
# user_id = matrix_conf.get('user_id') # Not strictly needed if we have token
access_token = matrix_conf.get('access_token')
room_id = matrix_conf.get('room_id')
if not homeserver:
raise ValueError("Matrix homeserver required")
# Ensure homeserver has protocol
if not homeserver.startswith('http'):
homeserver = f"https://{homeserver}"
# Login if no access token (optional implementation, for now assume token)
if not access_token:
raise ValueError("Matrix access_token required (login not yet implemented)")
# Handle room selection if not provided
if not room_id:
log("No room_id configured. Fetching joined rooms...", file=sys.stderr)
rooms = self.list_rooms(config)
if not rooms:
raise ValueError("No joined rooms found or failed to fetch rooms.")
from result_table import ResultTable
table = ResultTable("Matrix Rooms")
for i, room in enumerate(rooms):
row = table.add_row()
row.add_column("#", str(i + 1))
row.add_column("Name", room['name'])
row.add_column("ID", room['id'])
print(table)
# Simple interactive selection
try:
selection = input("Select room # to upload to: ")
idx = int(selection) - 1
if 0 <= idx < len(rooms):
room_id = rooms[idx]['id']
log(f"Selected room: {rooms[idx]['name']} ({room_id})", file=sys.stderr)
else:
raise ValueError("Invalid selection")
except Exception:
raise ValueError("Invalid room selection")
if not room_id:
raise ValueError("Matrix room_id required")
        # 1. Upload media (media repo API; "v3" is the current stable path)
        upload_url = f"{homeserver}/_matrix/media/v3/upload"
headers = {
"Authorization": f"Bearer {access_token}",
"Content-Type": "application/octet-stream" # Or guess mime type
}
import mimetypes
mime_type, _ = mimetypes.guess_type(file_path)
if mime_type:
headers["Content-Type"] = mime_type
filename = file_path.name
try:
with open(file_path, 'rb') as f:
resp = requests.post(upload_url, headers=headers, data=f, params={"filename": filename})
if resp.status_code != 200:
raise Exception(f"Matrix upload failed: {resp.text}")
content_uri = resp.json().get('content_uri')
if not content_uri:
raise Exception("No content_uri returned from Matrix upload")
            # 2. Send message event (client-server API v3; the spec defines this
            # as a PUT with a unique transaction ID)
            import time
            txn_id = str(int(time.time() * 1000))
            send_url = f"{homeserver}/_matrix/client/v3/rooms/{room_id}/send/m.room.message/{txn_id}"
# Determine msgtype
msgtype = "m.file"
if mime_type:
if mime_type.startswith("image/"): msgtype = "m.image"
elif mime_type.startswith("video/"): msgtype = "m.video"
elif mime_type.startswith("audio/"): msgtype = "m.audio"
payload = {
"msgtype": msgtype,
"body": filename,
"url": content_uri,
"info": {
"mimetype": mime_type,
"size": file_path.stat().st_size
}
}
            # Send the event as JSON with only the auth header, so the upload's
            # file Content-Type doesn't override application/json
            resp = requests.put(send_url, headers={"Authorization": f"Bearer {access_token}"}, json=payload)
if resp.status_code != 200:
raise Exception(f"Matrix send message failed: {resp.text}")
event_id = resp.json().get('event_id')
return f"matrix://{room_id}/{event_id}"
except Exception as e:
log(f"❌ Matrix upload error: {e}", file=sys.stderr)
raise
class RemoteStorageBackend(StorageBackend):
"""File storage backend for remote Android/network storage servers.
Connects to a remote storage server (e.g., running on Android phone)
via REST API. All operations are proxied to the remote server.
"""
    def __init__(self, server_url: str, timeout: int = 30, api_key: Optional[str] = None) -> None:
"""Initialize remote storage backend.
Args:
server_url: Base URL of remote storage server (e.g., http://192.168.1.100:5000)
timeout: Request timeout in seconds
api_key: Optional API key for authentication
"""
try:
import requests
except ImportError:
raise ImportError("requests library required for RemoteStorageBackend. Install with: pip install requests")
self.server_url = server_url.rstrip('/')
self.timeout = timeout
self.api_key = api_key
self._session = requests.Session()
# Add API key to default headers if provided
if self.api_key:
self._session.headers.update({'X-API-Key': self.api_key})
def get_name(self) -> str:
return "remote"
def _request(self, method: str, endpoint: str, **kwargs) -> Dict[str, Any]:
"""Make HTTP request to remote server."""
import requests
from urllib.parse import urljoin
url = urljoin(self.server_url, endpoint)
try:
response = self._session.request(
method,
url,
timeout=self.timeout,
**kwargs
)
if response.status_code == 404:
raise Exception(f"Remote resource not found: {endpoint}")
if response.status_code >= 400:
try:
error_data = response.json()
error_msg = error_data.get('error', response.text)
                except Exception:
error_msg = response.text
raise Exception(f"Remote server error {response.status_code}: {error_msg}")
return response.json()
except requests.exceptions.RequestException as e:
raise Exception(f"Connection to {self.server_url} failed: {e}")
def upload(self, file_path: Path, **kwargs: Any) -> str:
"""Upload file to remote storage.
Args:
file_path: Path to the file to upload
tags: Optional list of tags to add
urls: Optional list of known URLs
Returns:
Remote file hash
"""
from helper.utils import sha256_file
if not file_path.exists():
raise ValueError(f"File not found: {file_path}")
try:
# Index the file on remote server
data = {"path": str(file_path)}
tags = kwargs.get("tags", [])
if tags:
data["tags"] = tags
urls = kwargs.get("urls", [])
if urls:
data["urls"] = urls
result = self._request('POST', '/files/index', json=data)
file_hash = result.get('hash')
if file_hash:
log(f"✓ File indexed on remote storage: {file_hash}", file=sys.stderr)
return file_hash
else:
raise Exception("Remote server did not return file hash")
except Exception as exc:
debug(f"Remote upload failed: {exc}", file=sys.stderr)
raise
def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]:
"""Search files on remote storage.
Args:
query: Search query
limit: Maximum results
Returns:
List of search results
"""
limit = kwargs.get("limit")
try:
limit = int(limit) if limit is not None else 100
except (TypeError, ValueError):
limit = 100
if limit <= 0:
limit = 100
try:
response = self._request('GET', '/files/search', params={
'q': query,
'limit': limit
})
files = response.get('files', [])
# Transform remote format to standard result format
results = []
for f in files:
results.append({
"name": f.get('name', '').split('/')[-1], # Get filename from path
"title": f.get('name', f.get('path', '')).split('/')[-1],
"ext": f.get('ext', ''),
"path": f.get('path', ''),
"target": f.get('path', ''),
"hash": f.get('hash', ''),
"origin": "remote",
"size": f.get('size', 0),
"size_bytes": f.get('size', 0),
"tags": f.get('tags', []),
})
debug(f"Remote search found {len(results)} results", file=sys.stderr)
return results
except Exception as exc:
log(f"❌ Remote search failed: {exc}", file=sys.stderr)
raise
class FileStorage:
    """Unified file storage interface supporting multiple backend services.
    Example:
        storage = FileStorage(config)
        # Upload to different backends (uses configured locations)
        local_path = storage["local"].upload(Path("file.mp3"))  # Uses config["Local"]["path"]
        hydrus_hash = storage["hydrus"].upload(Path("file.mp3"), tags=["music"])
        # Search with searchable backends
        results = storage["hydrus"].search("music")
        results = storage["local"].search("song")  # Uses config["Local"]["path"]
    """
def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
"""Initialize the file storage system with available backends.
Args:
config: Configuration dict with backend settings (Local.path, HydrusNetwork, Debrid, etc.)
"""
config = config or {}
# Extract backend-specific settings from config
from config import get_local_storage_path
local_path = get_local_storage_path(config)
local_path_str = str(local_path) if local_path else None
self._backends: Dict[str, StorageBackend] = {}
# Always include local backend (even if no default path configured)
# The location can be specified at upload time if not configured globally
self._backends["local"] = LocalStorageBackend(location=local_path_str)
# Include Hydrus backend (configuration optional)
self._backends["hydrus"] = HydrusStorageBackend(config=config)
# Include Matrix backend
self._backends["matrix"] = MatrixStorageBackend()
# Include remote storage backends from config (for Android/network servers)
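        # Expected config shape (illustrative):
        #   "remote_storages": [{"name": "phone", "url": "http://192.168.1.50:5000",
        #                        "timeout": 30, "api_key": "secret"}]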
remote_storages = config.get("remote_storages", [])
if isinstance(remote_storages, list):
for remote_config in remote_storages:
if isinstance(remote_config, dict):
name = remote_config.get("name", "remote")
url = remote_config.get("url")
timeout = remote_config.get("timeout", 30)
api_key = remote_config.get("api_key")
if url:
try:
backend = RemoteStorageBackend(url, timeout=timeout, api_key=api_key)
self._backends[name] = backend
auth_status = " (with auth)" if api_key else " (no auth)"
log(f"Registered remote storage backend: {name} -> {url}{auth_status}", file=sys.stderr)
except Exception as e:
log(f"Failed to register remote storage '{name}': {e}", file=sys.stderr)
def __getitem__(self, backend_name: str) -> StorageBackend:
"""Get a storage backend by name.
Args:
            backend_name: Name of the backend (e.g., 'local', 'hydrus', 'matrix')
Returns:
StorageBackend instance
Raises:
KeyError: If backend not found
"""
if backend_name not in self._backends:
raise KeyError(
f"Unknown storage backend: {backend_name}. "
f"Available: {list(self._backends.keys())}"
)
return self._backends[backend_name]
def register(self, backend: StorageBackend) -> None:
"""Register a custom storage backend.
Args:
backend: StorageBackend instance to register
"""
name = backend.get_name()
self._backends[name] = backend
log(f"Registered storage backend: {name}", file=sys.stderr)
def list_backends(self) -> list[str]:
"""Get list of available backend names.
Returns:
List of backend names
"""
return list(self._backends.keys())
def is_available(self, backend_name: str) -> bool:
"""Check if a backend is available.
Args:
backend_name: Name of the backend
Returns:
True if backend is registered
"""
return backend_name in self._backends
def list_searchable_backends(self) -> list[str]:
"""Get list of backends that support searching.
Returns:
List of searchable backend names
"""
return [
name for name, backend in self._backends.items()
if backend.supports_search()
]
def supports_search(self, backend_name: str) -> bool:
"""Check if a backend supports searching.
Args:
backend_name: Name of the backend
Returns:
True if backend supports search(), False otherwise
"""
if backend_name not in self._backends:
return False
return self._backends[backend_name].supports_search()