commit 5e4df11dbf
parent 89aa24961b
Author: nose
Date:   2025-12-03 15:18:57 -08:00

12 changed files with 1953 additions and 346 deletions


@@ -137,14 +137,14 @@ class LocalStorageBackend(StorageBackend):
         # Check for duplicate files using LocalLibraryDB (fast - uses index)
         try:
-            db = LocalLibraryDB(dest_dir)
-            existing_path = db.search_by_hash(file_hash)
-            if existing_path and existing_path.exists():
-                log(
-                    f"✓ File already in local storage: {existing_path}",
-                    file=sys.stderr,
-                )
-                return str(existing_path)
+            with LocalLibraryDB(dest_dir) as db:
+                existing_path = db.search_by_hash(file_hash)
+                if existing_path and existing_path.exists():
+                    log(
+                        f"✓ File already in local storage: {existing_path}",
+                        file=sys.stderr,
+                    )
+                    return str(existing_path)
         except Exception as exc:
             log(f"⚠️ Could not check for duplicates in DB: {exc}", file=sys.stderr)
@@ -205,247 +205,156 @@ class LocalStorageBackend(StorageBackend):
        # Try database search first (much faster than filesystem scan)
        try:
-            db = LocalLibraryDB(search_dir)
-            cursor = db.connection.cursor()
+            with LocalLibraryDB(search_dir) as db:
+                cursor = db.connection.cursor()

                # Check if query is a tag namespace search (format: "namespace:pattern")
                if ":" in query and not query.startswith(":"):
                    namespace, pattern = query.split(":", 1)
                    namespace = namespace.strip().lower()
                    pattern = pattern.strip().lower()
                    debug(f"Performing namespace search: {namespace}:{pattern}")

                    # Search for tags matching the namespace and pattern
                    query_pattern = f"{namespace}:%"
                    cursor.execute("""
                        SELECT DISTINCT f.id, f.file_path, f.file_size
                        FROM files f
                        JOIN tags t ON f.id = t.file_id
                        WHERE LOWER(t.tag) LIKE ?
                        ORDER BY f.file_path
                        LIMIT ?
                    """, (query_pattern, limit or 1000))
                    rows = cursor.fetchall()
                    debug(f"Found {len(rows)} potential matches in DB")

                    # Filter results by pattern match
                    for file_id, file_path_str, size_bytes in rows:
                        if not file_path_str:
                            continue

                        # Get the file's tags and check if any match the pattern
                        cursor.execute("""
                            SELECT DISTINCT tag FROM tags
                            WHERE file_id = ?
                            AND LOWER(tag) LIKE ?
                        """, (file_id, query_pattern))
                        tags = [row[0] for row in cursor.fetchall()]

                        # Check if any tag matches the pattern (case-insensitive wildcard)
                        for tag in tags:
                            tag_lower = tag.lower()
                            # Extract the value part after "namespace:"
                            if tag_lower.startswith(f"{namespace}:"):
                                value = tag_lower[len(namespace) + 1:]
                                # Use fnmatch for wildcard matching
                                if fnmatch(value, pattern):
                                    file_path = Path(file_path_str)
                                    if file_path.exists():
                                        path_str = str(file_path)
                                        if size_bytes is None:
                                            size_bytes = file_path.stat().st_size
                                        # Fetch all tags for this file
                                        cursor.execute("""
                                            SELECT tag FROM tags WHERE file_id = ?
                                        """, (file_id,))
                                        all_tags = [row[0] for row in cursor.fetchall()]
                                        # Use the title tag if present
                                        title_tag = next((t.split(':', 1)[1] for t in all_tags if t.lower().startswith('title:')), None)
                                        results.append({
                                            "name": file_path.stem,
                                            "title": title_tag or file_path.stem,
                                            "ext": file_path.suffix.lstrip('.'),
                                            "path": path_str,
                                            "target": path_str,
                                            "origin": "local",
                                            "size": size_bytes,
                                            "size_bytes": size_bytes,
                                            "tags": all_tags,
                                        })
                                    else:
                                        debug(f"File missing on disk: {file_path}")
                                    break  # Don't add the same file multiple times

                        if limit is not None and len(results) >= limit:
                            return results

                elif not match_all:
                    # Search by filename or simple tags (namespace-agnostic for plain text).
                    # For plain-text search, match:
                    #   1. Filenames containing the query
                    #   2. Simple tags (without namespace) containing the query
                    # NOTE: Does NOT match namespaced tags (e.g., "joe" won't match
                    # "channel:Joe Mullan"); use an explicit namespace search for that
                    # (e.g., "channel:joe*").

                    # Split query into terms for AND logic
                    terms = [t.strip() for t in query_lower.replace(',', ' ').split() if t.strip()]
                    if not terms:
                        terms = [query_lower]
                    debug(f"Performing filename/tag search for terms: {terms}")

                    # Fetch more results than requested to allow for filtering
                    fetch_limit = (limit or 45) * 50

                    # 1. Filename search (AND logic)
                    conditions = ["LOWER(f.file_path) LIKE ?" for _ in terms]
                    params = [f"%{t}%" for t in terms]
                    where_clause = " AND ".join(conditions)
                    cursor.execute(f"""
                        SELECT DISTINCT f.id, f.file_path, f.file_size
                        FROM files f
                        WHERE {where_clause}
                        ORDER BY f.file_path
                        LIMIT ?
                    """, (*params, fetch_limit))
                    rows = cursor.fetchall()
                    debug(f"Found {len(rows)} filename matches in DB (before whole-word filter)")

                    # Compile a regex for whole-word matching (single term only; skipped otherwise)
                    word_regex = None
                    if len(terms) == 1:
                        term = terms[0]
                        # Check if the term contains wildcard characters
                        has_wildcard = '*' in term or '?' in term
                        if has_wildcard:
                            # Use fnmatch for wildcard patterns (e.g., "sie*" matches "SiebeliebenWohl...")
                            try:
                                from fnmatch import translate
                                word_regex = re.compile(translate(term), re.IGNORECASE)
                            except Exception:
                                word_regex = None
                        else:
                            # Use a custom boundary that treats underscores as separators:
                            # \b treats _ as a word character, so "foo_bar" wouldn't match "bar" with \b.
                            try:
                                # Match only if not preceded or followed by alphanumeric chars
                                pattern = r'(?<![a-zA-Z0-9])' + re.escape(term) + r'(?![a-zA-Z0-9])'
                                word_regex = re.compile(pattern, re.IGNORECASE)
                            except Exception:
                                word_regex = None

                    seen_files = set()
                    for file_id, file_path_str, size_bytes in rows:
                        if not file_path_str or file_path_str in seen_files:
                            continue
                        # Apply the whole-word filter on the filename if single term
                        if word_regex:
                            p = Path(file_path_str)
                            if not word_regex.search(p.name):
                                continue
                        seen_files.add(file_path_str)
                        file_path = Path(file_path_str)
                        if file_path.exists():
                            path_str = str(file_path)
                            if size_bytes is None:
                                size_bytes = file_path.stat().st_size
                            # Fetch tags for this file
                            cursor.execute("""
                                SELECT tag FROM tags WHERE file_id = ?
                            """, (file_id,))
                            tags = [row[0] for row in cursor.fetchall()]
                            # Use the title tag if present
                            title_tag = next((t.split(':', 1)[1] for t in tags if t.lower().startswith('title:')), None)
                            results.append({
                                "name": file_path.stem,
                                "title": title_tag or file_path.stem,
                                "ext": file_path.suffix.lstrip('.'),
                                "path": path_str,
                                "target": path_str,
                                "origin": "local",
                                "size": size_bytes,
                                "size_bytes": size_bytes,
                                "tags": tags,
                            })

                    # 2. Also search for simple tags (without namespace) containing the query.
                    # With multiple terms this falls back to a single pattern search
                    # (a tag containing "term1 term2" or "term1,term2"), which is less
                    # useful for AND logic across tags but consistent with previous behavior.
                    query_pattern = f"%{query_lower}%"
                    cursor.execute("""
                        SELECT DISTINCT f.id, f.file_path, f.file_size
                        FROM files f
                        JOIN tags t ON f.id = t.file_id
                        WHERE LOWER(t.tag) LIKE ? AND LOWER(t.tag) NOT LIKE '%:%'
                        ORDER BY f.file_path
                        LIMIT ?
                    """, (query_pattern, limit or 1000))
                    tag_rows = cursor.fetchall()
                    for file_id, file_path_str, size_bytes in tag_rows:
                        if not file_path_str or file_path_str in seen_files:
                            continue
                        seen_files.add(file_path_str)
                        file_path = Path(file_path_str)
                        if file_path.exists():
                            path_str = str(file_path)
                            if size_bytes is None:
                                size_bytes = file_path.stat().st_size
                            # Fetch tags for this file
                            cursor.execute("""
                                SELECT tag FROM tags WHERE file_id = ?
                            """, (file_id,))
                            tags = [row[0] for row in cursor.fetchall()]
                            # Use the title tag if present
                            title_tag = next((t.split(':', 1)[1] for t in tags if t.lower().startswith('title:')), None)
                            results.append({
                                "name": file_path.stem,
                                "title": title_tag or file_path.stem,
                                "ext": file_path.suffix.lstrip('.'),
                                "path": path_str,
                                "target": path_str,
                                "origin": "local",
                                "size": size_bytes,
                                "size_bytes": size_bytes,
                                "tags": tags,
                            })

                    if limit is not None and len(results) >= limit:
                        return results

                else:
                    # Match all - get all files from the database
                    cursor.execute("""
                        SELECT id, file_path, file_size
                        FROM files
                        ORDER BY file_path
                        LIMIT ?
                    """, (limit or 1000,))
                    rows = cursor.fetchall()
                    for file_id, file_path_str, size_bytes in rows:
                        if file_path_str:
                            file_path = Path(file_path_str)
                            if file_path.exists():
                                path_str = str(file_path)
                                if size_bytes is None:
                                    size_bytes = file_path.stat().st_size
                                # Fetch tags for this file
                                cursor.execute("""
                                    SELECT tag FROM tags WHERE file_id = ?
                                """, (file_id,))
                                tags = [row[0] for row in cursor.fetchall()]
                                # Use the title tag if present
                                title_tag = next((t.split(':', 1)[1] for t in tags if t.lower().startswith('title:')), None)
                                results.append({
                                    "name": file_path.stem,
                                    "title": title_tag or file_path.stem,
                                    "ext": file_path.suffix.lstrip('.'),
                                    "path": path_str,
                                    "target": path_str,
                                    "origin": "local",
                                    "size": size_bytes,
                                    "size_bytes": size_bytes,
                                    "tags": tags,
                                })

                if results:
                    debug(f"Returning {len(results)} results from DB")
                else:
                    debug("No results found in DB")
                return results
        except Exception as e:
            log(f"⚠️ Database search failed: {e}", file=sys.stderr)
@@ -1175,6 +1175,161 @@ class MatrixStorageBackend(StorageBackend):
            raise


class RemoteStorageBackend(StorageBackend):
    """File storage backend for remote Android/network storage servers.

    Connects to a remote storage server (e.g., one running on an Android
    phone) via a REST API. All operations are proxied to the remote server.
    """

    def __init__(self, server_url: str, timeout: int = 30, api_key: Optional[str] = None) -> None:
        """Initialize the remote storage backend.

        Args:
            server_url: Base URL of the remote storage server
                (e.g., http://192.168.1.100:5000)
            timeout: Request timeout in seconds
            api_key: Optional API key for authentication
        """
        try:
            import requests
        except ImportError:
            raise ImportError(
                "requests library required for RemoteStorageBackend. "
                "Install with: pip install requests"
            )
        self.server_url = server_url.rstrip('/')
        self.timeout = timeout
        self.api_key = api_key
        self._session = requests.Session()
        # Add API key to default headers if provided
        if self.api_key:
            self._session.headers.update({'X-API-Key': self.api_key})

    def get_name(self) -> str:
        return "remote"
    def _request(self, method: str, endpoint: str, **kwargs) -> Dict[str, Any]:
        """Make an HTTP request to the remote server."""
        import requests
        from urllib.parse import urljoin

        url = urljoin(self.server_url, endpoint)
        try:
            response = self._session.request(
                method,
                url,
                timeout=self.timeout,
                **kwargs
            )
            if response.status_code == 404:
                raise Exception(f"Remote resource not found: {endpoint}")
            if response.status_code >= 400:
                try:
                    error_data = response.json()
                    error_msg = error_data.get('error', response.text)
                except Exception:
                    error_msg = response.text
                raise Exception(f"Remote server error {response.status_code}: {error_msg}")
            return response.json()
        except requests.exceptions.RequestException as e:
            raise Exception(f"Connection to {self.server_url} failed: {e}")
    def upload(self, file_path: Path, **kwargs: Any) -> str:
        """Upload a file to remote storage.

        Args:
            file_path: Path to the file to upload
            tags: Optional list of tags to add
            urls: Optional list of known URLs

        Returns:
            Remote file hash
        """
        if not file_path.exists():
            raise ValueError(f"File not found: {file_path}")
        try:
            # Index the file on the remote server
            data = {"path": str(file_path)}
            tags = kwargs.get("tags", [])
            if tags:
                data["tags"] = tags
            urls = kwargs.get("urls", [])
            if urls:
                data["urls"] = urls
            result = self._request('POST', '/files/index', json=data)
            file_hash = result.get('hash')
            if file_hash:
                log(f"✓ File indexed on remote storage: {file_hash}", file=sys.stderr)
                return file_hash
            else:
                raise Exception("Remote server did not return file hash")
        except Exception as exc:
            debug(f"Remote upload failed: {exc}", file=sys.stderr)
            raise
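For reference, the request body that upload() builds for POST /files/index has this shape; the path, tag, and URL values below are invented for illustration:

    payload = {
        "path": "/sdcard/Movies/clip.mp4",      # str(file_path)
        "tags": ["title:Clip"],                 # only present if tags were passed
        "urls": ["https://example.com/clip"],   # only present if urls were passed
    }
    # sent as: self._request('POST', '/files/index', json=payload)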
    def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]:
        """Search files on remote storage.

        Args:
            query: Search query
            limit: Maximum number of results

        Returns:
            List of search results
        """
        limit = kwargs.get("limit")
        try:
            limit = int(limit) if limit is not None else 100
        except (TypeError, ValueError):
            limit = 100
        if limit <= 0:
            limit = 100
        try:
            response = self._request('GET', '/files/search', params={
                'q': query,
                'limit': limit
            })
            files = response.get('files', [])
            # Transform the remote format into the standard result format
            results = []
            for f in files:
                results.append({
                    "name": f.get('name', '').split('/')[-1],  # filename from path
                    "title": f.get('name', f.get('path', '')).split('/')[-1],
                    "ext": f.get('ext', ''),
                    "path": f.get('path', ''),
                    "target": f.get('path', ''),
                    "hash": f.get('hash', ''),
                    "origin": "remote",
                    "size": f.get('size', 0),
                    "size_bytes": f.get('size', 0),
                    "tags": f.get('tags', []),
                })
            debug(f"Remote search found {len(results)} results", file=sys.stderr)
            return results
        except Exception as exc:
            log(f"❌ Remote search failed: {exc}", file=sys.stderr)
            raise
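A brief usage sketch of the new backend; the server address, API key, query, and file path below are illustrative, and it assumes a reachable remote storage server:

    from pathlib import Path

    # Hypothetical server on the LAN
    backend = RemoteStorageBackend("http://192.168.1.100:5000", timeout=10, api_key="secret")

    for hit in backend.search("channel:joe*", limit=20):
        print(hit["path"], hit["size_bytes"], hit["tags"])

    # upload() checks the path locally, then asks the server to index it
    file_hash = backend.upload(Path("Movies/clip.mp4"), tags=["title:Clip"])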

class FileStorage:
    """Unified file storage interface supporting multiple backend services.
@@ -1223,6 +1378,25 @@ class FileStorage:
        # Include Matrix backend
        self._backends["matrix"] = MatrixStorageBackend()

        # Include remote storage backends from config (for Android/network servers)
        remote_storages = config.get("remote_storages", [])
        if isinstance(remote_storages, list):
            for remote_config in remote_storages:
                if isinstance(remote_config, dict):
                    name = remote_config.get("name", "remote")
                    url = remote_config.get("url")
                    timeout = remote_config.get("timeout", 30)
                    api_key = remote_config.get("api_key")
                    if url:
                        try:
                            backend = RemoteStorageBackend(url, timeout=timeout, api_key=api_key)
                            self._backends[name] = backend
                            auth_status = " (with auth)" if api_key else " (no auth)"
                            log(f"Registered remote storage backend: {name} -> {url}{auth_status}", file=sys.stderr)
                        except Exception as e:
                            log(f"Failed to register remote storage '{name}': {e}", file=sys.stderr)

    def __getitem__(self, backend_name: str) -> StorageBackend:
        """Get a storage backend by name.