From 8b7f518725c0b9c63faccb6d30f15650b8859b54 Mon Sep 17 00:00:00 2001
From: Nose
Date: Mon, 12 Jan 2026 20:26:45 -0800
Subject: [PATCH] Add SQLite tuning PRAGMAs, startup maintenance, and worker/log pruning

---
 API/folder.py | 121 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 117 insertions(+), 4 deletions(-)

diff --git a/API/folder.py b/API/folder.py
index 8fbeaf1..d5b1ad3 100644
--- a/API/folder.py
+++ b/API/folder.py
@@ -25,7 +25,8 @@ from SYS.utils import sha256_file, expand_path
 from SYS.logger import debug as mm_debug
 
 logger = logging.getLogger(__name__)
-WORKER_LOG_MAX_ENTRIES = 99
+WORKER_LOG_MAX_ENTRIES = 50  # Reduced from 99 to keep per-worker log size down
+MAX_FINISHED_WORKERS = 100  # Keep at most 100 finished workers globally
 
 # Helper: decorate DB write methods to retry transient SQLITE 'database is locked' errors
 def _db_retry(max_attempts: int = 6, base_sleep: float = 0.1):
@@ -304,10 +305,22 @@ class API_folder_store:
             )
             self.connection.row_factory = sqlite3.Row
 
-            # Enable Write-Ahead Logging (WAL) for better concurrency
+            # Performance & size optimizations
+            # 1. WAL mode for better concurrency and fewer lock conflicts
             self.connection.execute("PRAGMA journal_mode=WAL")
-            # Enable foreign keys
+            # 2. auto_vacuum=FULL reclaims space from deleted rows/logs (takes effect on an existing DB only after a VACUUM)
+            self.connection.execute("PRAGMA auto_vacuum = FULL")
+            # 3. Explicit 4KB page size (SQLite's default; applies to new DBs or after a VACUUM)
+            self.connection.execute("PRAGMA page_size = 4096")
+            # 4. Memory and sync optimizations (synchronous=NORMAL is safe under WAL)
+            self.connection.execute("PRAGMA synchronous = NORMAL")
+            self.connection.execute("PRAGMA temp_store = MEMORY")
+            self.connection.execute("PRAGMA cache_size = -2000")  # negative means KiB, i.e. ~2MB
+            # Memory-map the DB file (up to ~30MB) to reduce read syscall overhead
+            self.connection.execute("PRAGMA mmap_size = 30000000")
+            # 5. Standard features
             self.connection.execute("PRAGMA foreign_keys = ON")
+
             # Bound how long sqlite will wait on locks before raising.
             try:
                 self.connection.execute("PRAGMA busy_timeout = 5000")
@@ -315,6 +328,10 @@ class API_folder_store:
                 pass
 
             self._create_tables()
+
+            # Prune old workers/logs and VACUUM if the DB has grown large
+            self._run_maintenance_if_needed()
+
             logger.info(f"Database initialized at {self.db_path}")
         except Exception as e:
             logger.error(f"Failed to initialize database: {e}", exc_info=True)
@@ -326,6 +343,84 @@ class API_folder_store:
             self.connection = None
             raise
 
+    def _run_maintenance_if_needed(self) -> None:
+        """Prune old rows, then VACUUM once at startup if the file is large."""
+        try:
+            if not self.db_path.exists():
+                return
+
+            # Global cleanup of old workers and logs regardless of size
+            self._global_cleanup()
+
+            # If the database is larger than 30MB, run a VACUUM to ensure space is reclaimed.
+            # We only do this on startup to minimize performance impact.
+            file_stats = self.db_path.stat()
+            size_mb = file_stats.st_size / (1024 * 1024)
+
+            if size_mb > 30:
+                logger.debug(f"Database size ({size_mb:.1f}MB) exceeds maintenance threshold. Vacuuming...")
+                # VACUUM rebuilds the database file and returns free pages to the OS
+                self.connection.execute("VACUUM")
+                # Refresh the query planner's statistics
+                self.connection.execute("ANALYZE")
+
+                new_size_mb = self.db_path.stat().st_size / (1024 * 1024)
+                reduction = size_mb - new_size_mb
+                if reduction > 1.0:
+                    logger.info(f"Maintenance reclaimed {reduction:.1f}MB. "
+                                f"Current size: {new_size_mb:.1f}MB")
+        except Exception as e:
+            # Maintenance should never block application startup
+            logger.warning(f"Database maintenance skipped: {e}")
+
+    def _global_cleanup(self) -> None:
+        """Aggressively prune old workers and logs to prevent database bloat."""
+        try:
+            cursor = self.connection.cursor()
+
+            # 1. Prune finished/failed workers beyond MAX_FINISHED_WORKERS,
+            #    keeping the newest by completed_at (falling back to started_at)
+            cursor.execute(
+                """
+                DELETE FROM worker
+                WHERE status != 'running'
+                AND id NOT IN (
+                    SELECT id FROM worker
+                    WHERE status != 'running'
+                    ORDER BY COALESCE(completed_at, started_at) DESC
+                    LIMIT ?
+                )
+                """,
+                (MAX_FINISHED_WORKERS,)
+            )
+            worker_deletes = cursor.rowcount
+
+            # 2. Orphan check: remove logs whose parent worker no longer exists
+            cursor.execute(
+                "DELETE FROM worker_log WHERE worker_id NOT IN (SELECT worker_id FROM worker)"
+            )
+            log_orphans = cursor.rowcount
+
+            # 3. Global cap: keep only the newest 5,000 log rows overall, even
+            #    when every worker is within its per-worker limit
+            cursor.execute(
+                """
+                DELETE FROM worker_log
+                WHERE id NOT IN (
+                    SELECT id FROM worker_log
+                    ORDER BY created_at DESC
+                    LIMIT 5000
+                )
+                """
+            )
+            log_limit_deletes = cursor.rowcount
+
+            if worker_deletes > 0 or log_orphans > 0 or log_limit_deletes > 0:
+                logger.info(f"Global cleanup: removed {worker_deletes} workers and {log_orphans + log_limit_deletes} log entries.")
+            self.connection.commit()  # commit unconditionally to close the implicit write transaction
+        except Exception as e:
+            logger.warning(f"Global cleanup failed: {e}")
+
     def _create_tables(self) -> None:
         """Create database tables if they don't exist."""
         cursor = self.connection.cursor()
@@ -417,6 +512,15 @@ class API_folder_store:
 
         # Notes indices (after migration so columns exist)
         cursor.execute("CREATE INDEX IF NOT EXISTS idx_note_hash ON note(hash)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_note_name ON note(name)")
+
+        # Additional indexes for search speed
+        # Covering index: 'tags for a given hash' can be answered from the index alone
+        cursor.execute("CREATE INDEX IF NOT EXISTS idx_tag_covering ON tag(hash, tag)")
+
+        # Indexes on metadata size and import time for common sort orders
+        cursor.execute("CREATE INDEX IF NOT EXISTS idx_metadata_size ON metadata(size)")
+        cursor.execute("CREATE INDEX IF NOT EXISTS idx_metadata_imported ON metadata(time_imported)")
+
         self.connection.commit()
         logger.debug("Database tables created/verified")
@@ -1938,8 +2042,17 @@ class API_folder_store:
                     total_steps
                 ),
             )
+            worker_rowid = cursor.lastrowid or 0  # capture before the prune DELETE resets lastrowid
+
+            # Lightweight prune on every insert: drop finished workers whose id
+            # trails the newest by more than 2x MAX_FINISHED_WORKERS
+            cursor.execute(
+                "DELETE FROM worker WHERE status != 'running' AND id < (SELECT MAX(id) - ? FROM worker)",
+                (MAX_FINISHED_WORKERS * 2,)
+            )
+
             self.connection.commit()
-            return cursor.lastrowid or 0
+            return worker_rowid
         except sqlite3.IntegrityError:
             return self.update_worker_status(worker_id, "running")
         except Exception as e:
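
Below is a minimal standalone sketch of the connection tuning this patch applies, handy for poking at the PRAGMAs outside the app. It is stdlib only; the throwaway "demo.db" path is illustrative and not a path used by API/folder.py.

import os
import sqlite3
import tempfile

# Throwaway database; "demo.db" is an illustrative name, not an app path.
db_path = os.path.join(tempfile.mkdtemp(), "demo.db")
conn = sqlite3.connect(db_path)

# Same settings as __init__ above; journal_mode/auto_vacuum/page_size persist
# in the file, the rest are per-connection and must be set on every open.
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA auto_vacuum = FULL")
conn.execute("PRAGMA page_size = 4096")
conn.execute("PRAGMA synchronous = NORMAL")
conn.execute("PRAGMA temp_store = MEMORY")
conn.execute("PRAGMA cache_size = -2000")   # negative value = KiB (~2MB)
conn.execute("PRAGMA mmap_size = 30000000") # ~30MB of memory-mapped I/O
conn.execute("PRAGMA foreign_keys = ON")
conn.execute("PRAGMA busy_timeout = 5000")  # wait up to 5s on a locked DB

# PRAGMAs fail silently on unsupported values, so read the settings back.
for name in ("journal_mode", "auto_vacuum", "page_size", "synchronous",
             "cache_size", "mmap_size", "busy_timeout"):
    print(name, "=", conn.execute(f"PRAGMA {name}").fetchone()[0])
conn.close()

Under WAL, synchronous=NORMAL only fsyncs at checkpoints, which trades a small durability window on power loss for noticeably cheaper commits.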
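The pruning hunks all use the same "delete everything except the newest N rows" shape. A toy, self-contained illustration of that pattern follows; the job table, done_at column, and KEEP constant are hypothetical stand-ins for the real worker schema and MAX_FINISHED_WORKERS.

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE job (id INTEGER PRIMARY KEY, status TEXT, done_at REAL)")
conn.executemany(
    "INSERT INTO job (status, done_at) VALUES (?, ?)",
    [("done", i) for i in range(10)] + [("running", None), ("running", None)],
)

KEEP = 3  # stand-in for MAX_FINISHED_WORKERS
conn.execute(
    """
    DELETE FROM job
    WHERE status != 'running'
      AND id NOT IN (
          SELECT id FROM job
          WHERE status != 'running'
          ORDER BY done_at DESC
          LIMIT ?
      )
    """,
    (KEEP,),
)
conn.commit()

# Running rows are untouched; only the KEEP newest finished rows survive.
print(conn.execute(
    "SELECT status, COUNT(*) FROM job GROUP BY status ORDER BY status"
).fetchall())
# -> [('done', 3), ('running', 2)]

Keeping the cutoff inside a single DELETE leaves the work to one atomic statement instead of round-tripping row ids through Python.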