This commit is contained in:
nose
2025-12-14 00:53:52 -08:00
parent 52a79b0086
commit a03eb0d1be
24 changed files with 2785 additions and 1868 deletions

View File

@@ -407,38 +407,53 @@ class API_folder_store:
logger.error(f"Error clearing worker log for {worker_id}: {exc}", exc_info=True)
def _migrate_metadata_schema(self, cursor) -> None:
"""Import legacy metadata from old schema if present. Existing hash-based schema is ready to use."""
"""Ensure metadata schema is up-to-date.
- If a legacy schema is detected, attempt to import/upgrade (best-effort).
- If the hash-based schema exists, add any missing columns expected by current code.
"""
try:
# Check if this is a fresh new database (hash-based schema)
cursor.execute('PRAGMA table_info(metadata)')
existing_columns = {row[1] for row in cursor.fetchall()}
# If hash column exists, we're already on the new schema
if 'hash' in existing_columns:
logger.info("Database is already using hash-based schema - no migration needed")
return
# Legacy migration: If old schema exists, try to import data
# Legacy migration: If old schema exists, try to import data.
# Old schema would have had: id (INTEGER PRIMARY KEY), file_hash (TEXT), etc.
if 'id' in existing_columns and 'file_hash' in existing_columns:
logger.info("Detected legacy metadata schema - importing to new hash-based schema")
# This would be complex legacy migration - for now just note it
logger.info("Legacy metadata table detected but import not yet implemented")
if 'hash' not in existing_columns:
if 'id' in existing_columns and 'file_hash' in existing_columns:
logger.info("Detected legacy metadata schema - importing to new hash-based schema")
# This would be complex legacy migration - for now just note it.
logger.info("Legacy metadata table detected but import not yet implemented")
return
# Unknown/unsupported schema; nothing we can safely do here.
return
# Add any missing columns to the new schema
for col_name, col_def in [('size', 'INTEGER'), ('ext', 'TEXT'),
('type', 'TEXT'),
('time_imported', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP'),
('time_modified', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP')]:
# Hash-based schema exists: add any missing columns expected by current code.
# These are safe ALTER TABLE additions for older DBs.
column_specs = {
'size': 'INTEGER',
'ext': 'TEXT',
'type': 'TEXT',
'url': 'TEXT',
'relationships': 'TEXT',
'duration': 'REAL',
'time_imported': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
'time_modified': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
'created_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
'updated_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
}
for col_name, col_def in column_specs.items():
if col_name not in existing_columns:
try:
cursor.execute(f"ALTER TABLE metadata ADD COLUMN {col_name} {col_def}")
existing_columns.add(col_name)
logger.info(f"Added '{col_name}' column to metadata table")
except Exception as e:
logger.debug(f"Column '{col_name}' may already exist: {e}")
# Populate type column from ext if not already populated
# Populate type column from ext if not already populated.
if 'type' in existing_columns and 'ext' in existing_columns:
try:
from SYS.utils_constant import get_type_from_ext
@@ -451,7 +466,7 @@ class API_folder_store:
logger.info(f"Populated type column for {len(rows)} metadata entries")
except Exception as e:
logger.debug(f"Could not populate type column: {e}")
self.connection.commit()
except Exception as e:
logger.debug(f"Note: Schema import/migration completed with status: {e}")
@@ -929,6 +944,13 @@ class API_folder_store:
if not fields:
return
# Ensure a metadata row exists so updates don't silently no-op.
# This can happen for older DBs or entries created without explicit metadata.
cursor.execute(
"INSERT OR IGNORE INTO metadata (hash) VALUES (?)",
(file_hash,),
)
values.append(file_hash)
sql = f"UPDATE metadata SET {', '.join(fields)}, time_modified = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP WHERE hash = ?"
@@ -1681,6 +1703,84 @@ class DatabaseAPI:
)
return {row[0] for row in cursor.fetchall()}
def get_file_hashes_with_any_url(self, limit: Optional[int] = None) -> Set[str]:
    """Get hashes of files that have any non-empty URL metadata.

    A URL is treated as empty when it is NULL, blank/whitespace-only, or the
    literal string '[]' (presumably an empty serialized list — confirm against
    how ``metadata.url`` is written).

    Args:
        limit: Maximum number of hashes to return. ``None`` applies a
            10000-row safety cap; an explicit ``0`` returns an empty set.

    Returns:
        Set of distinct file hashes that have a non-empty URL.
    """
    cursor = self.get_cursor()
    # NOTE: `limit or 10000` would silently turn an explicit limit of 0 into
    # the default cap, so test against None instead.
    effective_limit = 10000 if limit is None else limit
    cursor.execute(
        """
        SELECT DISTINCT f.hash
        FROM files f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
          AND TRIM(m.url) != ''
          AND TRIM(m.url) != '[]'
        LIMIT ?
        """,
        (effective_limit,),
    )
    return {row[0] for row in cursor.fetchall()}
def get_file_hashes_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> Set[str]:
    """Get hashes of files whose URL metadata matches a LIKE pattern (case-insensitive).

    Case-insensitivity is achieved by lowercasing both sides: the caller's
    pattern via ``str.lower`` and the stored value via SQL ``LOWER``.

    Args:
        like_pattern: SQL LIKE pattern (caller supplies any ``%``/``_``
            wildcards, e.g. ``'%example.com%'``).
        limit: Maximum number of hashes to return. ``None`` applies a
            10000-row safety cap; an explicit ``0`` returns an empty set.

    Returns:
        Set of distinct file hashes whose URL matches the pattern.
    """
    cursor = self.get_cursor()
    # NOTE: `limit or 10000` would silently turn an explicit limit of 0 into
    # the default cap, so test against None instead.
    effective_limit = 10000 if limit is None else limit
    cursor.execute(
        """
        SELECT DISTINCT f.hash
        FROM files f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
          AND LOWER(m.url) LIKE ?
        LIMIT ?
        """,
        (like_pattern.lower(), effective_limit),
    )
    return {row[0] for row in cursor.fetchall()}
def get_files_with_any_url(self, limit: Optional[int] = None) -> List[tuple]:
    """Get files that have any non-empty URL metadata.

    A URL is treated as empty when it is NULL, blank/whitespace-only, or the
    literal string '[]'. Size/ext are read via correlated subqueries with
    NULL coerced to ``0``/``''`` (the subqueries presumably mirror the joined
    ``m`` row when ``metadata.hash`` is unique — confirm against the schema).

    Args:
        limit: Maximum number of rows to return. ``None`` applies a
            10000-row safety cap; an explicit ``0`` returns an empty list.

    Returns:
        List of ``(hash, file_path, size, ext)`` tuples ordered by file path.
    """
    cursor = self.get_cursor()
    # NOTE: `limit or 10000` would silently turn an explicit limit of 0 into
    # the default cap, so test against None instead.
    effective_limit = 10000 if limit is None else limit
    cursor.execute(
        """
        SELECT f.hash, f.file_path,
               COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
               COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext
        FROM files f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
          AND TRIM(m.url) != ''
          AND TRIM(m.url) != '[]'
        ORDER BY f.file_path
        LIMIT ?
        """,
        (effective_limit,),
    )
    return cursor.fetchall()
def get_files_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> List[tuple]:
    """Get files whose URL metadata matches a LIKE pattern (case-insensitive).

    Case-insensitivity is achieved by lowercasing both sides: the caller's
    pattern via ``str.lower`` and the stored value via SQL ``LOWER``.
    Size/ext are read via correlated subqueries with NULL coerced to
    ``0``/``''``.

    Args:
        like_pattern: SQL LIKE pattern (caller supplies any ``%``/``_``
            wildcards, e.g. ``'%example.com%'``).
        limit: Maximum number of rows to return. ``None`` applies a
            10000-row safety cap; an explicit ``0`` returns an empty list.

    Returns:
        List of ``(hash, file_path, size, ext)`` tuples ordered by file path.
    """
    cursor = self.get_cursor()
    # NOTE: `limit or 10000` would silently turn an explicit limit of 0 into
    # the default cap, so test against None instead.
    effective_limit = 10000 if limit is None else limit
    cursor.execute(
        """
        SELECT f.hash, f.file_path,
               COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
               COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext
        FROM files f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
          AND LOWER(m.url) LIKE ?
        ORDER BY f.file_path
        LIMIT ?
        """,
        (like_pattern.lower(), effective_limit),
    )
    return cursor.fetchall()
def get_file_metadata(self, file_hashes: Set[str], limit: Optional[int] = None) -> List[tuple]:
"""Get metadata for files given their hashes. Returns (hash, file_path, size, extension) tuples."""
if not file_hashes: