This commit is contained in:
nose
2025-12-14 00:53:52 -08:00
parent 52a79b0086
commit a03eb0d1be
24 changed files with 2785 additions and 1868 deletions

View File

@@ -407,38 +407,53 @@ class API_folder_store:
logger.error(f"Error clearing worker log for {worker_id}: {exc}", exc_info=True)
def _migrate_metadata_schema(self, cursor) -> None:
"""Import legacy metadata from old schema if present. Existing hash-based schema is ready to use."""
"""Ensure metadata schema is up-to-date.
- If a legacy schema is detected, attempt to import/upgrade (best-effort).
- If the hash-based schema exists, add any missing columns expected by current code.
"""
try:
# Check if this is a fresh new database (hash-based schema)
cursor.execute('PRAGMA table_info(metadata)')
existing_columns = {row[1] for row in cursor.fetchall()}
# If hash column exists, we're already on the new schema
if 'hash' in existing_columns:
logger.info("Database is already using hash-based schema - no migration needed")
return
# Legacy migration: If old schema exists, try to import data
# Legacy migration: If old schema exists, try to import data.
# Old schema would have had: id (INTEGER PRIMARY KEY), file_hash (TEXT), etc.
if 'id' in existing_columns and 'file_hash' in existing_columns:
logger.info("Detected legacy metadata schema - importing to new hash-based schema")
# This would be complex legacy migration - for now just note it
logger.info("Legacy metadata table detected but import not yet implemented")
if 'hash' not in existing_columns:
if 'id' in existing_columns and 'file_hash' in existing_columns:
logger.info("Detected legacy metadata schema - importing to new hash-based schema")
# This would be complex legacy migration - for now just note it.
logger.info("Legacy metadata table detected but import not yet implemented")
return
# Unknown/unsupported schema; nothing we can safely do here.
return
# Add any missing columns to the new schema
for col_name, col_def in [('size', 'INTEGER'), ('ext', 'TEXT'),
('type', 'TEXT'),
('time_imported', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP'),
('time_modified', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP')]:
# Hash-based schema exists: add any missing columns expected by current code.
# These are safe ALTER TABLE additions for older DBs.
column_specs = {
'size': 'INTEGER',
'ext': 'TEXT',
'type': 'TEXT',
'url': 'TEXT',
'relationships': 'TEXT',
'duration': 'REAL',
'time_imported': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
'time_modified': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
'created_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
'updated_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
}
for col_name, col_def in column_specs.items():
if col_name not in existing_columns:
try:
cursor.execute(f"ALTER TABLE metadata ADD COLUMN {col_name} {col_def}")
existing_columns.add(col_name)
logger.info(f"Added '{col_name}' column to metadata table")
except Exception as e:
logger.debug(f"Column '{col_name}' may already exist: {e}")
# Populate type column from ext if not already populated
# Populate type column from ext if not already populated.
if 'type' in existing_columns and 'ext' in existing_columns:
try:
from SYS.utils_constant import get_type_from_ext
@@ -451,7 +466,7 @@ class API_folder_store:
logger.info(f"Populated type column for {len(rows)} metadata entries")
except Exception as e:
logger.debug(f"Could not populate type column: {e}")
self.connection.commit()
except Exception as e:
logger.debug(f"Note: Schema import/migration completed with status: {e}")
@@ -929,6 +944,13 @@ class API_folder_store:
if not fields:
return
# Ensure a metadata row exists so updates don't silently no-op.
# This can happen for older DBs or entries created without explicit metadata.
cursor.execute(
"INSERT OR IGNORE INTO metadata (hash) VALUES (?)",
(file_hash,),
)
values.append(file_hash)
sql = f"UPDATE metadata SET {', '.join(fields)}, time_modified = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP WHERE hash = ?"
@@ -1681,6 +1703,84 @@ class DatabaseAPI:
)
return {row[0] for row in cursor.fetchall()}
def get_file_hashes_with_any_url(self, limit: Optional[int] = None) -> Set[str]:
    """Get hashes of files that have any non-empty URL metadata.

    A URL is treated as empty when it is NULL, blank/whitespace-only, or the
    literal string '[]' (presumably an empty serialized list — confirm against
    how ``metadata.url`` is written).

    Args:
        limit: Maximum number of hashes to return. ``None`` applies a
            10000-row safety cap; an explicit ``0`` returns an empty set.

    Returns:
        Set of distinct file hashes that have a non-empty URL.
    """
    cursor = self.get_cursor()
    # NOTE: `limit or 10000` would silently turn an explicit limit of 0 into
    # the default cap, so test against None instead.
    effective_limit = 10000 if limit is None else limit
    cursor.execute(
        """
        SELECT DISTINCT f.hash
        FROM files f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
          AND TRIM(m.url) != ''
          AND TRIM(m.url) != '[]'
        LIMIT ?
        """,
        (effective_limit,),
    )
    return {row[0] for row in cursor.fetchall()}
def get_file_hashes_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> Set[str]:
    """Get hashes of files whose URL metadata matches a LIKE pattern (case-insensitive).

    Case-insensitivity is achieved by lowercasing both sides: the caller's
    pattern via ``str.lower`` and the stored value via SQL ``LOWER``.

    Args:
        like_pattern: SQL LIKE pattern (caller supplies any ``%``/``_``
            wildcards, e.g. ``'%example.com%'``).
        limit: Maximum number of hashes to return. ``None`` applies a
            10000-row safety cap; an explicit ``0`` returns an empty set.

    Returns:
        Set of distinct file hashes whose URL matches the pattern.
    """
    cursor = self.get_cursor()
    # NOTE: `limit or 10000` would silently turn an explicit limit of 0 into
    # the default cap, so test against None instead.
    effective_limit = 10000 if limit is None else limit
    cursor.execute(
        """
        SELECT DISTINCT f.hash
        FROM files f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
          AND LOWER(m.url) LIKE ?
        LIMIT ?
        """,
        (like_pattern.lower(), effective_limit),
    )
    return {row[0] for row in cursor.fetchall()}
def get_files_with_any_url(self, limit: Optional[int] = None) -> List[tuple]:
    """Get files that have any non-empty URL metadata.

    A URL is treated as empty when it is NULL, blank/whitespace-only, or the
    literal string '[]'. Size/ext are read via correlated subqueries with
    NULL coerced to ``0``/``''`` (the subqueries presumably mirror the joined
    ``m`` row when ``metadata.hash`` is unique — confirm against the schema).

    Args:
        limit: Maximum number of rows to return. ``None`` applies a
            10000-row safety cap; an explicit ``0`` returns an empty list.

    Returns:
        List of ``(hash, file_path, size, ext)`` tuples ordered by file path.
    """
    cursor = self.get_cursor()
    # NOTE: `limit or 10000` would silently turn an explicit limit of 0 into
    # the default cap, so test against None instead.
    effective_limit = 10000 if limit is None else limit
    cursor.execute(
        """
        SELECT f.hash, f.file_path,
               COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
               COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext
        FROM files f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
          AND TRIM(m.url) != ''
          AND TRIM(m.url) != '[]'
        ORDER BY f.file_path
        LIMIT ?
        """,
        (effective_limit,),
    )
    return cursor.fetchall()
def get_files_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> List[tuple]:
    """Get files whose URL metadata matches a LIKE pattern (case-insensitive).

    Case-insensitivity is achieved by lowercasing both sides: the caller's
    pattern via ``str.lower`` and the stored value via SQL ``LOWER``.
    Size/ext are read via correlated subqueries with NULL coerced to
    ``0``/``''``.

    Args:
        like_pattern: SQL LIKE pattern (caller supplies any ``%``/``_``
            wildcards, e.g. ``'%example.com%'``).
        limit: Maximum number of rows to return. ``None`` applies a
            10000-row safety cap; an explicit ``0`` returns an empty list.

    Returns:
        List of ``(hash, file_path, size, ext)`` tuples ordered by file path.
    """
    cursor = self.get_cursor()
    # NOTE: `limit or 10000` would silently turn an explicit limit of 0 into
    # the default cap, so test against None instead.
    effective_limit = 10000 if limit is None else limit
    cursor.execute(
        """
        SELECT f.hash, f.file_path,
               COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
               COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext
        FROM files f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
          AND LOWER(m.url) LIKE ?
        ORDER BY f.file_path
        LIMIT ?
        """,
        (like_pattern.lower(), effective_limit),
    )
    return cursor.fetchall()
def get_file_metadata(self, file_hashes: Set[str], limit: Optional[int] = None) -> List[tuple]:
"""Get metadata for files given their hashes. Returns (hash, file_path, size, extension) tuples."""
if not file_hashes: