This commit is contained in:
nose
2025-12-13 12:09:50 -08:00
parent 30eb628aa3
commit 52a79b0086
16 changed files with 729 additions and 655 deletions


@@ -1842,8 +1842,21 @@ class LocalLibraryInitializer:
            self.db.connection.commit()
            self._import_sidecars_batch()
            self.db.connection.commit()
            # Ensure files without sidecars are still imported + renamed to hash.
            self._hash_and_rename_non_sidecar_media_files()
            self.db.connection.commit()
            self._cleanup_orphaned_sidecars()
            self.db.connection.commit()
            try:
                cursor = self.db.connection.cursor()
                cursor.execute("SELECT COUNT(*) FROM files")
                row = cursor.fetchone()
                self.stats['files_total_db'] = int(row[0]) if row and row[0] is not None else 0
            except Exception:
                self.stats['files_total_db'] = 0
            logger.info(f"Library scan complete. Stats: {self.stats}")
            return self.stats
@@ -1853,12 +1866,140 @@ class LocalLibraryInitializer:
            raise
        finally:
            self.db.close()
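Note: sha256_file is called throughout this commit but defined outside the diff. A minimal sketch of what such a streaming-hash helper presumably looks like (an assumption, not code from this commit):

import hashlib
from pathlib import Path

def sha256_file(path: Path, chunk_size: int = 1 << 20) -> str:
    # Assumed helper: hash file contents in 1 MiB chunks so large media
    # files are never read into memory at once.
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()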
    def _hash_and_rename_non_sidecar_media_files(self) -> None:
        """Ensure media files are hash-named even when they have no sidecars.

        This keeps the library stable across restarts:
        - New files get hashed + renamed to <sha256><ext>
        - DB file_path is updated by hash so the same file isn't re-counted as "new".
        """
        try:
            renamed = 0
            skipped_existing_target = 0
            duplicates_quarantined = 0
            for file_path in self._find_media_files():
                try:
                    if not file_path.is_file():
                        continue
                    stem = file_path.stem.lower()
                    is_hash_named = len(stem) == 64 and all(ch in "0123456789abcdef" for ch in stem)
                    if is_hash_named:
                        continue
                    # If any sidecars exist for this file, let the sidecar importer handle it.
                    if (
                        file_path.with_name(file_path.name + ".tag").exists()
                        or file_path.with_name(file_path.name + ".metadata").exists()
                        or file_path.with_name(file_path.name + ".notes").exists()
                    ):
                        continue
                    file_hash = sha256_file(file_path)
                    target_path = file_path.with_name(f"{file_hash}{file_path.suffix}")
                    # Ensure the DB entry exists with a title tag derived from the original
                    # filename. This intentionally happens BEFORE the rename.
                    self.db.get_or_create_file_entry(file_path, file_hash)
                    if target_path == file_path:
                        continue
                    if target_path.exists():
                        skipped_existing_target += 1
                        # The canonical file already exists as a hash-named file. Keep the DB
                        # pointing at the canonical hash-named path and quarantine this duplicate
                        # so it doesn't get counted as "new" again on future restarts.
                        try:
                            cursor = self.db.connection.cursor()
                            cursor.execute(
                                "UPDATE files SET file_path = ?, updated_at = CURRENT_TIMESTAMP WHERE hash = ?",
                                (str(target_path.resolve()), file_hash),
                            )
                        except Exception as exc:
                            logger.debug(f"Failed to reset DB path to canonical file for {file_hash}: {exc}")
                        try:
                            dup_dir = self.library_root / ".duplicates"
                            dup_dir.mkdir(parents=True, exist_ok=True)
                            dest = dup_dir / file_path.name
                            if dest.exists():
                                ts = int(datetime.now().timestamp())
                                dest = dup_dir / f"{file_path.stem}__dup__{ts}{file_path.suffix}"
                            logger.warning(
                                f"Duplicate content (hash={file_hash}) detected; moving {file_path} -> {dest}"
                            )
                            file_path.rename(dest)
                            duplicates_quarantined += 1
                        except Exception as exc:
                            logger.warning(
                                f"Duplicate content (hash={file_hash}) detected but could not quarantine {file_path}: {exc}"
                            )
                        continue
                    try:
                        file_path.rename(target_path)
                    except Exception as exc:
                        logger.warning(f"Failed to rename {file_path} -> {target_path}: {exc}")
                        self.stats['errors'] += 1
                        continue
                    # Update DB path by hash (more robust than matching the old path).
                    try:
                        cursor = self.db.connection.cursor()
                        cursor.execute(
                            "UPDATE files SET file_path = ?, updated_at = CURRENT_TIMESTAMP WHERE hash = ?",
                            (str(target_path.resolve()), file_hash),
                        )
                    except Exception:
                        # Best-effort: a missed path update is self-healed on the next scan,
                        # which matches files by hash rather than by path.
                        pass
                    # Ensure basic metadata exists.
                    try:
                        stat_result = target_path.stat()
                        self.db.save_metadata(
                            target_path,
                            {
                                "hash": file_hash,
                                "ext": target_path.suffix,
                                "size": stat_result.st_size,
                            },
                        )
                    except Exception:
                        # Best-effort: metadata can be backfilled on a later scan.
                        pass
                    renamed += 1
                except Exception as exc:
                    logger.warning(f"Error hashing/renaming file {file_path}: {exc}")
                    self.stats['errors'] += 1
            if renamed:
                self.stats['files_hashed_renamed'] = int(self.stats.get('files_hashed_renamed', 0) or 0) + renamed
            if skipped_existing_target:
                self.stats['files_hashed_skipped_target_exists'] = int(
                    self.stats.get('files_hashed_skipped_target_exists', 0) or 0
                ) + skipped_existing_target
            if duplicates_quarantined:
                self.stats['duplicates_quarantined'] = int(self.stats.get('duplicates_quarantined', 0) or 0) + duplicates_quarantined
        except Exception as exc:
            logger.error(f"Error hashing/renaming non-sidecar media files: {exc}", exc_info=True)
            self.stats['errors'] += 1
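The loop above treats any 64-character lowercase-hex stem as already canonical. A standalone sketch of that <sha256><ext> naming check (the helper name is illustrative, not from this commit):

from pathlib import Path

_HEX_DIGITS = set("0123456789abcdef")

def is_hash_named(path: Path) -> bool:
    # Canonical library files are named <sha256><ext>, so the stem is exactly
    # a 64-character lowercase hex digest.
    stem = path.stem.lower()
    return len(stem) == 64 and all(ch in _HEX_DIGITS for ch in stem)

# sha256("test") used as a stem is canonical; a human-given name is not.
assert is_hash_named(Path("9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08.mp4"))
assert not is_hash_named(Path("holiday-video.mp4"))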
    def _find_media_files(self) -> List[Path]:
        """Find all media files in the library folder."""
        media_files = []
        try:
            for file_path in self.library_root.rglob("*"):
                # Don't repeatedly re-scan quarantined duplicates.
                if ".duplicates" in file_path.parts:
                    continue
                if file_path.is_file() and file_path.suffix.lower() in MEDIA_EXTENSIONS:
                    media_files.append(file_path)
        except Exception as e:
@@ -1882,7 +2023,7 @@ class LocalLibraryInitializer:
logger.error(f"Error getting database files: {e}", exc_info=True)
return {}
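The signature change below narrows db_files from Dict[str, int] to Dict[str, str]. The values are not read in this hunk, so this is an inference: the map now appears to carry each row's content hash rather than its integer id. A hedged sketch of a loader matching the new annotation (column names as used elsewhere in this diff):

from pathlib import Path
from typing import Dict

def _get_database_files(self) -> Dict[str, str]:
    # Assumed shape after this commit: normalized absolute path -> stored hash.
    cursor = self.db.connection.cursor()
    cursor.execute("SELECT file_path, hash FROM files")
    return {
        str(Path(path).resolve()).lower(): file_hash
        for path, file_hash in cursor.fetchall()
    }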
-   def _process_file(self, file_path: Path, db_files: Dict[str, int]) -> None:
+   def _process_file(self, file_path: Path, db_files: Dict[str, str]) -> None:
"""Process a single media file."""
try:
normalized = str(file_path.resolve()).lower()
@@ -1890,8 +2031,23 @@ class LocalLibraryInitializer:
            if normalized in db_files:
                self.stats['files_existing'] += 1
            else:
-               self.db.get_or_create_file_entry(file_path)
-               self.stats['files_new'] += 1
+               # Path not known. If this file's hash is already in the DB, it's duplicate
+               # content and should not be counted as "new".
+               file_hash = sha256_file(file_path)
+               try:
+                   cursor = self.db.connection.cursor()
+                   cursor.execute("SELECT 1 FROM files WHERE hash = ?", (file_hash,))
+                   exists_by_hash = cursor.fetchone() is not None
+               except Exception:
+                   exists_by_hash = False
+               if exists_by_hash:
+                   self.stats['files_existing'] += 1
+                   self.stats['duplicates_found'] = int(self.stats.get('duplicates_found', 0) or 0) + 1
+                   logger.info(f"Duplicate content detected during scan (hash={file_hash}): {file_path}")
+               else:
+                   self.db.get_or_create_file_entry(file_path, file_hash)
+                   self.stats['files_new'] += 1
            self.stats['files_scanned'] += 1
        except Exception as e:
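Taken together, the scan now sorts each file into one of three buckets: known path, duplicate content under a new path, or genuinely new. A condensed sketch of that decision (table and column names as in the diff; the function name is illustrative):

import sqlite3
from pathlib import Path
from typing import Dict

def classify_file(conn: sqlite3.Connection, db_files: Dict[str, str], file_path: Path, file_hash: str) -> str:
    # Returns 'existing', 'duplicate', or 'new' for one scanned file.
    if str(file_path.resolve()).lower() in db_files:
        return "existing"  # path already tracked
    cursor = conn.cursor()
    cursor.execute("SELECT 1 FROM files WHERE hash = ?", (file_hash,))
    if cursor.fetchone() is not None:
        return "duplicate"  # same content stored under a different path
    return "new"  # neither path nor hash known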