nose
2025-12-01 01:10:16 -08:00
parent 2b93edac10
commit 6b9ed7d4ab
17 changed files with 1644 additions and 470 deletions


@@ -264,9 +264,12 @@ class LocalStorageBackend(StorageBackend):
             """, (file_id,))
             all_tags = [row[0] for row in cursor.fetchall()]
+            # Use title tag if present
+            title_tag = next((t.split(':', 1)[1] for t in all_tags if t.lower().startswith('title:')), None)
+
             results.append({
                 "name": file_path.stem,
-                "title": file_path.stem,
+                "title": title_tag or file_path.stem,
                 "ext": file_path.suffix.lstrip('.'),
                 "path": path_str,
                 "target": path_str,
@@ -364,9 +367,12 @@ class LocalStorageBackend(StorageBackend):
             """, (file_id,))
             tags = [row[0] for row in cursor.fetchall()]
+            # Use title tag if present
+            title_tag = next((t.split(':', 1)[1] for t in tags if t.lower().startswith('title:')), None)
+
             results.append({
                 "name": file_path.stem,
-                "title": file_path.stem,
+                "title": title_tag or file_path.stem,
                 "ext": file_path.suffix.lstrip('.'),
                 "path": path_str,
                 "target": path_str,
@@ -410,9 +416,12 @@ class LocalStorageBackend(StorageBackend):
             """, (file_id,))
             tags = [row[0] for row in cursor.fetchall()]
+            # Use title tag if present
+            title_tag = next((t.split(':', 1)[1] for t in tags if t.lower().startswith('title:')), None)
+
             results.append({
                 "name": file_path.stem,
-                "title": file_path.stem,
+                "title": title_tag or file_path.stem,
                 "ext": file_path.suffix.lstrip('.'),
                 "path": path_str,
                 "target": path_str,
@@ -449,9 +458,12 @@ class LocalStorageBackend(StorageBackend):
             """, (file_id,))
             tags = [row[0] for row in cursor.fetchall()]
+            # Use title tag if present
+            title_tag = next((t.split(':', 1)[1] for t in tags if t.lower().startswith('title:')), None)
+
             results.append({
                 "name": file_path.stem,
-                "title": file_path.stem,
+                "title": title_tag or file_path.stem,
                 "ext": file_path.suffix.lstrip('.'),
                 "path": path_str,
                 "target": path_str,


@@ -497,6 +497,10 @@ class LocalLibraryDB:
             cursor = self.connection.cursor()
+
+            # Update file hash in files table if present
+            if metadata.get('hash'):
+                cursor.execute("UPDATE files SET file_hash = ? WHERE id = ?", (metadata['hash'], file_id))
             known_urls = metadata.get('known_urls', [])
             if not isinstance(known_urls, str):
                 known_urls = json.dumps(known_urls)
@@ -534,6 +538,72 @@ class LocalLibraryDB:
         except Exception as e:
             logger.error(f"[save_metadata] ❌ Error saving metadata for {file_path}: {e}", exc_info=True)
             raise
+
+    def save_file_info(self, file_path: Path, metadata: Dict[str, Any], tags: List[str]) -> None:
+        """Save metadata and tags for a file in a single transaction."""
+        try:
+            str_path = str(file_path.resolve())
+            logger.debug(f"[save_file_info] Starting save for: {str_path}")
+            file_id = self.get_or_create_file_entry(file_path)
+            cursor = self.connection.cursor()
+
+            # Update file hash in files table if present
+            if metadata.get('hash'):
+                cursor.execute("UPDATE files SET file_hash = ? WHERE id = ?", (metadata['hash'], file_id))
+
+            # 1. Save Metadata
+            known_urls = metadata.get('known_urls', [])
+            if not isinstance(known_urls, str):
+                known_urls = json.dumps(known_urls)
+            relationships = metadata.get('relationships', [])
+            if not isinstance(relationships, str):
+                relationships = json.dumps(relationships)
+
+            cursor.execute("""
+                INSERT INTO metadata (
+                    file_id, hash, known_urls, relationships,
+                    duration, size, ext, media_type, media_kind,
+                    time_imported, time_modified
+                )
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
+                ON CONFLICT(file_id) DO UPDATE SET
+                    hash = excluded.hash,
+                    known_urls = excluded.known_urls,
+                    relationships = excluded.relationships,
+                    duration = excluded.duration,
+                    size = excluded.size,
+                    ext = excluded.ext,
+                    media_type = excluded.media_type,
+                    media_kind = excluded.media_kind,
+                    time_modified = CURRENT_TIMESTAMP,
+                    updated_at = CURRENT_TIMESTAMP
+            """, (
+                file_id, metadata.get('hash'), known_urls, relationships,
+                metadata.get('duration'), metadata.get('size'), metadata.get('ext'),
+                metadata.get('media_type'), metadata.get('media_kind')
+            ))
+
+            # 2. Save Tags
+            # We assume tags list is complete and includes title if needed
+            cursor.execute("DELETE FROM tags WHERE file_id = ?", (file_id,))
+            for tag in tags:
+                tag = tag.strip()
+                if tag:
+                    cursor.execute("""
+                        INSERT OR IGNORE INTO tags (file_id, tag, tag_type)
+                        VALUES (?, ?, 'user')
+                    """, (file_id, tag))
+
+            self.connection.commit()
+            logger.debug(f"[save_file_info] ✅ Committed metadata and tags for file_id {file_id}")
+        except Exception as e:
+            logger.error(f"[save_file_info] ❌ Error saving file info for {file_path}: {e}", exc_info=True)
+            raise

     def get_tags(self, file_path: Path) -> List[str]:
         """Get all tags for a file."""
@@ -572,12 +642,15 @@ class LocalLibraryDB:
             cursor.execute("DELETE FROM tags WHERE file_id = ?", (file_id,))
             logger.debug(f"[save_tags] Deleted existing tags for file_id {file_id}")
-            if existing_title:
+            # Check if new tags provide a title
+            new_title_provided = any(str(t).strip().lower().startswith("title:") for t in tags)
+
+            if existing_title and not new_title_provided:
                 cursor.execute("""
                     INSERT INTO tags (file_id, tag, tag_type) VALUES (?, ?, 'user')
                 """, (file_id, existing_title[0]))
                 logger.debug(f"[save_tags] Preserved existing title tag")
-            else:
+            elif not existing_title and not new_title_provided:
                 filename_without_ext = file_path.stem
                 if filename_without_ext:
                     # Normalize underscores to spaces for consistency
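
Review note: the save_tags change above now skips both title branches whenever the incoming tags already include a "title:" tag, so a caller-supplied title wins. Separately, save_file_info relies on INSERT ... ON CONFLICT(file_id) DO UPDATE, which only fires when metadata.file_id carries a UNIQUE constraint or is the primary key. A minimal standalone demonstration of the upsert semantics (toy schema assumed for illustration; the real metadata table has more columns):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE metadata (file_id INTEGER UNIQUE, hash TEXT)")
    for h in ("aaa", "bbb"):  # the second pass updates in place instead of raising
        conn.execute("""
            INSERT INTO metadata (file_id, hash) VALUES (?, ?)
            ON CONFLICT(file_id) DO UPDATE SET hash = excluded.hash
        """, (1, h))
    print(conn.execute("SELECT file_id, hash FROM metadata").fetchall())  # [(1, 'bbb')]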


@@ -28,9 +28,16 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple
 from dataclasses import dataclass
 from pathlib import Path
 import sys
+
+try:
+    from playwright.sync_api import sync_playwright
+    PLAYWRIGHT_AVAILABLE = True
+except ImportError:
+    PLAYWRIGHT_AVAILABLE = False
+
 import subprocess
 import json
 import shutil
 from helper.logger import log, debug
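
Review note: the try/except guard keeps this module importable when Playwright is missing, and search() below degrades to an error message. Worth noting that "pip install playwright" alone does not fetch a browser; the headless Chromium launched below also requires running "playwright install chromium" once.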
@@ -1580,8 +1587,293 @@ class YoutubeSearchProvider(SearchProvider):
         return shutil.which("yt-dlp") is not None


+class BandcampProvider(SearchProvider):
+    """
+    Search provider for Bandcamp using Playwright scraper.
+    """
+
+    RESULT_FIELDS = [
+        ("name", "Name", None),
+        ("artist", "Artist/Loc", None),
+        ("type", "Type", None)
+    ]
+
+    def search(
+        self,
+        query: str,
+        limit: int = 50,
+        filters: Optional[Dict[str, Any]] = None,
+        **kwargs
+    ) -> List[SearchResult]:
+        if not PLAYWRIGHT_AVAILABLE:
+            print("Playwright library not available. Please install it (pip install playwright).")
+            return []
+
+        results = []
+        try:
+            with sync_playwright() as p:
+                # Launch browser (headless)
+                browser = p.chromium.launch(headless=True)
+                page = browser.new_page()
+
+                # Check if query is a URL (Artist/Album Scraping Mode)
+                if query.startswith("http://") or query.startswith("https://"):
+                    return self._scrape_url(page, query, limit)
+
+                # Search Mode
+                # Parse query for prefixes
+                search_type = "t"  # Default to track
+                clean_query = query
+
+                if "artist:" in query.lower():
+                    search_type = "b"
+                    clean_query = query.lower().replace("artist:", "").strip()
+                elif "album:" in query.lower():
+                    search_type = "a"
+                    clean_query = query.lower().replace("album:", "").strip()
+                elif "track:" in query.lower():
+                    search_type = "t"
+                    clean_query = query.lower().replace("track:", "").strip()
+                elif "label:" in query.lower():
+                    search_type = "b"
+                    clean_query = query.lower().replace("label:", "").strip()
+
+                # Filters override prefix
+                if filters:
+                    ftype = filters.get("type", "").lower()
+                    if ftype in ["album", "albums"]:
+                        search_type = "a"
+                    elif ftype in ["artist", "artists", "label", "labels"]:
+                        search_type = "b"
+                    elif ftype in ["track", "tracks"]:
+                        search_type = "t"
+
+                # Construct URL with item_type
+                url = f"https://bandcamp.com/search?q={clean_query}&item_type={search_type}"
+                debug(f"[Bandcamp] Navigating to search URL: {url}")
+                page.goto(url)
+                page.wait_for_load_state("domcontentloaded")
+
+                # Wait for results
+                try:
+                    # Wait for the search results to appear in the DOM
+                    page.wait_for_selector(".searchresult", timeout=10000)
+                except Exception as e:
+                    # No results found or timeout
+                    log(f"Bandcamp search timeout or no results: {e}")
+                    browser.close()
+                    return []
+
+                # Extract items
+                items = page.query_selector_all(".searchresult")
+                debug(f"[Bandcamp] Found {len(items)} results")
+
+                for item in items:
+                    if len(results) >= limit:
+                        break
+                    try:
+                        # Extract data
+                        heading_el = item.query_selector(".heading a")
+                        if not heading_el:
+                            debug("[Bandcamp] Skipping item: No heading found")
+                            continue
+
+                        name = heading_el.inner_text().strip()
+                        item_url = heading_el.get_attribute("href")
+
+                        # Clean URL (remove query params)
+                        if item_url and "?" in item_url:
+                            item_url = item_url.split("?")[0]
+
+                        item_type_el = item.query_selector(".itemtype")
+                        item_type = item_type_el.inner_text().strip() if item_type_el else "Unknown"
+
+                        subhead_el = item.query_selector(".subhead")
+                        subhead = subhead_el.inner_text().strip() if subhead_el else ""
+
+                        art_el = item.query_selector(".art img")
+                        img = art_el.get_attribute("src") if art_el else None
+
+                        # Map to metadata
+                        metadata = {
+                            "name": name,
+                            "type": item_type,
+                            "url": item_url,
+                            "img": img,
+                            "subhead": subhead
+                        }
+
+                        # Refine metadata based on type
+                        artist_or_loc = subhead
+                        if "ALBUM" in item_type.upper():
+                            artist_or_loc = subhead.replace("by ", "").strip()
+                            metadata["artist"] = artist_or_loc
+                        elif "ARTIST" in item_type.upper() or "LABEL" in item_type.upper():
+                            metadata["location"] = subhead
+                        elif "TRACK" in item_type.upper():
+                            artist_or_loc = subhead.replace("by ", "").strip()
+                            metadata["artist"] = artist_or_loc
+
+                        columns = [
+                            ("Name", name),
+                            ("Artist/Loc", artist_or_loc),
+                            ("Type", item_type)
+                        ]
+
+                        results.append(SearchResult(
+                            origin="bandcamp",
+                            title=name,
+                            target=item_url,
+                            full_metadata=metadata,
+                            columns=columns
+                        ))
+                    except Exception as e:
+                        # Skip malformed items
+                        debug(f"[Bandcamp] Error parsing item: {e}")
+                        continue
+
+                browser.close()
+        except Exception as e:
+            log(f"Bandcamp search error: {e}")
+            return []
+
+        return results
+    def _scrape_url(self, page, url: str, limit: int) -> List[SearchResult]:
+        """Scrape a Bandcamp artist or album page."""
+        debug(f"[Bandcamp] Scraping URL: {url}")
+
+        # If it's an artist page, try to go to /music to see all
+        if ".bandcamp.com" in url and "/music" not in url and "/album/" not in url and "/track/" not in url:
+            # Check if it's likely an artist root
+            url = url.rstrip("/") + "/music"
+            debug(f"[Bandcamp] Adjusted to music page: {url}")
+
+        page.goto(url)
+        page.wait_for_load_state("domcontentloaded")
+
+        results = []
+
+        # Check for grid items (Artist page /music)
+        grid_items = page.query_selector_all(".music-grid-item")
+        if grid_items:
+            debug(f"[Bandcamp] Found {len(grid_items)} grid items")
+
+            # Try to get global artist name from page metadata/header as fallback
+            page_artist = ""
+            try:
+                og_site_name = page.query_selector('meta[property="og:site_name"]')
+                if og_site_name:
+                    page_artist = og_site_name.get_attribute("content") or ""
+                if not page_artist:
+                    band_name = page.query_selector('#band-name-location .title')
+                    if band_name:
+                        page_artist = band_name.inner_text().strip()
+            except Exception:
+                pass
+
+            for item in grid_items:
+                if len(results) >= limit:
+                    break
+                try:
+                    title_el = item.query_selector(".title")
+                    # Sanitize title to remove newlines which break the table
+                    title = title_el.inner_text().strip().replace("\n", " ").replace("\r", "") if title_el else "Unknown"
+                    # Remove extra spaces
+                    title = " ".join(title.split())
+
+                    link_el = item.query_selector("a")
+                    href = link_el.get_attribute("href") if link_el else ""
+                    if href and not href.startswith("http"):
+                        # Relative link, construct full URL
+                        base = url.split("/music")[0]
+                        href = base + href
+
+                    artist_el = item.query_selector(".artist")
+                    artist = artist_el.inner_text().replace("by ", "").strip() if artist_el else ""
+                    # Use page artist if item artist is missing
+                    if not artist and page_artist:
+                        artist = page_artist
+                    # Sanitize artist
+                    artist = artist.replace("\n", " ").replace("\r", "")
+                    artist = " ".join(artist.split())
+
+                    columns = [
+                        ("Name", title),
+                        ("Artist", artist),
+                        ("Type", "Album/Track")
+                    ]
+
+                    results.append(SearchResult(
+                        origin="bandcamp",
+                        title=title,
+                        target=href,
+                        full_metadata={"artist": artist},
+                        columns=columns
+                    ))
+                except Exception as e:
+                    debug(f"[Bandcamp] Error parsing grid item: {e}")
+                    continue
+
+            return results
+
+        # Check for track list (Album page)
+        track_rows = page.query_selector_all(".track_row_view")
+        if track_rows:
+            debug(f"[Bandcamp] Found {len(track_rows)} track rows")
+
+            # Get Album Artist
+            artist_el = page.query_selector("#name-section h3 span a")
+            album_artist = artist_el.inner_text().strip() if artist_el else "Unknown"
+
+            for row in track_rows:
+                if len(results) >= limit:
+                    break
+                try:
+                    title_el = row.query_selector(".track-title")
+                    # Sanitize title
+                    title = title_el.inner_text().strip().replace("\n", " ").replace("\r", "") if title_el else "Unknown"
+                    title = " ".join(title.split())
+
+                    # Track link
+                    link_el = row.query_selector(".title a")
+                    href = link_el.get_attribute("href") if link_el else ""
+                    if href and not href.startswith("http"):
+                        base = url.split(".com")[0] + ".com"
+                        href = base + href
+
+                    duration_el = row.query_selector(".time")
+                    duration = duration_el.inner_text().strip() if duration_el else ""
+
+                    columns = [
+                        ("Name", title),
+                        ("Artist", album_artist),
+                        ("Duration", duration)
+                    ]
+
+                    results.append(SearchResult(
+                        origin="bandcamp",
+                        title=title,
+                        target=href,
+                        full_metadata={"artist": album_artist, "duration": duration},
+                        columns=columns
+                    ))
+                except Exception as e:
+                    debug(f"[Bandcamp] Error parsing track row: {e}")
+                    continue
+
+            return results
+
+        debug("[Bandcamp] No recognizable items found on page")
+        return []
+
+    def get_result_args(self) -> List[str]:
+        return ["-url"]
+
+
 # Provider registry
 _PROVIDERS = {
+    "bandcamp": BandcampProvider,
     "local": LocalStorageProvider,
     "libgen": LibGenProvider,
     "soulseek": SoulSeekProvider,