# Files
# Medios-Macina/Provider/bandcamp.py
#
# 356 lines
# 12 KiB
# Python

from __future__ import annotations

import sys
from typing import Any, Dict, List, Optional
from urllib.parse import quote_plus, urljoin, urlparse

from ProviderCore.base import Provider, SearchResult
from SYS.logger import log, debug
from tool.playwright import PlaywrightTool
class Bandcamp(Provider):
    """Search provider for Bandcamp.

    Drives a Playwright page to scrape Bandcamp search results and artist
    discography pages, turning each hit into a ``SearchResult`` row for the
    "bandcamp" table.
    """

    # Maps a table name to a list of stage names.
    # NOTE(review): presumably consumed by the pipeline to auto-run these
    # stages when a row of that table is selected - confirm against the
    # Provider base class.
    TABLE_AUTO_STAGES = {
        "bandcamp": ["download-file"],
    }

    @staticmethod
    def _base_url(raw_url: str) -> str:
        """Normalize a Bandcamp URL down to ``scheme://netloc``.

        Returns "" for empty input. Input that has no scheme or no host, or
        that ``urlparse`` rejects, is returned stripped but otherwise as-is.
        """
        text = str(raw_url or "").strip()
        if not text:
            return ""
        try:
            parsed = urlparse(text)
        except ValueError:
            # urlparse raises ValueError for malformed hosts
            # (e.g. unbalanced IPv6 brackets).
            return text
        if not parsed.scheme or not parsed.netloc:
            return text
        return f"{parsed.scheme}://{parsed.netloc}"

    @classmethod
    def _discography_url(cls, raw_url: str) -> str:
        """Return the ``/music`` page URL for the site hosting *raw_url*.

        Returns "" when *raw_url* is empty.
        """
        base = cls._base_url(raw_url)
        if not base:
            return ""
        # Bandcamp discography lives under /music.
        return base.rstrip("/") + "/music"

    @staticmethod
    def _absolute_url(base: str, href: str) -> str:
        """Resolve *href* against *base*.

        Handles absolute, protocol-relative (``//host/...``), root-relative
        (``/album/x``) and plain relative (``album/x``) links.
        """
        return urljoin(base.rstrip("/") + "/", href)

    def _scrape_artist_page(
        self,
        page: Any,
        artist_url: str,
        limit: int = 50,
    ) -> List[SearchResult]:
        """Scrape an artist page for albums/tracks (discography).

        Args:
            page: Page object exposing ``goto``, ``wait_for_load_state`` and
                ``query_selector_all`` (as yielded by
                ``PlaywrightTool.open_page``).
            artist_url: Any URL on the artist's site; only its
                ``scheme://netloc`` part is used.
            limit: Maximum number of grid items to inspect.

        Returns:
            One ``SearchResult`` per usable grid item, in page order. Items
            without a link, and items whose parsing raises, are skipped.
        """
        base = self._base_url(artist_url)
        discography_url = self._discography_url(artist_url)
        if not base or not discography_url:
            return []

        debug(f"[bandcamp] Scraping artist page: {discography_url}")
        page.goto(discography_url)
        page.wait_for_load_state("domcontentloaded")

        # Prefer the <li> grid items; fall back to the bare class selector.
        cards = (
            page.query_selector_all("li.music-grid-item")
            or page.query_selector_all(".music-grid-item")
            or []
        )

        results: List[SearchResult] = []
        for item in cards[:limit]:
            try:
                link = item.query_selector("a")
                if not link:
                    continue
                href = str(link.get_attribute("href") or "").strip()
                if not href:
                    continue
                target = self._absolute_url(base, href)

                title_node = (
                    item.query_selector("p.title")
                    or item.query_selector(".title")
                )
                title = title_node.inner_text().strip() if title_node else ""
                # Collapse internal whitespace/newlines into single spaces.
                title = " ".join(title.split())
                if not title:
                    # Fall back to the last path segment; strip a trailing
                    # slash first so the fallback is not empty.
                    title = target.rstrip("/").rsplit("/", 1)[-1]

                if "/album/" in target:
                    kind = "album"
                elif "/track/" in target:
                    kind = "track"
                else:
                    kind = "item"

                results.append(
                    SearchResult(
                        table="bandcamp",
                        title=title,
                        path=target,
                        detail="",
                        annotations=[kind],
                        media_kind="audio",
                        columns=[
                            ("Title", title),
                            ("Type", kind),
                            ("Url", target),
                        ],
                        full_metadata={
                            "type": kind,
                            "url": target,
                            "artist_url": base,
                        },
                    )
                )
            except Exception as exc:
                # Best effort: one malformed card must not abort the scrape.
                debug(f"[bandcamp] Error parsing artist item: {exc}")
        return results

    @staticmethod
    def _item_payload(item: Any) -> Dict[str, Any]:
        """Coerce a selected item into a plain dict.

        Dicts are returned unchanged. Other objects are converted through
        ``to_dict()`` when that yields a non-empty dict; otherwise a dict is
        built from the ``title``/``url``/``path``/``metadata``/``extra``
        attributes (missing ones become ``None``).
        """
        if isinstance(item, dict):
            return item

        payload: Dict[str, Any] = {}
        try:
            if hasattr(item, "to_dict"):
                candidate = item.to_dict()
                # Guard against to_dict() returning a non-mapping, which
                # would break the .get() lookups done by the caller.
                if isinstance(candidate, dict):
                    payload = candidate
        except Exception:
            payload = {}
        if payload:
            return payload

        try:
            return {
                key: getattr(item, key, None)
                for key in ("title", "url", "path", "metadata", "extra")
            }
        except Exception:
            return {}

    def selector(
        self,
        selected_items: List[Any],
        *,
        ctx: Any,
        stage_is_last: bool = True,
        **_kwargs: Any
    ) -> bool:
        """Handle Bandcamp `@N` selection.

        If the selected item is an ARTIST result, selecting it auto-expands
        into a discography table by scraping the artist URL. Only the first
        selected artist is expanded.

        Args:
            selected_items: Selected rows, as dicts or objects exposing
                ``to_dict()`` or ``title``/``url``/``path``/``metadata``/
                ``extra`` attributes.
            ctx: Pipeline context; ``set_last_result_table`` and
                ``set_current_stage_table`` are called on it (best effort).
            stage_is_last: The selection is only handled when this is true.

        Returns:
            True when an artist selection was handled here (including when
            the discography lookup failed and the error was reported);
            False when there is nothing for this provider to handle.
        """
        if not stage_is_last:
            return False

        # Only handle artist selections.
        chosen: List[Dict[str, Any]] = []
        for item in selected_items or []:
            payload = self._item_payload(item)

            meta = payload.get("metadata") or payload.get("full_metadata") or {}
            if not isinstance(meta, dict):
                meta = {}
            extra = payload.get("extra")
            if isinstance(extra, dict):
                # Keys in "extra" override same-named metadata keys.
                meta = {**meta, **extra}

            type_val = str(meta.get("type") or "").strip().lower()
            if type_val != "artist":
                continue

            title = str(payload.get("title") or "").strip()
            url_val = str(
                payload.get("url") or payload.get("path") or meta.get("url") or ""
            ).strip()
            base = self._base_url(url_val)
            if not base:
                continue

            chosen.append(
                {
                    "title": title,
                    "url": base,
                    "location": str(meta.get("artist") or "").strip(),
                }
            )

        if not chosen:
            return False

        # Build a new table from artist discography.
        try:
            from SYS.result_table import ResultTable
            from SYS.rich_display import stdout_console
        except Exception:
            return False

        artist_title = chosen[0].get("title") or "artist"
        artist_url = chosen[0].get("url") or ""

        try:
            tool = PlaywrightTool({})
            tool.require()
            with tool.open_page(headless=True) as page:
                discography = self._scrape_artist_page(page, artist_url, limit=50)
        except Exception as exc:
            # Report on stderr, like search(), and treat the selection as
            # handled so it is not processed further.
            log(f"bandcamp artist lookup failed: {exc}", file=sys.stderr)
            return True

        table = ResultTable(f"Bandcamp: artist:{artist_title}").set_preserve_order(True)
        table.set_table("bandcamp")
        try:
            table.set_value_case("lower")
        except Exception as exc:
            # Best effort; the table is still usable without this setting.
            debug(f"[bandcamp] set_value_case failed: {exc}")

        results_payload: List[Dict[str, Any]] = []
        for r in discography:
            table.add_result(r)
            try:
                results_payload.append(r.to_dict())
            except Exception:
                # Minimal stand-in when the result cannot serialize itself.
                results_payload.append(
                    {
                        "table": "bandcamp",
                        "title": getattr(r, "title", ""),
                        "path": getattr(r, "path", ""),
                    }
                )

        try:
            ctx.set_last_result_table(table, results_payload)
            ctx.set_current_stage_table(table)
        except Exception as exc:
            debug(f"[bandcamp] Could not publish discography table: {exc}")

        try:
            stdout_console().print()
            stdout_console().print(table)
        except Exception as exc:
            debug(f"[bandcamp] Could not render discography table: {exc}")

        return True

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        """Search Bandcamp and return up to *limit* results.

        A query of the form ``artist:<name>`` (name optionally wrapped in
        double quotes) is sent with ``item_type=b``; any other query is sent
        with ``item_type=a``.

        Args:
            query: Search text.
            limit: Maximum number of results to return.
            filters: Accepted for interface compatibility; not used here.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            Parsed results, or an empty list when the query is empty or the
            lookup fails (failures are logged to stderr).
        """
        text = str(query or "").strip()
        if not text:
            return []

        prefix = "artist:"
        if text.lower().startswith(prefix):
            # Slice the stripped text so leading whitespace in the raw query
            # cannot shift the prefix boundary.
            term = text[len(prefix):].strip().strip('"')
            # NOTE(review): "b"/"a" are presumably Bandcamp's band/album
            # search filter codes - confirm.
            item_type = "b"
        else:
            term = text
            item_type = "a"
        if not term:
            return []

        # URL-encode the term so spaces, "&", "#" etc. survive as part of q=.
        search_url = (
            f"https://bandcamp.com/search?q={quote_plus(term)}&item_type={item_type}"
        )

        try:
            tool = PlaywrightTool({})
            tool.require()
            with tool.open_page(headless=True) as page:
                return self._scrape_url(page, search_url, limit)
        except Exception as exc:
            log(f"[bandcamp] Search error: {exc}", file=sys.stderr)
            return []

    def _scrape_url(self, page: Any, url: str, limit: int) -> List[SearchResult]:
        """Load a Bandcamp search page and parse its ``.searchresult`` items.

        Args:
            page: Page object exposing ``goto``, ``wait_for_load_state`` and
                ``query_selector_all``.
            url: Search URL to load.
            limit: Maximum number of result elements to inspect.

        Returns:
            One ``SearchResult`` per usable element, in page order. Elements
            without a heading link or link target, and elements whose
            parsing raises, are skipped.
        """
        debug(f"[bandcamp] Scraping: {url}")
        page.goto(url)
        page.wait_for_load_state("domcontentloaded")

        results: List[SearchResult] = []
        search_results = page.query_selector_all(".searchresult")
        if not search_results:
            return results

        for item in search_results[:limit]:
            try:
                heading = item.query_selector(".heading")
                link = heading.query_selector("a") if heading else None
                if not link:
                    continue

                title = link.inner_text().strip()
                target_url = str(link.get_attribute("href") or "").strip()
                if not target_url:
                    # A row without a target cannot be opened or downloaded.
                    continue
                base_url = self._base_url(target_url)
                # The "Url" column/metadata carry the site root, which is
                # what selector() feeds to the discography scrape.
                display_url = base_url or target_url

                subhead = item.query_selector(".subhead")
                artist = subhead.inner_text().strip() if subhead else "Unknown"
                itemtype = item.query_selector(".itemtype")
                media_type = itemtype.inner_text().strip() if itemtype else "album"

                results.append(
                    SearchResult(
                        table="bandcamp",
                        title=title,
                        path=target_url,
                        detail=f"By: {artist}",
                        annotations=[media_type],
                        media_kind="audio",
                        columns=[
                            ("Title", title),
                            ("Location", artist),
                            ("Type", media_type),
                            ("Url", display_url),
                        ],
                        full_metadata={
                            "artist": artist,
                            "type": media_type,
                            "url": display_url,
                        },
                    )
                )
            except Exception as exc:
                # Best effort: one malformed row must not abort the scrape.
                debug(f"[bandcamp] Error parsing result: {exc}")
        return results

    def validate(self) -> bool:
        """Report whether the provider is usable; always returns True.

        Playwright is required for the provider to function, but its
        availability is not probed here: ``search`` and ``selector`` call
        ``PlaywrightTool.require()`` themselves and report failures.
        """
        return True