Files
Medios-Macina/helper/provider.py

819 lines
29 KiB
Python
Raw Normal View History

2025-12-11 12:47:30 -08:00
"""Provider interfaces for search and file upload functionality.
This module defines two distinct provider types:
1. SearchProvider: For searching content (books, music, videos, games)
2. FileProvider: For uploading files to hosting services
No legacy code or backwards compatibility - clean, single source of truth.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Tuple
from dataclasses import dataclass, field
from pathlib import Path
import sys
import os
import json
import re
import time
import asyncio
import subprocess
import shutil
import mimetypes
import traceback
import requests
from helper.logger import log, debug
# Optional dependencies
try:
from playwright.sync_api import sync_playwright
PLAYWRIGHT_AVAILABLE = True
except ImportError:
PLAYWRIGHT_AVAILABLE = False
# ============================================================================
# SEARCH PROVIDERS
# ============================================================================
@dataclass
class SearchResult:
    """Provider-agnostic record describing a single search hit.

    Every search provider in this module returns its hits as instances of
    this class so downstream code can treat them uniformly.
    """

    # Name of the provider that produced the hit ("libgen", "soulseek", ...).
    origin: str
    # Title or filename shown to the user.
    title: str
    # What to download: a URL, path, magnet link or other identifier.
    path: str
    # Free-form secondary description.
    detail: str = ""
    # Short display labels such as ["120MB", "flac", "ready"].
    annotations: List[str] = field(default_factory=list)
    # Coarse media type: "book", "audio", "video", "game", "magnet", ...
    media_kind: str = "other"
    size_bytes: Optional[int] = None
    # Tags that can be searched on.
    tags: set[str] = field(default_factory=set)
    # (header, value) pairs used for tabular display.
    columns: List[Tuple[str, str]] = field(default_factory=list)
    # Any additional provider-specific metadata.
    full_metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dictionary for pipeline processing.

        ``tags`` and ``columns`` are copied into new lists; ``annotations``
        and ``full_metadata`` are the same objects held by this instance.
        """
        return dict(
            origin=self.origin,
            title=self.title,
            path=self.path,
            detail=self.detail,
            annotations=self.annotations,
            media_kind=self.media_kind,
            size_bytes=self.size_bytes,
            tags=list(self.tags),
            columns=list(self.columns),
            full_metadata=self.full_metadata,
        )
class SearchProvider(ABC):
    """Base class for search providers.

    Subclasses implement :meth:`search` and may override :meth:`validate`
    to report whether their dependencies/configuration are usable.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the configuration and derive the provider name.

        Args:
            config: Application configuration mapping; ``None`` (or any
                falsy value) is replaced by an empty dict.
        """
        self.config = config or {}
        # The provider name is the lowercased class name (e.g. "libgen").
        self.name = self.__class__.__name__.lower()

    @abstractmethod
    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[SearchResult]:
        """Search for items matching the query.

        Args:
            query: Search query string
            limit: Maximum results to return
            filters: Optional filtering criteria
            **kwargs: Provider-specific arguments

        Returns:
            List of SearchResult objects
        """

    def validate(self) -> bool:
        """Check if provider is available and properly configured."""
        return True
class Libgen(SearchProvider):
    """Search provider for Library Genesis books."""

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[SearchResult]:
        """Search Library Genesis and return the hits as SearchResult objects.

        The query is run through ``parse_query``; the first non-empty value
        among the ``isbn``, ``title`` and ``author`` fields, the free text and
        the raw query is what gets sent to the downloader. ``filters`` is
        accepted for interface compatibility and is not consulted.

        Any exception is logged to stderr and results in an empty list.
        """
        try:
            from helper.unified_book_downloader import UnifiedBookDownloader
            from helper.query_parser import parse_query, get_field, get_free_text

            parsed = parse_query(query)
            isbn_term = get_field(parsed, 'isbn')
            author_term = get_field(parsed, 'author')
            title_term = get_field(parsed, 'title')
            free_text = get_free_text(parsed)
            search_query = isbn_term or title_term or author_term or free_text or query

            downloader = UnifiedBookDownloader(config=self.config)
            books = downloader.search_libgen(search_query, limit=limit)
            return [
                self._book_to_result(position, book)
                for position, book in enumerate(books, 1)
            ]
        except Exception as exc:
            log(f"[libgen] Search error: {exc}", file=sys.stderr)
            return []

    def _book_to_result(self, position: int, book: Dict[str, Any]) -> SearchResult:
        """Convert one downloader book record into a SearchResult.

        ``position`` is the 1-based rank of the book in the result list and is
        stored under ``full_metadata["number"]``.
        """
        title = book.get("title", "Unknown")
        author = book.get("author", "Unknown")
        year = book.get("year", "Unknown")
        pages = book.get("pages") or book.get("pages_str") or ""
        extension = book.get("extension", "") or book.get("ext", "")
        filesize = book.get("filesize_str", "Unknown")
        isbn = book.get("isbn", "")
        mirror_url = book.get("mirror_url", "")

        detail = f"By: {author}"
        if year and year != "Unknown":
            detail += f" ({year})"

        annotations = [f"{filesize}"]
        if isbn:
            annotations.append(f"ISBN: {isbn}")

        # NOTE(review): the fallback path reads the 'id' key while the
        # metadata below reads 'book_id' - confirm which key the downloader
        # actually provides.
        target = mirror_url or f"libgen:{book.get('id', '')}"

        metadata = {
            "number": position,
            "author": author,
            "year": year,
            "isbn": isbn,
            "filesize": filesize,
            "pages": pages,
            "extension": extension,
            "book_id": book.get("book_id", ""),
            "md5": book.get("md5", ""),
        }
        return SearchResult(
            origin="libgen",
            title=title,
            path=target,
            detail=detail,
            annotations=annotations,
            media_kind="book",
            columns=[
                ("Title", title),
                ("Author", author),
                ("Pages", str(pages)),
                ("Ext", str(extension)),
            ],
            full_metadata=metadata,
        )

    def validate(self) -> bool:
        """Return True when the book downloader helper module can be imported."""
        try:
            from helper.unified_book_downloader import UnifiedBookDownloader  # noqa: F401
        except Exception:
            return False
        return True
class Soulseek(SearchProvider):
    """Search provider for Soulseek P2P network.

    Only files whose extension is in ``MUSIC_EXTENSIONS`` are returned.
    Artist, album, track number and title are guessed from each shared
    file's path and filename.
    """

    MUSIC_EXTENSIONS = {
        '.flac', '.mp3', '.m4a', '.aac', '.ogg', '.opus',
        '.wav', '.alac', '.wma', '.ape', '.aiff', '.dsf',
        '.dff', '.wv', '.tta', '.tak', '.ac3', '.dts'
    }
    # NOTE(review): login credentials are hard-coded in source; consider
    # loading them from configuration instead.
    USERNAME = "asjhkjljhkjfdsd334"
    PASSWORD = "khhhg"
    DOWNLOAD_DIR = "./downloads"
    MAX_WAIT_TRANSFER = 1200

    # "<1-3 digit track number>[. or -] <rest of the name>"
    _TRACK_RE = re.compile(r'^(\d{1,3})\s*[\.\-]?\s+(.+)$')

    async def perform_search(
        self,
        query: str,
        timeout: float = 9.0,
        limit: int = 50
    ) -> List[Dict[str, Any]]:
        """Perform async Soulseek search.

        Args:
            query: Text sent to the Soulseek network.
            timeout: Seconds to wait for results to arrive.
            limit: Maximum number of flattened file entries to return.

        Returns:
            A list of dicts with the keys ``file``, ``username``,
            ``filename`` and ``size``; empty when login or the search fails
            (the failure is logged to stderr).
        """
        from aioslsk.client import SoulSeekClient
        from aioslsk.settings import Settings, CredentialsSettings

        os.makedirs(self.DOWNLOAD_DIR, exist_ok=True)
        settings = Settings(
            credentials=CredentialsSettings(username=self.USERNAME, password=self.PASSWORD)
        )
        client = SoulSeekClient(settings)
        # A single outer try/finally guarantees the client is stopped on every
        # exit path, including a failed login after a successful start().
        try:
            try:
                await client.start()
                await client.login()
            except Exception as e:
                log(f"[soulseek] Login failed: {type(e).__name__}: {e}", file=sys.stderr)
                return []
            try:
                search_request = await client.searches.search(query)
                await self._collect_results(client, search_request, timeout=timeout)
                return self._flatten_results(search_request)[:limit]
            except Exception as e:
                log(f"[soulseek] Search error: {type(e).__name__}: {e}", file=sys.stderr)
                return []
        finally:
            try:
                await client.stop()
            except Exception as e:
                # Shutdown is best-effort; record the failure but never let
                # it mask the search outcome.
                debug(f"[soulseek] Client stop failed: {type(e).__name__}: {e}")

    def _flatten_results(self, search_request) -> List[dict]:
        """Flatten per-user search results into one list of file entries.

        For every result, the entries under ``shared_items`` come first,
        followed by those under ``locked_results``.
        """
        flat = []
        for result in search_request.results:
            username = getattr(result, "username", "?")
            for attr_name in ("shared_items", "locked_results"):
                for file_data in getattr(result, attr_name, []):
                    flat.append({
                        "file": file_data,
                        "username": username,
                        "filename": getattr(file_data, "filename", "?"),
                        "size": getattr(file_data, "filesize", 0),
                    })
        return flat

    async def _collect_results(self, client, search_request, timeout: float = 75.0) -> None:
        """Wait ``timeout`` seconds while results accumulate on ``search_request``.

        The result count is polled every 0.5s and reported via ``debug``
        whenever it grows. ``client`` is accepted but not used here.
        """
        # monotonic() keeps the wait immune to wall-clock adjustments.
        deadline = time.monotonic() + timeout
        last_count = 0
        while time.monotonic() < deadline:
            current_count = len(search_request.results)
            if current_count > last_count:
                debug(f"[soulseek] Got {current_count} result(s)...")
                last_count = current_count
            await asyncio.sleep(0.5)

    @staticmethod
    def _file_extension(filename: str) -> str:
        """Return the lowercased text after the last '.', with the dot, or ''."""
        if '.' not in filename:
            return ''
        return '.' + filename.rsplit('.', 1)[-1].lower()

    def _parse_track(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Add guessed artist/album/title/track_num/ext to a flattened entry."""
        filename = item['filename']
        # Normalise both separator styles so mixed paths split correctly.
        path_parts = filename.replace('\\', '/').split('/')
        display_name = path_parts[-1]
        # Assumed layout: .../<artist>/<album>/<file>.
        artist = path_parts[-3] if len(path_parts) >= 3 else ''
        album = path_parts[-2] if len(path_parts) >= 2 else ''

        base_name = display_name.rsplit('.', 1)[0] if '.' in display_name else display_name
        track_num = ''
        title = base_name
        match = self._TRACK_RE.match(base_name)
        if match:
            track_num = match.group(1)
            rest = match.group(2)
            if ' - ' in rest:
                # "NN Artist - Title": the artist in the filename wins over
                # the one guessed from the directory, unless it is empty.
                filename_artist, title = rest.split(' - ', 1)
                if filename_artist:
                    artist = filename_artist
            else:
                title = rest

        return {
            **item,
            'artist': artist,
            'album': album,
            'title': title,
            'track_num': track_num,
            'ext': self._file_extension(filename),
        }

    @staticmethod
    def _apply_filters(tracks: List[Dict[str, Any]], filters: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Keep tracks whose artist/album/title contain the given filter text.

        Matching is a case-insensitive substring test; the title is matched
        against the ``track`` filter. Missing or empty filters match all.
        """
        artist_filter = (filters.get('artist') or '').lower()
        album_filter = (filters.get('album') or '').lower()
        track_filter = (filters.get('track') or '').lower()
        if not (artist_filter or album_filter or track_filter):
            return tracks
        return [
            item for item in tracks
            if (not artist_filter or artist_filter in item['artist'].lower())
            and (not album_filter or album_filter in item['album'].lower())
            and (not track_filter or track_filter in item['title'].lower())
        ]

    @staticmethod
    def _track_to_result(item: Dict[str, Any]) -> SearchResult:
        """Build the SearchResult for one enriched track entry."""
        artist_display = item['artist'] if item['artist'] else "(no artist)"
        album_display = item['album'] if item['album'] else "(no album)"
        size_mb = int(item['size'] / 1024 / 1024)
        return SearchResult(
            origin="soulseek",
            title=item['title'],
            path=item['filename'],
            detail=f"{artist_display} - {album_display}",
            annotations=[f"{size_mb} MB", item['ext'].lstrip('.').upper()],
            media_kind="audio",
            size_bytes=item['size'],
            columns=[
                ("Track", item['track_num'] or "?"),
                ("Title", item['title'][:40]),
                ("Artist", artist_display[:32]),
                ("Album", album_display[:32]),
                ("Size", f"{size_mb} MB"),
            ],
            full_metadata={
                "username": item['username'],
                "filename": item['filename'],
                "artist": item['artist'],
                "album": item['album'],
                "track_num": item['track_num'],
                "ext": item['ext'],
            },
        )

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[SearchResult]:
        """Search Soulseek for music files.

        Args:
            query: Search text.
            limit: Maximum number of raw file entries fetched before the
                music-extension filter is applied.
            filters: Optional ``artist`` / ``album`` / ``track`` substrings.
            **kwargs: Ignored.

        Returns:
            Results sorted with ``.flac`` files first, then by descending
            size. Any exception is logged to stderr and yields ``[]``.
        """
        filters = filters or {}
        try:
            flat_results = asyncio.run(self.perform_search(query, timeout=9.0, limit=limit))
            tracks = [
                self._parse_track(item)
                for item in flat_results
                if self._file_extension(item['filename']) in self.MUSIC_EXTENSIONS
            ]
            tracks = self._apply_filters(tracks, filters)
            # Sort: .flac first, then by size
            tracks.sort(key=lambda item: (item['ext'] != '.flac', -item['size']))
            return [self._track_to_result(item) for item in tracks]
        except Exception as e:
            log(f"[soulseek] Search error: {e}", file=sys.stderr)
            return []

    def validate(self) -> bool:
        """Return True when the ``aioslsk`` client library can be imported."""
        try:
            from aioslsk.client import SoulSeekClient  # noqa: F401
            return True
        except ImportError:
            return False
class Bandcamp(SearchProvider):
    """Search provider for Bandcamp."""

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[SearchResult]:
        """Search Bandcamp by scraping its search page with Playwright.

        A query starting with ``artist:`` (case-insensitive) requests
        ``item_type=b``; any other query requests ``item_type=a``.

        Args:
            query: Search text, optionally prefixed with ``artist:``.
            limit: Maximum number of results to return.
            filters: Ignored.
            **kwargs: Ignored.

        Returns:
            The scraped results; ``[]`` if Playwright is unavailable or any
            error occurs (the error is logged to stderr).
        """
        if not PLAYWRIGHT_AVAILABLE:
            log("[bandcamp] Playwright not available. Install with: pip install playwright", file=sys.stderr)
            return []

        from urllib.parse import quote_plus

        # Strip before slicing so leading whitespace cannot shift the prefix.
        cleaned = query.strip()
        prefix = "artist:"
        if cleaned.lower().startswith(prefix):
            term = cleaned[len(prefix):].strip().strip('"')
            item_type = "b"
        else:
            term = cleaned
            item_type = "a"
        # Percent-encode the term so characters such as '&', '#' or '+' stay
        # part of the search text instead of altering the URL.
        search_url = f"https://bandcamp.com/search?q={quote_plus(term)}&item_type={item_type}"

        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                try:
                    page = browser.new_page()
                    return self._scrape_url(page, search_url, limit)
                finally:
                    # Close the browser even when scraping raises.
                    browser.close()
        except Exception as e:
            log(f"[bandcamp] Search error: {e}", file=sys.stderr)
            return []

    def _scrape_url(self, page, url: str, limit: int) -> List[SearchResult]:
        """Load ``url`` in ``page`` and convert up to ``limit`` result items.

        Items without a ``.heading`` element or a link inside it are skipped;
        an item that raises while being parsed is reported via ``debug`` and
        skipped.
        """
        debug(f"[bandcamp] Scraping: {url}")
        page.goto(url)
        page.wait_for_load_state("domcontentloaded")

        results = []
        for item in page.query_selector_all(".searchresult")[:limit]:
            try:
                heading = item.query_selector(".heading")
                if not heading:
                    continue
                link = heading.query_selector("a")
                if not link:
                    continue
                title = link.inner_text().strip()
                target_url = link.get_attribute("href")
                subhead = item.query_selector(".subhead")
                artist = subhead.inner_text().strip() if subhead else "Unknown"
                itemtype = item.query_selector(".itemtype")
                media_type = itemtype.inner_text().strip() if itemtype else "album"
                results.append(SearchResult(
                    origin="bandcamp",
                    title=title,
                    path=target_url,
                    detail=f"By: {artist}",
                    annotations=[media_type],
                    media_kind="audio",
                    columns=[
                        ("Name", title),
                        ("Artist", artist),
                        ("Type", media_type),
                    ],
                    full_metadata={
                        "artist": artist,
                        "type": media_type,
                    },
                ))
            except Exception as e:
                debug(f"[bandcamp] Error parsing result: {e}")
                continue
        return results

    def validate(self) -> bool:
        """Return True when Playwright was importable at module load."""
        return PLAYWRIGHT_AVAILABLE
class YouTube(SearchProvider):
    """Search provider for YouTube using yt-dlp."""

    def search(
        self,
        query: str,
        limit: int = 10,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[SearchResult]:
        """Run a ``ytsearch`` query through the ``yt-dlp`` executable.

        Args:
            query: Search text.
            limit: Number of results requested from yt-dlp.
            filters: Ignored.
            **kwargs: Ignored.

        Returns:
            One SearchResult per JSON line printed by yt-dlp; ``[]`` when
            yt-dlp is not on PATH, exits non-zero, or any error occurs
            (failures are logged to stderr).
        """
        ytdlp_path = shutil.which("yt-dlp")
        if not ytdlp_path:
            log("[youtube] yt-dlp not found in PATH", file=sys.stderr)
            return []

        cmd = [
            ytdlp_path,
            "--dump-json",
            "--flat-playlist",
            "--no-warnings",
            f"ytsearch{limit}:{query}",
        ]
        try:
            process = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace"
            )
            if process.returncode != 0:
                log(f"[youtube] yt-dlp failed: {process.stderr}", file=sys.stderr)
                return []

            results = []
            for line in process.stdout.splitlines():
                if not line.strip():
                    continue
                try:
                    video_data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Skip valid JSON that is not an object rather than letting
                # it abort the whole result set.
                if not isinstance(video_data, dict):
                    continue
                results.append(self._entry_to_result(video_data))
            return results
        except Exception as e:
            log(f"[youtube] Error: {e}", file=sys.stderr)
            return []

    @staticmethod
    def _entry_to_result(video_data: Dict[str, Any]) -> SearchResult:
        """Convert one decoded yt-dlp JSON object into a SearchResult."""
        # `or` (rather than a .get default) also covers keys that are
        # present but null in the JSON.
        title = video_data.get("title") or "Unknown"
        video_id = video_data.get("id") or ""
        url = video_data.get("url") or f"https://youtube.com/watch?v={video_id}"
        uploader = video_data.get("uploader") or "Unknown"
        duration = video_data.get("duration") or 0
        view_count = video_data.get("view_count") or 0

        duration_str = f"{int(duration//60)}:{int(duration%60):02d}" if duration else ""
        views_str = f"{view_count:,}" if view_count else ""

        # Only emit annotations that carry information.
        annotations = []
        if duration_str:
            annotations.append(duration_str)
        if views_str:
            annotations.append(f"{views_str} views")

        return SearchResult(
            origin="youtube",
            title=title,
            path=url,
            detail=f"By: {uploader}",
            annotations=annotations,
            media_kind="video",
            columns=[
                ("Title", title),
                ("Uploader", uploader),
                ("Duration", duration_str),
                ("Views", views_str),
            ],
            full_metadata={
                "video_id": video_id,
                "uploader": uploader,
                "duration": duration,
                "view_count": view_count,
            },
        )

    def validate(self) -> bool:
        """Return True when a ``yt-dlp`` executable is found on PATH."""
        return shutil.which("yt-dlp") is not None

    def pipe(self, path: str, config: Optional[Dict[str, Any]] = None) -> Optional[str]:
        """Return the playable URL for MPV (just the path for YouTube)."""
        return path
# Search provider registry: maps the lowercase lookup name accepted by
# get_search_provider() and reported by list_search_providers() to the
# provider class that is instantiated for it.
_SEARCH_PROVIDERS = {
"libgen": Libgen,
"soulseek": Soulseek,
"bandcamp": Bandcamp,
"youtube": YouTube,
}
def get_search_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[SearchProvider]:
    """Instantiate the search provider registered under ``name``.

    The lookup is case-insensitive. ``None`` is returned, with a message
    logged to stderr, when the name is not registered, when the provider's
    ``validate()`` reports it unavailable, or when construction or
    validation raises.
    """
    factory = _SEARCH_PROVIDERS.get(name.lower())
    if factory is None:
        log(f"[provider] Unknown search provider: {name}", file=sys.stderr)
        return None

    try:
        instance = factory(config)
        if instance.validate():
            return instance
        log(f"[provider] Provider '{name}' is not available", file=sys.stderr)
        return None
    except Exception as exc:
        log(f"[provider] Error initializing '{name}': {exc}", file=sys.stderr)
        return None
def list_search_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
    """Report, for each registered search provider, whether it is usable.

    A provider counts as unavailable (``False``) when its ``validate()``
    returns a falsy value or when constructing/validating it raises.
    """
    status: Dict[str, bool] = {}
    for key in _SEARCH_PROVIDERS:
        try:
            status[key] = _SEARCH_PROVIDERS[key](config).validate()
        except Exception:
            status[key] = False
    return status
# ============================================================================
# FILE PROVIDERS
# ============================================================================
class FileProvider(ABC):
    """Abstract base for providers that upload a file to a hosting service."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Keep the configuration (an empty dict when none is given) and
        derive the provider name from the lowercased class name."""
        self.config = config if config else {}
        self.name = type(self).__name__.lower()

    @abstractmethod
    def upload(self, file_path: str, **kwargs: Any) -> str:
        """Upload the file at ``file_path`` and return the resulting URL."""
        ...

    def validate(self) -> bool:
        """Tell whether the provider is available/configured (default: yes)."""
        return True
class ZeroXZero(FileProvider):
    """File provider that uploads to 0x0.st."""

    def upload(self, file_path: str, **kwargs: Any) -> str:
        """POST the file to https://0x0.st and return the response text.

        Raises:
            FileNotFoundError: ``file_path`` does not exist.
            Exception: the server answered with a status other than 200, or
                the request itself failed. Errors raised during the request
                are logged to stderr before being re-raised.
        """
        from helper.http_client import HTTPClient

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            request_headers = {"User-Agent": "Medeia-Macina/1.0"}
            with HTTPClient(headers=request_headers) as client, open(file_path, 'rb') as handle:
                response = client.post("https://0x0.st", files={"file": handle})
            if response.status_code != 200:
                raise Exception(f"Upload failed: {response.status_code} - {response.text}")
            return response.text.strip()
        except Exception as exc:
            log(f"[0x0] Upload error: {exc}", file=sys.stderr)
            raise

    def validate(self) -> bool:
        """Always available: this provider needs no configuration."""
        return True
class Matrix(FileProvider):
    """File provider for Matrix (Element) chat rooms.

    Settings are read from ``config['storage']['matrix']``: ``homeserver``,
    ``room_id`` and ``access_token``.
    """

    # File extensions that select a specific message type; anything else is
    # sent as "m.file".
    _AUDIO_EXTS = {'.mp3', '.flac', '.wav', '.m4a', '.aac', '.ogg', '.opus', '.wma', '.mka', '.alac'}
    _VIDEO_EXTS = {'.mp4', '.mkv', '.webm', '.mov', '.avi', '.flv', '.mpg', '.mpeg', '.ts', '.m4v', '.wmv'}
    _IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff'}

    def validate(self) -> bool:
        """Return True when homeserver, room_id and a credential are configured.

        NOTE(review): a ``password`` is accepted here as a credential, but
        :meth:`upload` only uses ``access_token`` - confirm whether password
        login is handled elsewhere.
        """
        if not self.config:
            return False
        matrix_conf = self.config.get('storage', {}).get('matrix', {})
        return bool(
            matrix_conf.get('homeserver') and
            matrix_conf.get('room_id') and
            (matrix_conf.get('access_token') or matrix_conf.get('password'))
        )

    def upload(self, file_path: str, **kwargs: Any) -> str:
        """Upload a file to the homeserver and post it to the configured room.

        Args:
            file_path: Path of the file to upload.
            **kwargs: Ignored.

        Returns:
            A ``https://matrix.to/#/<room_id>/<event_id>`` link to the
            message that was sent.

        Raises:
            FileNotFoundError: ``file_path`` does not exist.
            ValueError: ``homeserver``, ``room_id`` or ``access_token`` is
                missing from the configuration.
            Exception: the upload or send request did not return HTTP 200,
                or the response lacked ``content_uri`` / ``event_id``.
        """
        from urllib.parse import quote

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        matrix_conf = self.config.get('storage', {}).get('matrix', {})
        homeserver = matrix_conf.get('homeserver')
        access_token = matrix_conf.get('access_token')
        room_id = matrix_conf.get('room_id')
        # Fail early with a clear message instead of an AttributeError on a
        # missing homeserver or a request carrying "Bearer None".
        if not (homeserver and room_id and access_token):
            raise ValueError(
                "Matrix upload requires 'homeserver', 'room_id' and "
                "'access_token' in the storage.matrix configuration"
            )

        if not homeserver.startswith('http'):
            homeserver = f"https://{homeserver}"
        homeserver = homeserver.rstrip('/')

        auth_headers = {"Authorization": f"Bearer {access_token}"}
        mime_type, _ = mimetypes.guess_type(path)
        content_type = mime_type or "application/octet-stream"
        filename = path.name

        # Upload media
        upload_url = f"{homeserver}/_matrix/media/v3/upload"
        with open(path, 'rb') as f:
            resp = requests.post(
                upload_url,
                headers={**auth_headers, "Content-Type": content_type},
                data=f,
                params={"filename": filename},
            )
        if resp.status_code != 200:
            raise Exception(f"Matrix upload failed: {resp.text}")
        content_uri = resp.json().get('content_uri')
        if not content_uri:
            raise Exception("No content_uri returned")

        # Determine message type
        ext = path.suffix.lower()
        if ext in self._AUDIO_EXTS:
            msgtype = "m.audio"
        elif ext in self._VIDEO_EXTS:
            msgtype = "m.video"
        elif ext in self._IMAGE_EXTS:
            msgtype = "m.image"
        else:
            msgtype = "m.file"

        payload = {
            "msgtype": msgtype,
            "body": filename,
            "url": content_uri,
            "info": {
                "mimetype": content_type,
                "size": path.stat().st_size,
            },
        }

        # Send message. The room id is percent-encoded because it is a URL
        # path segment. Only the auth header is passed so that requests sets
        # the JSON Content-Type itself rather than reusing the file's MIME type.
        # NOTE(review): the Matrix spec defines this endpoint as
        # PUT .../send/{eventType}/{txnId}; POST without a txnId is kept from
        # the original - confirm the target homeserver accepts it.
        send_url = (
            f"{homeserver}/_matrix/client/v3/rooms/"
            f"{quote(room_id, safe='')}/send/m.room.message"
        )
        resp = requests.post(send_url, headers=auth_headers, json=payload)
        if resp.status_code != 200:
            raise Exception(f"Matrix send message failed: {resp.text}")
        event_id = resp.json().get('event_id')
        if not event_id:
            raise Exception("No event_id returned")
        return f"https://matrix.to/#/{room_id}/{event_id}"
# File provider registry: maps the lowercase lookup name accepted by
# get_file_provider() and reported by list_file_providers() to the
# provider class that is instantiated for it.
_FILE_PROVIDERS = {
"0x0": ZeroXZero,
"matrix": Matrix,
}
def get_file_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[FileProvider]:
    """Instantiate the file provider registered under ``name``.

    The lookup is case-insensitive. ``None`` is returned, with a message
    logged to stderr, when the name is not registered, when the provider's
    ``validate()`` reports it unavailable, or when construction or
    validation raises.
    """
    factory = _FILE_PROVIDERS.get(name.lower())
    if factory is None:
        log(f"[provider] Unknown file provider: {name}", file=sys.stderr)
        return None

    try:
        instance = factory(config)
        if instance.validate():
            return instance
        log(f"[provider] File provider '{name}' is not available", file=sys.stderr)
        return None
    except Exception as exc:
        log(f"[provider] Error initializing file provider '{name}': {exc}", file=sys.stderr)
        return None
def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
    """Report, for each registered file provider, whether it is usable.

    A provider counts as unavailable (``False``) when its ``validate()``
    returns a falsy value or when constructing/validating it raises.
    """
    status: Dict[str, bool] = {}
    for key in _FILE_PROVIDERS:
        try:
            status[key] = _FILE_PROVIDERS[key](config).validate()
        except Exception:
            status[key] = False
    return status