This commit is contained in:
nose
2025-12-11 12:47:30 -08:00
parent 6b05dc5552
commit 65d12411a2
92 changed files with 17447 additions and 14308 deletions

View File

@@ -50,7 +50,6 @@ UrlPolicy = _utils.UrlPolicy
DownloadOptions = _download.DownloadOptions
DownloadError = _download.DownloadError
DownloadMediaResult = _download.DownloadMediaResult
download_media = _download.download_media
is_url_supported_by_ytdlp = _download.is_url_supported_by_ytdlp
probe_url = _download.probe_url
# Hydrus utilities

View File

@@ -35,7 +35,7 @@ class AllDebridClient:
"""Client for AllDebrid API."""
# Try both v4 and v3 APIs
BASE_URLS = [
BASE_url = [
"https://api.alldebrid.com/v4",
"https://api.alldebrid.com/v3",
]
@@ -49,7 +49,7 @@ class AllDebridClient:
self.api_key = api_key.strip()
if not self.api_key:
raise AllDebridError("AllDebrid API key is empty")
self.base_url = self.BASE_URLS[0] # Start with v4
self.base_url = self.BASE_url[0] # Start with v4
def _request(self, endpoint: str, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
"""Make a request to AllDebrid API.
@@ -738,7 +738,7 @@ def parse_magnet_or_hash(uri: str) -> Optional[str]:
def unlock_link_cmdlet(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Unlock a restricted link using AllDebrid.
Converts free hosters and restricted links to direct download URLs.
Converts free hosters and restricted links to direct download url.
Usage:
unlock-link <link>

View File

@@ -378,7 +378,7 @@ def download(
session: Authenticated requests.Session
n_threads: Number of download threads
directory: Directory to save images to
links: List of image URLs
links: List of image url
scale: Image resolution (0=highest, 10=lowest)
book_id: Archive.org book ID (for re-borrowing)

View File

@@ -0,0 +1,195 @@
"""Lightweight console notifier for background WorkerManager tasks.
Registers a refresh callback on WorkerManager and prints concise updates when
workers start, progress, or finish. Intended for CLI background workflows.
Filters to show only workers related to the current pipeline session to avoid
cluttering the terminal with workers from previous sessions.
"""
from __future__ import annotations
from typing import Any, Callable, Dict, Optional, Set
from helper.logger import log, debug
class BackgroundNotifier:
    """Simple notifier that prints worker status changes for a session.

    On construction the notifier registers ``_on_refresh`` with the manager
    (``add_refresh_callback``) and calls ``start_auto_refresh``. Every refresh
    hands the callback a list of worker dicts, which are turned into concise
    one-line updates and passed to ``output``.

    Reporting modes (checked in this order):

    * ``overlay_mode``: only terminal statuses are emitted; after the pass an
      empty string is sent to ``output`` when no tracked worker is still
      active.
    * ``only_terminal_updates``: a worker is reported once, when it reaches a
      terminal status.
    * default (verbose): a line is emitted whenever the worker's
      ``status|progress|step`` signature changes.

    Args:
        manager: Object exposing ``add_refresh_callback`` and
            ``start_auto_refresh``.
        output: Callable receiving each rendered line.
        session_worker_ids: Worker IDs to report. ``None`` disables filtering;
            a set (even an empty one) restricts output to its members.
        only_terminal_updates: Report workers only when they finish.
        overlay_mode: Report only completions and clear the overlay when idle.
    """

    # Statuses after which a worker is considered done.
    _TERMINAL_STATUSES = ("completed", "finished", "error")

    def __init__(
        self,
        manager: Any,
        output: Callable[[str], None] = log,
        session_worker_ids: Optional[Set[str]] = None,
        only_terminal_updates: bool = False,
        overlay_mode: bool = False,
    ) -> None:
        self.manager = manager
        self.output = output
        self.session_worker_ids = session_worker_ids if session_worker_ids is not None else set()
        self.only_terminal_updates = only_terminal_updates
        self.overlay_mode = overlay_mode
        # Filtering is keyed on whether a set was supplied, not on its contents.
        self._filter_enabled = session_worker_ids is not None
        # worker id -> last emitted signature (or bare status in terminal-only mode)
        self._last_state: Dict[str, str] = {}

        try:
            self.manager.add_refresh_callback(self._on_refresh)
            self.manager.start_auto_refresh()
        except Exception as exc:  # pragma: no cover - best effort
            debug(f"[notifier] Could not attach refresh callback: {exc}")

    @staticmethod
    def _resolve_worker_id(worker: Dict[str, Any]) -> str:
        """Return the worker's ``worker_id``, falling back to ``id``; "" if neither is set."""
        for key in ("worker_id", "id"):
            candidate = str(worker.get(key) or "").strip()
            if candidate:
                return candidate
        return ""

    @staticmethod
    def _signature(worker: Dict[str, Any], status: str) -> str:
        """Build the ``status|progress|step`` string used to detect changes."""
        progress_val = worker.get("progress") or worker.get("progress_percent") or ""
        step = str(worker.get("current_step") or worker.get("description") or "").strip()
        return f"{status}|{progress_val}|{step}"

    def _emit(self, worker: Dict[str, Any]) -> None:
        """Render *worker* and pass the line to ``output``; output errors are ignored."""
        line = self._render_line(worker)
        if not line:
            return
        try:
            self.output(line)
        except Exception:
            # Notifications are best effort; a failing sink must not break the refresh loop.
            pass

    def _render_line(self, worker: Dict[str, Any]) -> Optional[str]:
        """Format one worker dict as ``[worker:<id>] <status><progress> - <step>``.

        Returns ``None`` when the worker carries neither ``worker_id`` nor ``id``.
        """
        worker_id = self._resolve_worker_id(worker)
        if not worker_id:
            return None

        status = str(worker.get("status") or "running")
        progress_val = worker.get("progress") or worker.get("progress_percent")
        progress = ""
        if isinstance(progress_val, (int, float)):
            progress = f" {progress_val:.1f}%"
        elif progress_val:
            progress = f" {progress_val}"

        step = str(worker.get("current_step") or worker.get("description") or "").strip()
        parts = [f"[worker:{worker_id}] {status}{progress}"]
        if step:
            parts.append(step)
        return " - ".join(parts)

    def _on_refresh(self, workers: list[Dict[str, Any]]) -> None:
        """Refresh callback: emit updates for the given workers per the active mode."""
        overlay_active_workers = 0

        for worker in workers:
            worker_id = self._resolve_worker_id(worker)
            if not worker_id:
                continue

            # If filtering is enabled, skip workers not in this session
            if self._filter_enabled and worker_id not in self.session_worker_ids:
                continue

            status = str(worker.get("status") or "running")
            is_terminal = status in self._TERMINAL_STATUSES

            # Overlay mode: only emit on completion; suppress start/progress spam
            if self.overlay_mode:
                if not is_terminal:
                    # Still running: remember it so the overlay is not cleared below.
                    overlay_active_workers += 1
                    continue
                if self._last_state.get(worker_id) == self._signature(worker, status):
                    continue
                self._emit(worker)
                # Stop tracking this worker once its completion has been reported.
                self._last_state.pop(worker_id, None)
                self.session_worker_ids.discard(worker_id)
                continue

            # Terminal-only mode: emit once when the worker finishes, skip intermediate updates
            if self.only_terminal_updates:
                if is_terminal and self._last_state.get(worker_id) != status:
                    self._last_state[worker_id] = status
                    self._emit(worker)
                    # Stop tracking this worker after terminal notification
                    self.session_worker_ids.discard(worker_id)
                continue

            # Standard verbose mode: a finished worker that is already tracked is dropped.
            # NOTE(review): a worker previously shown as running is removed here without
            # its final status line being emitted - confirm that this is intended.
            if is_terminal and worker_id in self._last_state:
                self._last_state.pop(worker_id, None)
                self.session_worker_ids.discard(worker_id)
                continue

            signature = self._signature(worker, status)
            if self._last_state.get(worker_id) == signature:
                continue
            self._last_state[worker_id] = signature
            self._emit(worker)

        # If nothing is active for this session, clear the overlay text
        if self.overlay_mode and overlay_active_workers == 0:
            try:
                self.output("")
            except Exception:
                pass
def ensure_background_notifier(
    manager: Any,
    output: Callable[[str], None] = log,
    session_worker_ids: Optional[Set[str]] = None,
    only_terminal_updates: bool = False,
    overlay_mode: bool = False,
) -> Optional[BackgroundNotifier]:
    """Return the manager's BackgroundNotifier, creating and attaching one if needed.

    Args:
        manager: WorkerManager instance; ``None`` yields ``None``.
        output: Function to call for printing updates (used only when a new
            notifier is created).
        session_worker_ids: Set of worker IDs belonging to this pipeline session.
            If None, show all workers. If a set (even empty), only show workers
            in that set. For an existing notifier the IDs are merged in.
        only_terminal_updates: Request finish-only reporting; once enabled on an
            existing notifier it is never switched off here.
        overlay_mode: Request overlay reporting; likewise only ever switched on.

    Returns:
        The notifier stored on ``manager._background_notifier`` (existing or new).
    """
    if manager is None:
        return None

    attached = getattr(manager, "_background_notifier", None)

    if not isinstance(attached, BackgroundNotifier):
        created = BackgroundNotifier(
            manager,
            output,
            session_worker_ids=session_worker_ids,
            only_terminal_updates=only_terminal_updates,
            overlay_mode=overlay_mode,
        )
        try:
            manager._background_notifier = created  # type: ignore[attr-defined]
        except Exception:
            # Manager may not accept new attributes; the notifier still works unattached.
            pass
        return created

    # Reuse the attached notifier, only ever tightening its settings.
    if overlay_mode:
        attached.overlay_mode = True
    if only_terminal_updates:
        attached.only_terminal_updates = True
    if session_worker_ids is not None:
        attached._filter_enabled = True
        attached.session_worker_ids.update(session_worker_ids)
    return attached

223
helper/cmdlet_catalog.py Normal file
View File

@@ -0,0 +1,223 @@
from __future__ import annotations
from importlib import import_module
from typing import Any, Dict, List, Optional
try:
from cmdlets import REGISTRY
except Exception:
REGISTRY = {} # type: ignore
try:
from cmdnats import register_native_commands as _register_native_commands
except Exception:
_register_native_commands = None
def ensure_registry_loaded() -> None:
    """Register native commands into REGISTRY when the registrar is available.

    Does nothing if ``cmdnats`` could not be imported. Any error raised by the
    registrar is ignored so catalog lookups keep working.
    """
    if not _register_native_commands or REGISTRY is None:
        return
    try:
        _register_native_commands(REGISTRY)
    except Exception:
        # Best effort: a failing registrar must not break metadata queries.
        pass
def _normalize_mod_name(mod_name: str) -> str:
"""Normalize a command/module name for import resolution."""
normalized = (mod_name or "").strip()
if normalized.startswith('.'):
normalized = normalized.lstrip('.')
normalized = normalized.replace('-', '_')
return normalized
def import_cmd_module(mod_name: str):
    """Import a command module, trying ``cmdnats``, then ``cmdlets``, then top level.

    Returns the first module that imports successfully, or ``None`` when the
    name is empty after normalization or every candidate fails to import.
    """
    target = _normalize_mod_name(mod_name)
    if not target:
        return None

    candidates = (f"cmdnats.{target}", f"cmdlets.{target}", target)
    for dotted_name in candidates:
        try:
            return import_module(dotted_name)
        except Exception:
            # Missing module or import-time failure: move on to the next candidate.
            continue
    return None
def _normalize_arg(arg: Any) -> Dict[str, Any]:
"""Convert a CmdletArg/dict into a plain metadata dict."""
if isinstance(arg, dict):
name = arg.get("name", "")
return {
"name": str(name).lstrip("-"),
"type": arg.get("type", "string"),
"required": bool(arg.get("required", False)),
"description": arg.get("description", ""),
"choices": arg.get("choices", []) or [],
"alias": arg.get("alias", ""),
"variadic": arg.get("variadic", False),
}
name = getattr(arg, "name", "") or ""
return {
"name": str(name).lstrip("-"),
"type": getattr(arg, "type", "string"),
"required": bool(getattr(arg, "required", False)),
"description": getattr(arg, "description", ""),
"choices": getattr(arg, "choices", []) or [],
"alias": getattr(arg, "alias", ""),
"variadic": getattr(arg, "variadic", False),
}
def get_cmdlet_metadata(cmd_name: str) -> Optional[Dict[str, Any]]:
    """Look up a cmdlet's ``CMDLET`` descriptor and return it in normalized form.

    The descriptor is first sought on the module imported for ``cmd_name``; if
    that yields nothing, the function registered under the dashed, lowercased
    name is used to locate its defining module instead (this covers aliases).

    Returns:
        Dict with ``name``, ``aliases``, ``usage``, ``summary``, ``details``,
        ``args`` (normalized) and ``raw`` (the descriptor itself), or ``None``
        when no descriptor is found.
    """
    ensure_registry_loaded()

    module = import_cmd_module(cmd_name.replace("-", "_"))
    data = getattr(module, "CMDLET", None) if module else None

    if data is None:
        # Fallback: resolve via the registered function's module.
        try:
            registered = (REGISTRY or {}).get(cmd_name.replace('_', '-').lower())
            owner_name = getattr(registered, "__module__", "") if registered else ""
            if owner_name:
                data = getattr(import_module(owner_name), "CMDLET", None)
        except Exception:
            data = None

    if not data:
        return None

    if hasattr(data, "to_dict"):
        base = data.to_dict()
    elif isinstance(data, dict):
        base = data
    else:
        base = {}

    def pick(field: str, default: Any) -> Any:
        # Attribute on the descriptor wins; otherwise use its dict form.
        return getattr(data, field, base.get(field, default))

    display_name = pick("name", cmd_name) or cmd_name
    alias_values = pick("aliases", []) or []
    raw_args = pick("args", []) or []

    return {
        "name": str(display_name).replace("_", "-").lower(),
        "aliases": [str(alias).replace("_", "-").lower() for alias in alias_values if alias],
        "usage": pick("usage", ""),
        "summary": pick("summary", ""),
        "details": pick("details", []) or [],
        "args": [_normalize_arg(item) for item in raw_args],
        "raw": data,
    }
def list_cmdlet_metadata() -> Dict[str, Dict[str, Any]]:
    """Collect metadata for all registered cmdlets keyed by canonical name.

    Registry names that resolve to the same canonical cmdlet are merged: their
    aliases are unioned (the registry name itself becomes an alias when it
    differs from the canonical name) and empty ``usage``/``summary``/
    ``details``/``args``/``raw`` fields are filled from the first entry that
    provides them. Names without metadata get an empty placeholder entry.

    Returns:
        Mapping of canonical cmdlet name to its metadata dict.
    """
    ensure_registry_loaded()
    entries: Dict[str, Dict[str, Any]] = {}

    # Iterate over a snapshot of the keys: get_cmdlet_metadata() imports cmdlet
    # modules and re-runs native registration, either of which may add entries
    # to REGISTRY and would otherwise raise "dictionary changed size during
    # iteration".
    for reg_name in list((REGISTRY or {}).keys()):
        meta = get_cmdlet_metadata(reg_name)
        canonical = str(reg_name).replace("_", "-").lower()

        if not meta:
            entries.setdefault(
                canonical,
                {"name": canonical, "aliases": [], "usage": "", "summary": "", "details": [], "args": [], "raw": None},
            )
            continue

        canonical = meta.get("name", canonical)
        aliases = meta.get("aliases", [])
        base = entries.get(
            canonical,
            {
                "name": canonical,
                "aliases": [],
                "usage": "",
                "summary": "",
                "details": [],
                "args": [],
                "raw": meta.get("raw"),
            },
        )

        merged_aliases = set(base.get("aliases", [])) | set(aliases)
        if canonical != reg_name:
            merged_aliases.add(reg_name)
        base["aliases"] = sorted(a for a in merged_aliases if a and a != canonical)

        # Keep whatever an earlier registration already supplied.
        for field in ("usage", "summary", "details", "args"):
            if not base.get(field) and meta.get(field):
                base[field] = meta[field]
        if not base.get("raw"):
            base["raw"] = meta.get("raw")

        entries[canonical] = base

    return entries
def list_cmdlet_names(include_aliases: bool = True) -> List[str]:
    """List every known cmdlet name in sorted order.

    Args:
        include_aliases: When true, aliases are listed alongside canonical names.

    Returns:
        Sorted, de-duplicated names with empty entries dropped.
    """
    ensure_registry_loaded()
    catalog = list_cmdlet_metadata()

    collected = set()
    for info in catalog.values():
        collected.add(info.get("name", ""))
        if include_aliases:
            collected.update(info.get("aliases", []))

    return sorted(label for label in collected if label)
def get_cmdlet_arg_flags(cmd_name: str) -> List[str]:
    """List the command-line flags a cmdlet accepts (e.g., -name/--name).

    If the raw descriptor offers ``build_flag_registry``, the sorted union of
    its flag sets is returned. Otherwise, or if that call fails, flags are
    derived from the normalized args: ``-name`` and ``--name`` for each
    argument, plus ``-alias`` when an alias is declared.

    Returns:
        List of flag strings; empty when the cmdlet has no metadata.
    """
    meta = get_cmdlet_metadata(cmd_name)
    if not meta:
        return []

    descriptor = meta.get("raw")
    if descriptor and hasattr(descriptor, "build_flag_registry"):
        try:
            flag_groups = descriptor.build_flag_registry().values()
            return sorted({flag for group in flag_groups for flag in group})
        except Exception:
            # Fall through to the args-based derivation below.
            pass

    derived: List[str] = []
    for spec in meta.get("args", []):
        arg_name = spec.get("name")
        if not arg_name:
            continue
        derived.extend((f"-{arg_name}", f"--{arg_name}"))
        short = spec.get("alias")
        if short:
            derived.append(f"-{short}")
    return derived
def get_cmdlet_arg_choices(cmd_name: str, arg_name: str) -> List[str]:
    """Look up the allowed values declared for one argument of a cmdlet.

    Args:
        cmd_name: Cmdlet name to look up.
        arg_name: Argument name; leading dashes are ignored.

    Returns:
        A copy of the first matching argument's choices, or an empty list when
        the cmdlet or the argument is unknown.
    """
    meta = get_cmdlet_metadata(cmd_name)
    if not meta:
        return []

    wanted = arg_name.lstrip("-")
    found = next((spec for spec in meta.get("args", []) if spec.get("name") == wanted), None)
    if found is None:
        return []
    return list(found.get("choices", []) or [])

View File

@@ -28,7 +28,6 @@ from helper.logger import log, debug
from .utils import ensure_directory, sha256_file
from .http_client import HTTPClient
from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar
from hydrus_health_check import get_cookies_file_path
try:
import yt_dlp # type: ignore
@@ -145,7 +144,7 @@ def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[s
return None
def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str]) -> tuple[Optional[str], Dict[str, Any]]:
def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str], quiet: bool = False) -> tuple[Optional[str], Dict[str, Any]]:
"""Download each section separately so merge-file can combine them.
yt-dlp with multiple --download-sections args merges them into one file.
@@ -204,11 +203,14 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
info_dict = json.loads(meta_result.stdout.strip())
first_section_info = info_dict
title_from_first = info_dict.get('title')
debug(f"Extracted title from metadata: {title_from_first}")
if not quiet:
debug(f"Extracted title from metadata: {title_from_first}")
except json.JSONDecodeError:
debug("Could not parse JSON metadata")
if not quiet:
debug("Could not parse JSON metadata")
except Exception as e:
debug(f"Error extracting metadata: {e}")
if not quiet:
debug(f"Error extracting metadata: {e}")
# Build yt-dlp command for downloading this section
cmd = ["yt-dlp"]
@@ -240,8 +242,9 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
# Add the URL
cmd.append(url)
debug(f"Running yt-dlp for section {section_idx}/{len(sections_list)}: {section}")
debug(f"Command: {' '.join(cmd)}")
if not quiet:
debug(f"Running yt-dlp for section {section_idx}/{len(sections_list)}: {section}")
debug(f"Command: {' '.join(cmd)}")
# Run the subprocess - don't capture output so progress is shown
try:
@@ -273,13 +276,15 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
"fragment_retries": 10,
"http_chunk_size": 10_485_760,
"restrictfilenames": True,
"progress_hooks": [_progress_callback],
"progress_hooks": [] if opts.quiet else [_progress_callback],
}
if opts.cookies_path and opts.cookies_path.is_file():
base_options["cookiefile"] = str(opts.cookies_path)
else:
# Check global cookies file
# Check global cookies file lazily to avoid import cycles
from hydrus_health_check import get_cookies_file_path # local import
global_cookies = get_cookies_file_path()
if global_cookies:
base_options["cookiefile"] = global_cookies
@@ -287,7 +292,7 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
# Fallback to browser cookies
base_options["cookiesfrombrowser"] = ("chrome",)
# Add no-playlist option if specified (for single video from playlist URLs)
# Add no-playlist option if specified (for single video from playlist url)
if opts.no_playlist:
base_options["noplaylist"] = True
@@ -336,7 +341,8 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
if opts.playlist_items:
base_options["playlist_items"] = opts.playlist_items
debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
if not opts.quiet:
debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
return base_options
@@ -411,8 +417,8 @@ def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
"""Extract the actual download link from LibGen redirect URL.
LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to
actual mirror URLs. This follows the redirect chain to get the real file.
LibGen url like https://libgen.gl/file.php?id=123456 redirect to
actual mirror url. This follows the redirect chain to get the real file.
Args:
libgen_url: LibGen file.php URL
@@ -491,6 +497,7 @@ def _download_direct_file(
url: str,
output_dir: Path,
debug_logger: Optional[DebugLogger] = None,
quiet: bool = False,
) -> DownloadMediaResult:
"""Download a direct file (PDF, image, document, etc.) without yt-dlp."""
ensure_directory(output_dir)
@@ -535,9 +542,11 @@ def _download_direct_file(
extracted_name = match.group(1) or match.group(2)
if extracted_name:
filename = unquote(extracted_name)
debug(f"Filename from Content-Disposition: {filename}")
if not quiet:
debug(f"Filename from Content-Disposition: {filename}")
except Exception as e:
log(f"Could not get filename from headers: {e}", file=sys.stderr)
if not quiet:
log(f"Could not get filename from headers: {e}", file=sys.stderr)
# Fallback if we still don't have a good filename
if not filename or "." not in filename:
@@ -546,7 +555,8 @@ def _download_direct_file(
file_path = output_dir / filename
progress_bar = ProgressBar()
debug(f"Direct download: {filename}")
if not quiet:
debug(f"Direct download: {filename}")
try:
start_time = time.time()
@@ -577,7 +587,8 @@ def _download_direct_file(
speed_str=speed_str,
eta_str=eta_str,
)
debug(progress_line)
if not quiet:
debug(progress_line)
last_progress_time[0] = now
with HTTPClient(timeout=30.0) as client:
@@ -585,7 +596,8 @@ def _download_direct_file(
elapsed = time.time() - start_time
avg_speed_str = progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + "/s"
debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")
if not quiet:
debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")
# For direct file downloads, create minimal info dict without filename as title
# This prevents creating duplicate title: tags when filename gets auto-generated
@@ -658,375 +670,98 @@ def _download_direct_file(
raise DownloadError(f"Error downloading file: {exc}") from exc
def probe_url(url: str, no_playlist: bool = False) -> Optional[Dict[str, Any]]:
def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> Optional[Dict[str, Any]]:
"""Probe URL to extract metadata WITHOUT downloading.
Args:
url: URL to probe
no_playlist: If True, ignore playlists and probe only the single video
timeout_seconds: Max seconds to wait for probe (default 15s)
Returns:
Dict with keys: extractor, title, entries (if playlist), duration, etc.
Returns None if not supported by yt-dlp.
Returns None if not supported by yt-dlp or on timeout.
"""
if not is_url_supported_by_ytdlp(url):
return None
_ensure_yt_dlp_ready()
# Wrap probe in timeout to prevent hanging on large playlists
import threading
from typing import cast
assert yt_dlp is not None
try:
# Extract info without downloading
# Use extract_flat='in_playlist' to get full metadata for playlist items
ydl_opts = {
"quiet": True, # Suppress all output
"no_warnings": True,
"socket_timeout": 10,
"retries": 3,
"skip_download": True, # Don't actually download
"extract_flat": "in_playlist", # Get playlist with metadata for each entry
"noprogress": True, # No progress bars
}
# Add cookies if available
global_cookies = get_cookies_file_path()
if global_cookies:
ydl_opts["cookiefile"] = global_cookies
# Add no_playlist option if specified
if no_playlist:
ydl_opts["noplaylist"] = True
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False)
if not isinstance(info, dict):
return None
# Extract relevant fields
return {
"extractor": info.get("extractor", ""),
"title": info.get("title", ""),
"entries": info.get("entries", []), # Will be populated if playlist
"duration": info.get("duration"),
"uploader": info.get("uploader"),
"description": info.get("description"),
"url": url,
}
except Exception as exc:
log(f"Probe failed for {url}: {exc}")
return None
def download_media(
opts: DownloadOptions,
*,
debug_logger: Optional[DebugLogger] = None,
) -> DownloadMediaResult:
"""Download media from URL using yt-dlp or direct HTTP download.
result_container: List[Optional[Any]] = [None, None] # [result, error]
Args:
opts: DownloadOptions with url, mode, output_dir, etc.
debug_logger: Optional debug logger for troubleshooting
Returns:
DownloadMediaResult with path, info, tags, hash
Raises:
DownloadError: If download fails
"""
# Handle LibGen URLs specially
# file.php redirects to mirrors, get.php is direct from modern API
if 'libgen' in opts.url.lower():
if '/get.php' in opts.url.lower():
# Modern API get.php links are direct downloads from mirrors (not file redirects)
log(f"Detected LibGen get.php URL, downloading directly...")
if debug_logger is not None:
debug_logger.write_record("libgen-direct", {"url": opts.url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
elif '/file.php' in opts.url.lower():
# Old-style file.php redirects to mirrors, we need to resolve
log(f"Detected LibGen file.php URL, resolving to actual mirror...")
actual_url = _get_libgen_download_url(opts.url)
if actual_url and actual_url != opts.url:
log(f"Resolved LibGen URL to mirror: {actual_url}")
opts.url = actual_url
# After resolution, this will typically be an onion link or direct file
# Skip yt-dlp for this (it won't support onion/mirrors), go direct
if debug_logger is not None:
debug_logger.write_record("libgen-resolved", {"original": opts.url, "resolved": actual_url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
else:
log(f"Could not resolve LibGen URL, trying direct download anyway", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record("libgen-resolve-failed", {"url": opts.url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
# Handle GoFile shares with a dedicated resolver before yt-dlp/direct fallbacks
try:
netloc = urlparse(opts.url).netloc.lower()
except Exception:
netloc = ""
if "gofile.io" in netloc:
msg = "GoFile links are currently unsupported"
debug(msg)
if debug_logger is not None:
debug_logger.write_record("gofile-unsupported", {"url": opts.url})
raise DownloadError(msg)
# Determine if yt-dlp should be used
ytdlp_supported = is_url_supported_by_ytdlp(opts.url)
if ytdlp_supported:
probe_result = probe_url(opts.url, no_playlist=opts.no_playlist)
if probe_result is None:
log(f"URL supported by yt-dlp but no media detected, falling back to direct download: {opts.url}")
if debug_logger is not None:
debug_logger.write_record("ytdlp-skip-no-media", {"url": opts.url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
else:
log(f"URL not supported by yt-dlp, trying direct download: {opts.url}")
if debug_logger is not None:
debug_logger.write_record("direct-file-attempt", {"url": opts.url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
_ensure_yt_dlp_ready()
ytdl_options = _build_ytdlp_options(opts)
debug(f"Starting yt-dlp download: {opts.url}")
if debug_logger is not None:
debug_logger.write_record("ytdlp-start", {"url": opts.url})
assert yt_dlp is not None
try:
# Debug: show what options we're using
if ytdl_options.get("download_sections"):
debug(f"[yt-dlp] download_sections: {ytdl_options['download_sections']}")
debug(f"[yt-dlp] force_keyframes_at_cuts: {ytdl_options.get('force_keyframes_at_cuts', False)}")
# Use subprocess when download_sections are present (Python API doesn't support them properly)
session_id = None
first_section_info = {}
if ytdl_options.get("download_sections"):
session_id, first_section_info = _download_with_sections_via_cli(opts.url, ytdl_options, ytdl_options.get("download_sections", []))
info = None
else:
with yt_dlp.YoutubeDL(ytdl_options) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(opts.url, download=True)
except Exception as exc:
log(f"yt-dlp failed: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "yt-dlp",
"error": str(exc),
"traceback": traceback.format_exc(),
},
)
raise DownloadError("yt-dlp download failed") from exc
# If we used subprocess, we need to find the file manually
if info is None:
# Find files created/modified during this download (after we started)
# Look for files matching the expected output template pattern
def _do_probe() -> None:
try:
import glob
import time
import re
_ensure_yt_dlp_ready()
# Get the expected filename pattern from outtmpl
# For sections: "C:\path\{session_id}.section_1_of_3.ext", etc.
# For non-sections: "C:\path\title.ext"
# Wait a moment to ensure files are fully written
time.sleep(0.5)
# List all files in output_dir, sorted by modification time
files = sorted(opts.output_dir.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True)
if not files:
raise FileNotFoundError(f"No files found in {opts.output_dir}")
# If we downloaded sections, look for files with the session_id pattern
if opts.clip_sections and session_id:
# Pattern: "{session_id}_1.ext", "{session_id}_2.ext", etc.
section_pattern = re.compile(rf'^{re.escape(session_id)}_(\d+)\.')
matching_files = [f for f in files if section_pattern.search(f.name)]
if matching_files:
# Sort by section number to ensure correct order
def extract_section_num(path: Path) -> int:
match = section_pattern.search(path.name)
return int(match.group(1)) if match else 999
matching_files.sort(key=extract_section_num)
debug(f"Found {len(matching_files)} section file(s) matching pattern")
# Now rename section files to use hash-based names
# This ensures unique filenames for each section content
renamed_files = []
for idx, section_file in enumerate(matching_files, 1):
try:
# Calculate hash for the file
file_hash = sha256_file(section_file)
ext = section_file.suffix
new_name = f"{file_hash}{ext}"
new_path = opts.output_dir / new_name
if new_path.exists() and new_path != section_file:
# If file with same hash exists, use it and delete the temp one
debug(f"File with hash {file_hash} already exists, using existing file.")
try:
section_file.unlink()
except OSError:
pass
renamed_files.append(new_path)
else:
section_file.rename(new_path)
debug(f"Renamed section file: {section_file.name}{new_name}")
renamed_files.append(new_path)
except Exception as e:
debug(f"Failed to process section file {section_file.name}: {e}")
renamed_files.append(section_file)
media_path = renamed_files[0]
media_paths = renamed_files
debug(f"✓ Downloaded {len(media_paths)} section file(s) (session: {session_id})")
else:
# Fallback to most recent file if pattern not found
media_path = files[0]
media_paths = None
debug(f"✓ Downloaded section file (pattern not found): {media_path.name}")
else:
# No sections, just take the most recent file
media_path = files[0]
media_paths = None
debug(f"✓ Downloaded: {media_path.name}")
if debug_logger is not None:
debug_logger.write_record("ytdlp-file-found", {"path": str(media_path)})
except Exception as exc:
log(f"Error finding downloaded file: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{"phase": "find-file", "error": str(exc)},
)
raise DownloadError(str(exc)) from exc
# Create result with minimal data extracted from filename
file_hash = sha256_file(media_path)
# For section downloads, create tags with the title and build proper info dict
tags = []
title = ''
if first_section_info:
title = first_section_info.get('title', '')
if title:
tags.append(f'title:{title}')
debug(f"Added title tag for section download: {title}")
# Build info dict - always use extracted title if available, not hash
if first_section_info:
info_dict = first_section_info
else:
info_dict = {
"id": media_path.stem,
"title": title or media_path.stem,
"ext": media_path.suffix.lstrip(".")
assert yt_dlp is not None
# Extract info without downloading
# Use extract_flat='in_playlist' to get full metadata for playlist items
ydl_opts = {
"quiet": True, # Suppress all output
"no_warnings": True,
"socket_timeout": 10,
"retries": 2, # Reduce retries for faster timeout
"skip_download": True, # Don't actually download
"extract_flat": "in_playlist", # Get playlist with metadata for each entry
"noprogress": True, # No progress bars
}
return DownloadMediaResult(
path=media_path,
info=info_dict,
tags=tags,
source_url=opts.url,
hash_value=file_hash,
paths=media_paths, # Include all section files if present
)
# Add cookies if available (lazy import to avoid circular dependency)
from hydrus_health_check import get_cookies_file_path # local import
if not isinstance(info, dict):
log(f"Unexpected yt-dlp response: {type(info)}", file=sys.stderr)
raise DownloadError("Unexpected yt-dlp response type")
info_dict: Dict[str, Any] = info
if debug_logger is not None:
debug_logger.write_record(
"ytdlp-info",
{
"keys": sorted(info_dict.keys()),
"is_playlist": bool(info_dict.get("entries")),
},
)
try:
entry, media_path = _resolve_entry_and_path(info_dict, opts.output_dir)
except FileNotFoundError as exc:
log(f"Error: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{"phase": "resolve-path", "error": str(exc)},
)
raise DownloadError(str(exc)) from exc
if debug_logger is not None:
debug_logger.write_record(
"resolved-media",
{"path": str(media_path), "entry_keys": sorted(entry.keys())},
)
# Extract hash from metadata or compute
hash_value = _extract_sha256(entry) or _extract_sha256(info_dict)
if not hash_value:
try:
hash_value = sha256_file(media_path)
except OSError as exc:
if debug_logger is not None:
debug_logger.write_record(
"hash-error",
{"path": str(media_path), "error": str(exc)},
)
# Extract tags using metadata.py
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(entry)
except Exception as e:
log(f"Error extracting tags: {e}", file=sys.stderr)
source_url = (
entry.get("webpage_url")
or entry.get("original_url")
or entry.get("url")
)
debug(f"✓ Downloaded: {media_path.name} ({len(tags)} tags)")
if debug_logger is not None:
debug_logger.write_record(
"downloaded",
{
"path": str(media_path),
"tag_count": len(tags),
"source_url": source_url,
"sha256": hash_value,
},
)
return DownloadMediaResult(
path=media_path,
info=entry,
tags=tags,
source_url=source_url,
hash_value=hash_value,
)
global_cookies = get_cookies_file_path()
if global_cookies:
ydl_opts["cookiefile"] = global_cookies
# Add no_playlist option if specified
if no_playlist:
ydl_opts["noplaylist"] = True
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False)
if not isinstance(info, dict):
result_container[0] = None
return
# Extract relevant fields
result_container[0] = {
"extractor": info.get("extractor", ""),
"title": info.get("title", ""),
"entries": info.get("entries", []), # Will be populated if playlist
"duration": info.get("duration"),
"uploader": info.get("uploader"),
"description": info.get("description"),
"url": url,
}
except Exception as exc:
log(f"Probe error for {url}: {exc}")
result_container[1] = exc
thread = threading.Thread(target=_do_probe, daemon=False)
thread.start()
thread.join(timeout=timeout_seconds)
if thread.is_alive():
# Probe timed out - return None to fall back to direct download
debug(f"Probe timeout for {url} (>={timeout_seconds}s), proceeding with download")
return None
if result_container[1] is not None:
# Probe error - return None to proceed anyway
return None
return cast(Optional[Dict[str, Any]], result_container[0])
__all__ = [
"download_media",
"is_url_supported_by_ytdlp",
"list_formats",
"probe_url",
"DownloadError",
"DownloadOptions",
"DownloadMediaResult",
]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -73,7 +73,7 @@ class HydrusRequestSpec:
class HydrusClient:
"""Thin wrapper around the Hydrus Client API."""
base_url: str
url: str
access_key: str = ""
timeout: float = 60.0
@@ -84,10 +84,10 @@ class HydrusClient:
_session_key: str = field(init=False, default="", repr=False) # Cached session key
def __post_init__(self) -> None:
if not self.base_url:
if not self.url:
raise ValueError("Hydrus base URL is required")
self.base_url = self.base_url.rstrip("/")
parsed = urlsplit(self.base_url)
self.url = self.url.rstrip("/")
parsed = urlsplit(self.url)
if parsed.scheme not in {"http", "https"}:
raise ValueError("Hydrus base URL must use http or https")
self.scheme = parsed.scheme
@@ -374,24 +374,24 @@ class HydrusClient:
hashes = self._ensure_hashes(file_hashes)
if len(hashes) == 1:
body = {"hash": hashes[0], "url_to_add": url}
return self._post("/add_urls/associate_url", data=body)
return self._post("/add_url/associate_url", data=body)
results: dict[str, Any] = {}
for file_hash in hashes:
body = {"hash": file_hash, "url_to_add": url}
results[file_hash] = self._post("/add_urls/associate_url", data=body)
results[file_hash] = self._post("/add_url/associate_url", data=body)
return {"batched": results}
def delete_url(self, file_hashes: Union[str, Iterable[str]], url: str) -> dict[str, Any]:
    """Remove the association between *url* and one or more files.

    Args:
        file_hashes: A single hash or an iterable of hashes; normalised via
            ``self._ensure_hashes``.
        url: The URL to disassociate from each file.

    Returns:
        The raw response for a single hash, or ``{"batched": {hash: response}}``
        when several hashes are given.
    """
    # The Hydrus Client API exposes URL association (both adding and
    # deleting) under /add_urls/associate_url; use it for every request so
    # the single-hash and batched paths behave the same.
    endpoint = "/add_urls/associate_url"
    hashes = self._ensure_hashes(file_hashes)
    if len(hashes) == 1:
        body = {"hash": hashes[0], "url_to_delete": url}
        return self._post(endpoint, data=body)

    results: dict[str, Any] = {}
    for file_hash in hashes:
        body = {"hash": file_hash, "url_to_delete": url}
        results[file_hash] = self._post(endpoint, data=body)
    return {"batched": results}
def set_notes(self, file_hashes: Union[str, Iterable[str]], notes: dict[str, str], service_name: str) -> dict[str, Any]:
@@ -517,7 +517,7 @@ class HydrusClient:
file_ids: Sequence[int] | None = None,
hashes: Sequence[str] | None = None,
include_service_keys_to_tags: bool = True,
include_file_urls: bool = False,
include_file_url: bool = False,
include_duration: bool = True,
include_size: bool = True,
include_mime: bool = False,
@@ -535,7 +535,7 @@ class HydrusClient:
include_service_keys_to_tags,
lambda v: "true" if v else None,
),
("include_file_urls", include_file_urls, lambda v: "true" if v else None),
("include_file_url", include_file_url, lambda v: "true" if v else None),
("include_duration", include_duration, lambda v: "true" if v else None),
("include_size", include_size, lambda v: "true" if v else None),
("include_mime", include_mime, lambda v: "true" if v else None),
@@ -559,13 +559,13 @@ class HydrusClient:
def file_url(self, file_hash: str) -> str:
hash_param = quote(file_hash)
# Don't append access_key parameter for file downloads - use header instead
url = f"{self.base_url}/get_files/file?hash={hash_param}"
url = f"{self.url}/get_files/file?hash={hash_param}"
return url
def thumbnail_url(self, file_hash: str) -> str:
hash_param = quote(file_hash)
# Don't append access_key parameter for file downloads - use header instead
url = f"{self.base_url}/get_files/thumbnail?hash={hash_param}"
url = f"{self.url}/get_files/thumbnail?hash={hash_param}"
return url
@@ -612,7 +612,7 @@ def hydrus_request(args, parser) -> int:
parsed = urlsplit(options.url)
if parsed.scheme not in ('http', 'https'):
parser.error('Only http and https URLs are supported')
parser.error('Only http and https url are supported')
if not parsed.hostname:
parser.error('Invalid Hydrus URL')
@@ -1064,7 +1064,7 @@ def hydrus_export(args, _parser) -> int:
file_hash = getattr(args, 'file_hash', None) or _extract_hash(args.file_url)
if hydrus_url and file_hash:
try:
client = HydrusClient(base_url=hydrus_url, access_key=args.access_key, timeout=args.timeout)
client = HydrusClient(url=hydrus_url, access_key=args.access_key, timeout=args.timeout)
meta_response = client.fetch_file_metadata(hashes=[file_hash], include_mime=True)
entries = meta_response.get('metadata') if isinstance(meta_response, dict) else None
if isinstance(entries, list) and entries:
@@ -1301,8 +1301,7 @@ def is_available(config: dict[str, Any], use_cache: bool = True) -> tuple[bool,
Performs a lightweight probe to verify:
- Hydrus URL is configured
- Hydrus client library is available
- Can connect to Hydrus and retrieve services
- Can connect to Hydrus URL/port
Results are cached per session unless use_cache=False.
@@ -1330,50 +1329,43 @@ def is_available(config: dict[str, Any], use_cache: bool = True) -> tuple[bool,
return False, reason
access_key = get_hydrus_access_key(config, "home") or ""
if not access_key:
reason = "Hydrus access key not configured"
_HYDRUS_AVAILABLE = False
_HYDRUS_UNAVAILABLE_REASON = reason
return False, reason
timeout_raw = config.get("HydrusNetwork_Request_Timeout")
try:
timeout = float(timeout_raw) if timeout_raw is not None else 10.0
timeout = float(timeout_raw) if timeout_raw is not None else 5.0
except (TypeError, ValueError):
timeout = 10.0
timeout = 5.0
try:
# Use HTTPClient directly to avoid session key logic and reduce retries
# This prevents log spam when Hydrus is offline (avoiding 3 retries x 2 requests)
from helper.http_client import HTTPClient
# Simple TCP connection test to URL/port
import socket
from urllib.parse import urlparse
probe_url = f"{url.rstrip('/')}/get_services"
parsed = urlparse(url)
hostname = parsed.hostname or 'localhost'
port = parsed.port or (443 if parsed.scheme == 'https' else 80)
headers = {}
if access_key:
headers["Hydrus-Client-API-Access-Key"] = access_key
# Suppress HTTPClient logging during probe to avoid "Request failed" logs on startup
http_logger = logging.getLogger("helper.http_client")
original_level = http_logger.level
http_logger.setLevel(logging.CRITICAL)
# Try to connect to the host/port
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
try:
# Use retries=1 (single attempt, no retry) to fail fast
with HTTPClient(timeout=timeout, retries=1, headers=headers, verify_ssl=False) as http:
try:
response = http.get(probe_url)
if response.status_code == 200:
_HYDRUS_AVAILABLE = True
_HYDRUS_UNAVAILABLE_REASON = None
return True, None
else:
# Even if we get a 4xx/5xx, the service is "reachable" but maybe auth failed
# But for "availability" we usually mean "usable".
# If auth fails (403), we can't use it, so return False.
reason = f"HTTP {response.status_code}: {response.reason_phrase}"
_HYDRUS_AVAILABLE = False
_HYDRUS_UNAVAILABLE_REASON = reason
return False, reason
except Exception as e:
# This catches connection errors from HTTPClient
raise e
result = sock.connect_ex((hostname, port))
if result == 0:
_HYDRUS_AVAILABLE = True
_HYDRUS_UNAVAILABLE_REASON = None
return True, None
else:
reason = f"Cannot connect to {hostname}:{port}"
_HYDRUS_AVAILABLE = False
_HYDRUS_UNAVAILABLE_REASON = reason
return False, reason
finally:
http_logger.setLevel(original_level)
sock.close()
except Exception as exc:
reason = str(exc)

View File

@@ -2,15 +2,29 @@
import sys
import inspect
import threading
from pathlib import Path
_DEBUG_ENABLED = False
_thread_local = threading.local()
def set_thread_stream(stream):
    """Register *stream* as the output stream used by the calling thread."""
    setattr(_thread_local, 'stream', stream)
def get_thread_stream():
    """Return the calling thread's custom output stream, or None if unset."""
    try:
        return _thread_local.stream
    except AttributeError:
        # No stream has been registered on this thread.
        return None
def set_debug(enabled: bool) -> None:
    """Turn debug logging on or off for the whole module."""
    globals()['_DEBUG_ENABLED'] = enabled
def is_debug_enabled() -> bool:
    """Report the current value of the module-level debug flag."""
    current = _DEBUG_ENABLED
    return current
def debug(*args, **kwargs) -> None:
"""Print debug message if debug logging is enabled.
@@ -18,9 +32,22 @@ def debug(*args, **kwargs) -> None:
"""
if not _DEBUG_ENABLED:
return
# Check if stderr has been redirected to /dev/null (quiet mode)
# If so, skip output to avoid queuing in background worker's capture
try:
stderr_name = getattr(sys.stderr, 'name', '')
if 'nul' in str(stderr_name).lower() or '/dev/null' in str(stderr_name):
return
except Exception:
pass
# Check for thread-local stream first
stream = get_thread_stream()
if stream:
kwargs['file'] = stream
# Set default to stderr for debug messages
if 'file' not in kwargs:
elif 'file' not in kwargs:
kwargs['file'] = sys.stderr
# Prepend DEBUG label
@@ -59,8 +86,12 @@ def log(*args, **kwargs) -> None:
# Get function name
func_name = caller_frame.f_code.co_name
# Check for thread-local stream first
stream = get_thread_stream()
if stream:
kwargs['file'] = stream
# Set default to stdout if not specified
if 'file' not in kwargs:
elif 'file' not in kwargs:
kwargs['file'] = sys.stdout
if add_prefix:

View File

@@ -96,7 +96,7 @@ class MPVfile:
relationship_metadata: Dict[str, Any] = field(default_factory=dict)
tags: List[str] = field(default_factory=list)
original_tags: Dict[str, str] = field(default_factory=dict)
known_urls: List[str] = field(default_factory=list)
url: List[str] = field(default_factory=list)
title: Optional[str] = None
source_url: Optional[str] = None
clip_time: Optional[str] = None
@@ -128,7 +128,7 @@ class MPVfile:
"relationship_metadata": self.relationship_metadata,
"tags": self.tags,
"original_tags": self.original_tags,
"known_urls": self.known_urls,
"url": self.url,
"title": self.title,
"source_url": self.source_url,
"clip_time": self.clip_time,
@@ -293,10 +293,10 @@ class MPVFileBuilder:
if s.tags:
s.original_tags = {tag: tag for tag in s.tags}
# known URLs + last_url
s.known_urls = _normalise_string_list(p.get("known_urls"))
if self.last_url and self.last_url not in s.known_urls:
s.known_urls.append(self.last_url)
# known url + last_url
s.url = _normalise_string_list(p.get("url"))
if self.last_url and self.last_url not in s.url:
s.url.append(self.last_url)
# source URL (explicit or fallback to last_url)
explicit_source = p.get("source_url")
@@ -500,8 +500,8 @@ class MPVFileBuilder:
self._apply_hydrus_result(result)
self.state.type = "hydrus"
matched_url = result.get("matched_url") or result.get("url")
if matched_url and matched_url not in self.state.known_urls:
self.state.known_urls.append(matched_url)
if matched_url and matched_url not in self.state.url:
self.state.url.append(matched_url)
# Enrich relationships once we know the hash
if self.include_relationships and self.state.hash and self.hydrus_settings.base_url:
self._enrich_relationships_from_api(self.state.hash)
@@ -527,7 +527,7 @@ class MPVFileBuilder:
metadata_payload["type"] = "other"
self.state.metadata = metadata_payload
# Do NOT overwrite MPVfile.type with metadata.type
self._merge_known_urls(metadata_payload.get("known_urls") or metadata_payload.get("known_urls_set"))
self._merge_url(metadata_payload.get("url") or metadata_payload.get("url_set"))
source_url = metadata_payload.get("original_url") or metadata_payload.get("source_url")
if source_url and not self.state.source_url:
self.state.source_url = self._normalise_url(source_url)
@@ -722,7 +722,7 @@ class MPVFileBuilder:
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_file_urls=False,
include_file_url=False,
include_mime=False,
)
except HydrusRequestError as hre: # pragma: no cover
@@ -801,11 +801,11 @@ class MPVFileBuilder:
if tag not in self.state.original_tags:
self.state.original_tags[tag] = tag
def _merge_known_urls(self, urls: Optional[Iterable[Any]]) -> None:
if not urls:
def _merge_url(self, url: Optional[Iterable[Any]]) -> None:
if not url:
return
combined = list(self.state.known_urls or []) + _normalise_string_list(urls)
self.state.known_urls = unique_preserve_order(combined)
combined = list(self.state.url or []) + _normalise_string_list(url)
self.state.url = unique_preserve_order(combined)
def _load_sidecar_tags(self, local_path: str) -> None:
try:
@@ -821,7 +821,7 @@ class MPVFileBuilder:
if hash_value and not self.state.hash and _looks_like_hash(hash_value):
self.state.hash = hash_value.lower()
self._merge_tags(tags)
self._merge_known_urls(known)
self._merge_url(known)
break
def _read_sidecar(self, sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]:
@@ -831,7 +831,7 @@ class MPVFileBuilder:
return None, [], []
hash_value: Optional[str] = None
tags: List[str] = []
known_urls: List[str] = []
url: List[str] = []
for line in raw.splitlines():
trimmed = line.strip()
if not trimmed:
@@ -841,13 +841,13 @@ class MPVFileBuilder:
candidate = trimmed.split(":", 1)[1].strip() if ":" in trimmed else ""
if candidate:
hash_value = candidate
elif lowered.startswith("known_url:") or lowered.startswith("url:"):
elif lowered.startswith("url:") or lowered.startswith("url:"):
candidate = trimmed.split(":", 1)[1].strip() if ":" in trimmed else ""
if candidate:
known_urls.append(candidate)
url.append(candidate)
else:
tags.append(trimmed)
return hash_value, tags, known_urls
return hash_value, tags, url
def _compute_local_hash(self, local_path: str) -> None:
try:
@@ -864,8 +864,8 @@ class MPVFileBuilder:
def _finalise(self) -> None:
if self.state.tags:
self.state.tags = unique_preserve_order(self.state.tags)
if self.state.known_urls:
self.state.known_urls = unique_preserve_order(self.state.known_urls)
if self.state.url:
self.state.url = unique_preserve_order(self.state.url)
# Ensure metadata.type is always present for Lua, but do NOT overwrite MPVfile.type
if not self.state.title:
if self.state.metadata.get("title"):

View File

@@ -85,7 +85,7 @@ def _normalize_target(text: Optional[str]) -> Optional[str]:
except Exception:
pass
# Normalize paths/urls for comparison
# Normalize paths/url for comparison
return lower.replace('\\', '\\')

818
helper/provider.py Normal file
View File

@@ -0,0 +1,818 @@
"""Provider interfaces for search and file upload functionality.
This module defines two distinct provider types:
1. SearchProvider: For searching content (books, music, videos, games)
2. FileProvider: For uploading files to hosting services
No legacy code or backwards compatibility - clean, single source of truth.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Tuple
from dataclasses import dataclass, field
from pathlib import Path
import sys
import os
import json
import re
import time
import asyncio
import subprocess
import shutil
import mimetypes
import traceback
import requests
from helper.logger import log, debug
# Optional dependencies
try:
from playwright.sync_api import sync_playwright
PLAYWRIGHT_AVAILABLE = True
except ImportError:
PLAYWRIGHT_AVAILABLE = False
# ============================================================================
# SEARCH PROVIDERS
# ============================================================================
@dataclass
class SearchResult:
    """Provider-agnostic record describing one search hit."""

    origin: str  # Name of the provider that produced the hit ("libgen", "soulseek", ...)
    title: str  # Title or filename shown to the user
    path: str  # Where to fetch the item from (URL, path, magnet, identifier)
    detail: str = ""  # Free-form extra description
    annotations: List[str] = field(default_factory=list)  # Short labels, e.g. ["120MB", "flac"]
    media_kind: str = "other"  # "book", "audio", "video", "game", "magnet", ...
    size_bytes: Optional[int] = None
    tags: set[str] = field(default_factory=set)  # Searchable tags
    columns: List[Tuple[str, str]] = field(default_factory=list)  # (header, value) display pairs
    full_metadata: Dict[str, Any] = field(default_factory=dict)  # Provider-specific extras

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dictionary for pipeline processing.

        ``tags`` and ``columns`` are copied into new lists; every other value
        is passed through by reference.
        """
        field_names = (
            "origin",
            "title",
            "path",
            "detail",
            "annotations",
            "media_kind",
            "size_bytes",
            "tags",
            "columns",
            "full_metadata",
        )
        payload: Dict[str, Any] = {key: getattr(self, key) for key in field_names}
        # Overwrite in place so key order is preserved.
        payload["tags"] = list(self.tags)
        payload["columns"] = list(self.columns)
        return payload
class SearchProvider(ABC):
    """Abstract base class for search providers.

    Subclasses implement :meth:`search`; :meth:`validate` may be overridden
    to report whether the provider's dependencies/configuration are present.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the configuration and derive the provider name.

        Args:
            config: Optional configuration mapping; a falsy value becomes ``{}``.
        """
        self.config = config or {}
        # Provider name is the lowercase class name (e.g. "libgen").
        self.name = self.__class__.__name__.lower()

    @abstractmethod
    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[SearchResult]:
        """Search for items matching the query.

        Args:
            query: Search query string
            limit: Maximum results to return
            filters: Optional filtering criteria
            **kwargs: Provider-specific arguments

        Returns:
            List of SearchResult objects
        """
        pass

    def validate(self) -> bool:
        """Check if provider is available and properly configured."""
        return True
class Libgen(SearchProvider):
    """Search provider backed by Library Genesis (books)."""

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[SearchResult]:
        """Query Libgen and return one SearchResult per book found.

        The query is parsed for ``isbn``/``title``/``author`` fields; the first
        non-empty one (in that priority), then the free text, then the raw
        query is used as the search term. Any exception is logged to stderr
        and yields an empty list.
        """
        filters = filters or {}

        try:
            from helper.unified_book_downloader import UnifiedBookDownloader
            from helper.query_parser import parse_query, get_field, get_free_text

            parsed = parse_query(query)
            isbn_term = get_field(parsed, 'isbn')
            author_term = get_field(parsed, 'author')
            title_term = get_field(parsed, 'title')
            free_text = get_free_text(parsed)

            # First truthy candidate wins; fall back to the raw query.
            search_query = next(
                (term for term in (isbn_term, title_term, author_term, free_text) if term),
                query,
            )

            hits = UnifiedBookDownloader(config=self.config).search_libgen(search_query, limit=limit)
            return [self._to_result(number, book) for number, book in enumerate(hits, 1)]

        except Exception as e:
            log(f"[libgen] Search error: {e}", file=sys.stderr)
            return []

    @staticmethod
    def _to_result(number: int, book: Dict[str, Any]) -> SearchResult:
        """Convert one book dictionary returned by the downloader into a SearchResult."""
        title = book.get("title", "Unknown")
        author = book.get("author", "Unknown")
        year = book.get("year", "Unknown")
        pages = book.get("pages") or book.get("pages_str") or ""
        extension = book.get("extension", "") or book.get("ext", "")
        filesize = book.get("filesize_str", "Unknown")
        isbn = book.get("isbn", "")
        mirror_url = book.get("mirror_url", "")

        detail = f"By: {author}"
        if year and year != "Unknown":
            detail += f" ({year})"

        annotations = [f"{filesize}"]
        if isbn:
            annotations.append(f"ISBN: {isbn}")

        return SearchResult(
            origin="libgen",
            title=title,
            path=mirror_url or f"libgen:{book.get('id', '')}",
            detail=detail,
            annotations=annotations,
            media_kind="book",
            columns=[
                ("Title", title),
                ("Author", author),
                ("Pages", str(pages)),
                ("Ext", str(extension)),
            ],
            full_metadata={
                "number": number,
                "author": author,
                "year": year,
                "isbn": isbn,
                "filesize": filesize,
                "pages": pages,
                "extension": extension,
                "book_id": book.get("book_id", ""),
                "md5": book.get("md5", ""),
            },
        )

    def validate(self) -> bool:
        """Return True when the unified book downloader module can be imported."""
        try:
            from helper.unified_book_downloader import UnifiedBookDownloader
        except Exception:
            return False
        return True
class Soulseek(SearchProvider):
    """Search provider for the Soulseek P2P network (music files only)."""

    # File extensions treated as music; everything else is dropped from results.
    MUSIC_EXTENSIONS = {
        '.flac', '.mp3', '.m4a', '.aac', '.ogg', '.opus',
        '.wav', '.alac', '.wma', '.ape', '.aiff', '.dsf',
        '.dff', '.wv', '.tta', '.tak', '.ac3', '.dts'
    }

    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration instead.
    USERNAME = "asjhkjljhkjfdsd334"
    PASSWORD = "khhhg"
    DOWNLOAD_DIR = "./downloads"
    MAX_WAIT_TRANSFER = 1200

    async def perform_search(
        self,
        query: str,
        timeout: float = 9.0,
        limit: int = 50
    ) -> List[Dict[str, Any]]:
        """Log in to Soulseek, run one search and return flattened file hits.

        Args:
            query: Search string sent to the network.
            timeout: Seconds to keep collecting results.
            limit: Maximum number of flattened entries returned.

        Returns:
            A list of dicts with ``file``, ``username``, ``filename`` and
            ``size`` keys; empty on login or search failure (the error is
            logged to stderr).
        """
        from aioslsk.client import SoulSeekClient
        from aioslsk.settings import Settings, CredentialsSettings

        os.makedirs(self.DOWNLOAD_DIR, exist_ok=True)

        settings = Settings(credentials=CredentialsSettings(username=self.USERNAME, password=self.PASSWORD))
        client = SoulSeekClient(settings)

        # A single try/finally guarantees the client is stopped on every path,
        # including when start() succeeds but login() fails.
        try:
            try:
                await client.start()
                await client.login()
            except Exception as e:
                log(f"[soulseek] Login failed: {type(e).__name__}: {e}", file=sys.stderr)
                return []

            try:
                search_request = await client.searches.search(query)
                await self._collect_results(client, search_request, timeout=timeout)
                return self._flatten_results(search_request)[:limit]
            except Exception as e:
                log(f"[soulseek] Search error: {type(e).__name__}: {e}", file=sys.stderr)
                return []
        finally:
            # Best-effort shutdown; failures while stopping are ignored.
            try:
                await client.stop()
            except Exception:
                pass

    def _flatten_results(self, search_request) -> List[dict]:
        """Flatten per-user results into one list of file entries.

        Both ``shared_items`` and ``locked_results`` of every result are
        included, shared items first.
        """
        flat = []
        for result in search_request.results:
            username = getattr(result, "username", "?")

            for attr in ("shared_items", "locked_results"):
                for file_data in getattr(result, attr, []):
                    flat.append({
                        "file": file_data,
                        "username": username,
                        "filename": getattr(file_data, "filename", "?"),
                        "size": getattr(file_data, "filesize", 0),
                    })

        return flat

    async def _collect_results(self, client, search_request, timeout: float = 75.0) -> None:
        """Wait *timeout* seconds for results, logging whenever the count grows.

        Polls ``search_request.results`` every 0.5 s; it always waits for the
        full timeout.
        """
        end = time.time() + timeout
        last_count = 0
        while time.time() < end:
            current_count = len(search_request.results)
            if current_count > last_count:
                debug(f"[soulseek] Got {current_count} result(s)...")
                last_count = current_count
            await asyncio.sleep(0.5)

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[SearchResult]:
        """Search Soulseek for music files and return them as SearchResults.

        Args:
            query: Search string.
            limit: Maximum number of raw hits fetched from the network.
            filters: Optional ``artist``/``album``/``track`` substrings
                (case-insensitive) that each result must contain.
            **kwargs: Ignored.

        Returns:
            Results sorted with FLAC files first, then by descending size.
            Any exception is logged to stderr and yields an empty list.
        """
        filters = filters or {}

        try:
            flat_results = asyncio.run(self.perform_search(query, timeout=9.0, limit=limit))

            if not flat_results:
                return []

            # Filter to music files only
            music_results = []
            for item in flat_results:
                filename = item['filename']
                ext = '.' + filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
                if ext in self.MUSIC_EXTENSIONS:
                    music_results.append(item)

            if not music_results:
                return []

            # Extract metadata
            enriched_results = []
            for item in music_results:
                filename = item['filename']
                ext = '.' + filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''

                # Get display filename (last path component)
                display_name = filename.split('\\')[-1] if '\\' in filename else filename.split('/')[-1] if '/' in filename else filename

                # Extract path hierarchy: .../<artist>/<album>/<file>
                path_parts = filename.replace('\\', '/').split('/')
                artist = path_parts[-3] if len(path_parts) >= 3 else ''
                album = path_parts[-2] if len(path_parts) >= 2 else ''

                # Extract track number and title from "NN - Artist - Title" style names
                base_name = display_name.rsplit('.', 1)[0] if '.' in display_name else display_name
                track_num = ''
                title = base_name
                filename_artist = ''

                match = re.match(r'^(\d{1,3})\s*[\.\-]?\s+(.+)$', base_name)
                if match:
                    track_num = match.group(1)
                    rest = match.group(2)
                    if ' - ' in rest:
                        filename_artist, title = rest.split(' - ', 1)
                    else:
                        title = rest

                # An artist embedded in the filename overrides the directory name.
                if filename_artist:
                    artist = filename_artist

                enriched_results.append({
                    **item,
                    'artist': artist,
                    'album': album,
                    'title': title,
                    'track_num': track_num,
                    'ext': ext
                })

            # Apply filters
            if filters:
                artist_filter = filters.get('artist', '').lower() if filters.get('artist') else ''
                album_filter = filters.get('album', '').lower() if filters.get('album') else ''
                track_filter = filters.get('track', '').lower() if filters.get('track') else ''

                if artist_filter or album_filter or track_filter:
                    filtered = []
                    for item in enriched_results:
                        if artist_filter and artist_filter not in item['artist'].lower():
                            continue
                        if album_filter and album_filter not in item['album'].lower():
                            continue
                        if track_filter and track_filter not in item['title'].lower():
                            continue
                        filtered.append(item)
                    enriched_results = filtered

            # Sort: .flac first, then by size
            enriched_results.sort(key=lambda item: (item['ext'].lower() != '.flac', -item['size']))

            # Convert to SearchResult
            results = []
            for idx, item in enumerate(enriched_results, 1):
                artist_display = item['artist'] if item['artist'] else "(no artist)"
                album_display = item['album'] if item['album'] else "(no album)"
                size_mb = int(item['size'] / 1024 / 1024)

                columns = [
                    ("Track", item['track_num'] or "?"),
                    ("Title", item['title'][:40]),
                    ("Artist", artist_display[:32]),
                    ("Album", album_display[:32]),
                    ("Size", f"{size_mb} MB"),
                ]

                results.append(SearchResult(
                    origin="soulseek",
                    title=item['title'],
                    path=item['filename'],
                    detail=f"{artist_display} - {album_display}",
                    annotations=[f"{size_mb} MB", item['ext'].lstrip('.').upper()],
                    media_kind="audio",
                    size_bytes=item['size'],
                    columns=columns,
                    full_metadata={
                        "username": item['username'],
                        "filename": item['filename'],
                        "artist": item['artist'],
                        "album": item['album'],
                        "track_num": item['track_num'],
                        "ext": item['ext'],
                    },
                ))

            return results

        except Exception as e:
            log(f"[soulseek] Search error: {e}", file=sys.stderr)
            return []

    def validate(self) -> bool:
        """Return True when the ``aioslsk`` client library can be imported."""
        try:
            from aioslsk.client import SoulSeekClient
            return True
        except ImportError:
            return False
class Bandcamp(SearchProvider):
    """Search provider for Bandcamp (scrapes the public search page via Playwright)."""

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[SearchResult]:
        """Search Bandcamp and return the scraped results.

        A query starting with ``artist:`` searches with ``item_type=b``;
        anything else searches with ``item_type=a``.

        Args:
            query: Search string, optionally prefixed with ``artist:``.
            limit: Maximum number of results to return.
            filters: Unused.
            **kwargs: Ignored.

        Returns:
            A list of SearchResult objects; empty when Playwright is missing
            or any error occurs (the error is logged to stderr).
        """
        if not PLAYWRIGHT_AVAILABLE:
            log("[bandcamp] Playwright not available. Install with: pip install playwright", file=sys.stderr)
            return []

        from urllib.parse import quote_plus

        # Strip once so the prefix test and the slice operate on the same text.
        cleaned = query.strip()
        prefix = "artist:"
        if cleaned.lower().startswith(prefix):
            artist_name = cleaned[len(prefix):].strip().strip('"')
            search_url = f"https://bandcamp.com/search?q={quote_plus(artist_name)}&item_type=b"
        else:
            # URL-encode so characters such as '&', '#' or spaces cannot
            # corrupt the query string.
            search_url = f"https://bandcamp.com/search?q={quote_plus(cleaned)}&item_type=a"

        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                try:
                    page = browser.new_page()
                    return self._scrape_url(page, search_url, limit)
                finally:
                    # Close the browser even when scraping raises.
                    browser.close()
        except Exception as e:
            log(f"[bandcamp] Search error: {e}", file=sys.stderr)
            return []

    def _scrape_url(self, page, url: str, limit: int) -> List[SearchResult]:
        """Load *url* in *page* and convert up to *limit* ``.searchresult`` nodes.

        Items without a ``.heading`` link are skipped; per-item parsing errors
        are logged via ``debug`` and skipped.
        """
        debug(f"[bandcamp] Scraping: {url}")

        page.goto(url)
        page.wait_for_load_state("domcontentloaded")

        results = []

        # Check for search results
        search_results = page.query_selector_all(".searchresult")
        if search_results:
            for item in search_results[:limit]:
                try:
                    heading = item.query_selector(".heading")
                    if not heading:
                        continue

                    link = heading.query_selector("a")
                    if not link:
                        continue

                    title = link.inner_text().strip()
                    target_url = link.get_attribute("href")

                    subhead = item.query_selector(".subhead")
                    artist = subhead.inner_text().strip() if subhead else "Unknown"

                    itemtype = item.query_selector(".itemtype")
                    media_type = itemtype.inner_text().strip() if itemtype else "album"

                    results.append(SearchResult(
                        origin="bandcamp",
                        title=title,
                        path=target_url,
                        detail=f"By: {artist}",
                        annotations=[media_type],
                        media_kind="audio",
                        columns=[
                            ("Name", title),
                            ("Artist", artist),
                            ("Type", media_type),
                        ],
                        full_metadata={
                            "artist": artist,
                            "type": media_type,
                        },
                    ))
                except Exception as e:
                    debug(f"[bandcamp] Error parsing result: {e}")
                    continue

        return results

    def validate(self) -> bool:
        """Return True when Playwright was importable at module load."""
        return PLAYWRIGHT_AVAILABLE
class YouTube(SearchProvider):
    """Search provider for YouTube using the ``yt-dlp`` command-line tool."""

    def search(
        self,
        query: str,
        limit: int = 10,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[SearchResult]:
        """Run ``yt-dlp`` with a ``ytsearch<limit>:`` query and parse its JSON lines.

        Args:
            query: Search string.
            limit: Number of results requested from yt-dlp.
            filters: Unused.
            **kwargs: Ignored.

        Returns:
            One SearchResult per parsable JSON line; empty when yt-dlp is not
            on PATH, exits non-zero, or an error occurs (logged to stderr).
        """
        ytdlp_path = shutil.which("yt-dlp")
        if not ytdlp_path:
            log("[youtube] yt-dlp not found in PATH", file=sys.stderr)
            return []

        search_query = f"ytsearch{limit}:{query}"

        cmd = [
            ytdlp_path,
            "--dump-json",
            "--flat-playlist",
            "--no-warnings",
            search_query
        ]

        try:
            process = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace"
            )

            if process.returncode != 0:
                log(f"[youtube] yt-dlp failed: {process.stderr}", file=sys.stderr)
                return []

            results = []
            for line in process.stdout.splitlines():
                if not line.strip():
                    continue
                try:
                    video_data = json.loads(line)
                    # Use `or` so explicit JSON nulls fall back to the same
                    # defaults as missing keys.
                    title = video_data.get("title") or "Unknown"
                    video_id = video_data.get("id") or ""
                    url = video_data.get("url") or f"https://youtube.com/watch?v={video_id}"
                    uploader = video_data.get("uploader") or "Unknown"
                    duration = video_data.get("duration") or 0
                    view_count = video_data.get("view_count") or 0

                    duration_str = f"{int(duration//60)}:{int(duration%60):02d}" if duration else ""
                    views_str = f"{view_count:,}" if view_count else ""

                    # Only annotate with values that are actually known.
                    annotations = []
                    if duration_str:
                        annotations.append(duration_str)
                    if views_str:
                        annotations.append(f"{views_str} views")

                    results.append(SearchResult(
                        origin="youtube",
                        title=title,
                        path=url,
                        detail=f"By: {uploader}",
                        annotations=annotations,
                        media_kind="video",
                        columns=[
                            ("Title", title),
                            ("Uploader", uploader),
                            ("Duration", duration_str),
                            ("Views", views_str),
                        ],
                        full_metadata={
                            "video_id": video_id,
                            "uploader": uploader,
                            "duration": duration,
                            "view_count": view_count,
                        },
                    ))
                except json.JSONDecodeError:
                    # Skip lines that are not valid JSON.
                    continue

            return results

        except Exception as e:
            log(f"[youtube] Error: {e}", file=sys.stderr)
            return []

    def validate(self) -> bool:
        """Return True when ``yt-dlp`` is found on PATH."""
        return shutil.which("yt-dlp") is not None

    def pipe(self, path: str, config: Optional[Dict[str, Any]] = None) -> Optional[str]:
        """Return the playable URL for MPV (just the path for YouTube)."""
        return path
# Search provider registry: maps the lowercase provider name to the
# SearchProvider subclass that implements it.
_SEARCH_PROVIDERS = {
    "libgen": Libgen,
    "soulseek": Soulseek,
    "bandcamp": Bandcamp,
    "youtube": YouTube,
}
def get_search_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[SearchProvider]:
    """Instantiate the search provider registered under *name*.

    The lookup is case-insensitive. Returns ``None`` (after logging to stderr)
    when the name is unknown, the provider fails validation, or construction
    raises.
    """
    factory = _SEARCH_PROVIDERS.get(name.lower())

    if factory is not None:
        try:
            instance = factory(config)
            if instance.validate():
                return instance
            log(f"[provider] Provider '{name}' is not available", file=sys.stderr)
        except Exception as e:
            log(f"[provider] Error initializing '{name}': {e}", file=sys.stderr)
        return None

    log(f"[provider] Unknown search provider: {name}", file=sys.stderr)
    return None
def list_search_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
    """Map every registered search provider name to its ``validate()`` result.

    A provider whose construction or validation raises is reported as
    unavailable (``False``).
    """

    def _probe(provider_class) -> bool:
        try:
            return provider_class(config).validate()
        except Exception:
            return False

    return {name: _probe(provider_class) for name, provider_class in _SEARCH_PROVIDERS.items()}
# ============================================================================
# FILE PROVIDERS
# ============================================================================
class FileProvider(ABC):
    """Abstract base for providers that upload a file and hand back a URL."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Keep the configuration (``{}`` when falsy) and derive the provider name."""
        self.config = config if config else {}
        # Name is the lowercase class name.
        self.name = type(self).__name__.lower()

    @abstractmethod
    def upload(self, file_path: str, **kwargs: Any) -> str:
        """Upload the file at *file_path* and return its URL."""
        ...

    def validate(self) -> bool:
        """Report whether the provider is available/configured (default: True)."""
        return True
class ZeroXZero(FileProvider):
    """File provider that uploads to 0x0.st."""

    def upload(self, file_path: str, **kwargs: Any) -> str:
        """POST the file to 0x0.st and return the stripped response text.

        Raises:
            FileNotFoundError: If *file_path* does not exist.
            Exception: If the server answers with a non-200 status; any error
                is logged to stderr and re-raised.
        """
        from helper.http_client import HTTPClient

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            request_headers = {"User-Agent": "Medeia-Macina/1.0"}
            with HTTPClient(headers=request_headers) as client:
                with open(file_path, 'rb') as handle:
                    response = client.post(
                        "https://0x0.st",
                        files={"file": handle}
                    )

                if response.status_code != 200:
                    raise Exception(f"Upload failed: {response.status_code} - {response.text}")
                return response.text.strip()

        except Exception as e:
            log(f"[0x0] Upload error: {e}", file=sys.stderr)
            raise

    def validate(self) -> bool:
        """Always available; no configuration is checked."""
        return True
class Matrix(FileProvider):
    """File provider for Matrix (Element) chat rooms."""

    def validate(self) -> bool:
        """Return True when ``storage.matrix`` has a homeserver, room id and a credential.

        Either ``access_token`` or ``password`` satisfies the credential check.
        """
        if not self.config:
            return False
        matrix_conf = self.config.get('storage', {}).get('matrix', {})
        return bool(
            matrix_conf.get('homeserver') and
            matrix_conf.get('room_id') and
            (matrix_conf.get('access_token') or matrix_conf.get('password'))
        )

    def upload(self, file_path: str, **kwargs: Any) -> str:
        """Upload a file to the homeserver's media repo and post it to the room.

        Args:
            file_path: Path of the file to upload.
            **kwargs: Ignored.

        Returns:
            A ``https://matrix.to/#/<room_id>/<event_id>`` link to the message.

        Raises:
            FileNotFoundError: If *file_path* does not exist.
            Exception: If required configuration is missing, or the upload or
                the message request does not return HTTP 200.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        matrix_conf = self.config.get('storage', {}).get('matrix', {})
        homeserver = matrix_conf.get('homeserver')
        access_token = matrix_conf.get('access_token')
        room_id = matrix_conf.get('room_id')

        # Fail with a clear message instead of an AttributeError on a missing
        # homeserver or an "Authorization: Bearer None" request.
        if not (homeserver and room_id and access_token):
            raise Exception("Matrix upload requires homeserver, room_id and access_token")

        if not homeserver.startswith('http'):
            homeserver = f"https://{homeserver}"

        filename = path.name
        mime_type, _ = mimetypes.guess_type(filename)

        auth_headers = {"Authorization": f"Bearer {access_token}"}

        # Upload media
        upload_url = f"{homeserver}/_matrix/media/v3/upload"
        upload_headers = {
            **auth_headers,
            "Content-Type": mime_type or "application/octet-stream",
        }

        with open(path, 'rb') as f:
            resp = requests.post(upload_url, headers=upload_headers, data=f, params={"filename": filename})

        if resp.status_code != 200:
            raise Exception(f"Matrix upload failed: {resp.text}")

        content_uri = resp.json().get('content_uri')
        if not content_uri:
            raise Exception("No content_uri returned")

        # Send message
        send_url = f"{homeserver}/_matrix/client/v3/rooms/{room_id}/send/m.room.message"

        # Determine message type from the file extension
        msgtype = "m.file"
        ext = path.suffix.lower()

        AUDIO_EXTS = {'.mp3', '.flac', '.wav', '.m4a', '.aac', '.ogg', '.opus', '.wma', '.mka', '.alac'}
        VIDEO_EXTS = {'.mp4', '.mkv', '.webm', '.mov', '.avi', '.flv', '.mpg', '.mpeg', '.ts', '.m4v', '.wmv'}
        IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff'}

        if ext in AUDIO_EXTS:
            msgtype = "m.audio"
        elif ext in VIDEO_EXTS:
            msgtype = "m.video"
        elif ext in IMAGE_EXTS:
            msgtype = "m.image"

        info: Dict[str, Any] = {"size": path.stat().st_size}
        if mime_type:
            # Omit the key rather than sending a JSON null.
            info["mimetype"] = mime_type

        payload = {
            "msgtype": msgtype,
            "body": filename,
            "url": content_uri,
            "info": info
        }

        # Use only the auth header here: the media Content-Type must not be
        # reused, otherwise the JSON body is sent labelled as e.g. audio/flac.
        resp = requests.post(send_url, headers=auth_headers, json=payload)
        if resp.status_code != 200:
            raise Exception(f"Matrix send message failed: {resp.text}")

        event_id = resp.json().get('event_id')
        return f"https://matrix.to/#/{room_id}/{event_id}"
# File provider registry: maps the lower-case name accepted by
# get_file_provider() to the provider class that gets instantiated.
_FILE_PROVIDERS = {
    "0x0": ZeroXZero,  # uploads to 0x0.st
    "matrix": Matrix,  # uploads to a Matrix chat room
}
def get_file_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[FileProvider]:
    """Look up, build and validate the file provider registered under ``name``.

    Args:
        name: Registry key; matched case-insensitively.
        config: Optional configuration mapping passed to the provider constructor.

    Returns:
        A ready provider instance, or ``None`` when the name is unknown, the
        provider fails ``validate()``, or construction/validation raises.  Each
        failure is logged to stderr.
    """
    factory = _FILE_PROVIDERS.get(name.lower())
    if factory is None:
        log(f"[provider] Unknown file provider: {name}", file=sys.stderr)
        return None

    try:
        instance = factory(config)
        if instance.validate():
            return instance
        log(f"[provider] File provider '{name}' is not available", file=sys.stderr)
        return None
    except Exception as exc:
        log(f"[provider] Error initializing file provider '{name}': {exc}", file=sys.stderr)
        return None
def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
    """Report which registered file providers can currently be used.

    Every class in ``_FILE_PROVIDERS`` is instantiated with ``config`` and asked
    to ``validate()``.  A provider whose constructor or validation raises is
    reported as unavailable instead of aborting the whole listing.

    Args:
        config: Optional configuration mapping handed to each provider's constructor.

    Returns:
        Mapping of registry name to the provider's ``validate()`` result, or
        ``False`` when the provider raised.
    """

    def _probe(factory: Any) -> bool:
        # Best-effort probe: any failure just marks the provider as unavailable.
        try:
            return factory(config).validate()
        except Exception:
            return False

    return {key: _probe(factory) for key, factory in _FILE_PROVIDERS.items()}

View File

@@ -159,8 +159,8 @@ def create_app():
status["storage_path"] = str(STORAGE_PATH)
status["storage_exists"] = STORAGE_PATH.exists()
try:
from helper.local_library import LocalLibraryDB
with LocalLibraryDB(STORAGE_PATH) as db:
from helper.folder_store import FolderDB
with FolderDB(STORAGE_PATH) as db:
status["database_accessible"] = True
except Exception as e:
status["database_accessible"] = False
@@ -177,7 +177,7 @@ def create_app():
@require_storage()
def search_files():
"""Search for files by name or tag."""
from helper.local_library import LocalLibrarySearchOptimizer
from helper.folder_store import LocalLibrarySearchOptimizer
query = request.args.get('q', '')
limit = request.args.get('limit', 100, type=int)
@@ -205,11 +205,11 @@ def create_app():
@require_storage()
def get_file_metadata(file_hash: str):
"""Get metadata for a specific file by hash."""
from helper.local_library import LocalLibraryDB
from helper.folder_store import FolderDB
try:
with LocalLibraryDB(STORAGE_PATH) as db:
file_path = db.search_by_hash(file_hash)
with FolderDB(STORAGE_PATH) as db:
file_path = db.search_hash(file_hash)
if not file_path or not file_path.exists():
return jsonify({"error": "File not found"}), 404
@@ -233,13 +233,13 @@ def create_app():
@require_storage()
def index_file():
"""Index a new file in the storage."""
from helper.local_library import LocalLibraryDB
from helper.folder_store import FolderDB
from helper.utils import sha256_file
data = request.get_json() or {}
file_path_str = data.get('path')
tags = data.get('tags', [])
urls = data.get('urls', [])
url = data.get('url', [])
if not file_path_str:
return jsonify({"error": "File path required"}), 400
@@ -250,14 +250,14 @@ def create_app():
if not file_path.exists():
return jsonify({"error": "File does not exist"}), 404
with LocalLibraryDB(STORAGE_PATH) as db:
with FolderDB(STORAGE_PATH) as db:
db.get_or_create_file_entry(file_path)
if tags:
db.add_tags(file_path, tags)
if urls:
db.add_known_urls(file_path, urls)
if url:
db.add_url(file_path, url)
file_hash = sha256_file(file_path)
@@ -265,7 +265,7 @@ def create_app():
"hash": file_hash,
"path": str(file_path),
"tags_added": len(tags),
"urls_added": len(urls)
"url_added": len(url)
}), 201
except Exception as e:
logger.error(f"Index error: {e}", exc_info=True)
@@ -280,11 +280,11 @@ def create_app():
@require_storage()
def get_tags(file_hash: str):
"""Get tags for a file."""
from helper.local_library import LocalLibraryDB
from helper.folder_store import FolderDB
try:
with LocalLibraryDB(STORAGE_PATH) as db:
file_path = db.search_by_hash(file_hash)
with FolderDB(STORAGE_PATH) as db:
file_path = db.search_hash(file_hash)
if not file_path:
return jsonify({"error": "File not found"}), 404
@@ -299,7 +299,7 @@ def create_app():
@require_storage()
def add_tags(file_hash: str):
"""Add tags to a file."""
from helper.local_library import LocalLibraryDB
from helper.folder_store import FolderDB
data = request.get_json() or {}
tags = data.get('tags', [])
@@ -309,8 +309,8 @@ def create_app():
return jsonify({"error": "Tags required"}), 400
try:
with LocalLibraryDB(STORAGE_PATH) as db:
file_path = db.search_by_hash(file_hash)
with FolderDB(STORAGE_PATH) as db:
file_path = db.search_hash(file_hash)
if not file_path:
return jsonify({"error": "File not found"}), 404
@@ -328,13 +328,13 @@ def create_app():
@require_storage()
def remove_tags(file_hash: str):
"""Remove tags from a file."""
from helper.local_library import LocalLibraryDB
from helper.folder_store import FolderDB
tags_str = request.args.get('tags', '')
try:
with LocalLibraryDB(STORAGE_PATH) as db:
file_path = db.search_by_hash(file_hash)
with FolderDB(STORAGE_PATH) as db:
file_path = db.search_hash(file_hash)
if not file_path:
return jsonify({"error": "File not found"}), 404
@@ -358,11 +358,11 @@ def create_app():
@require_storage()
def get_relationships(file_hash: str):
"""Get relationships for a file."""
from helper.local_library import LocalLibraryDB
from helper.folder_store import FolderDB
try:
with LocalLibraryDB(STORAGE_PATH) as db:
file_path = db.search_by_hash(file_hash)
with FolderDB(STORAGE_PATH) as db:
file_path = db.search_hash(file_hash)
if not file_path:
return jsonify({"error": "File not found"}), 404
@@ -378,7 +378,7 @@ def create_app():
@require_storage()
def set_relationship():
"""Set a relationship between two files."""
from helper.local_library import LocalLibraryDB
from helper.folder_store import FolderDB
data = request.get_json() or {}
from_hash = data.get('from_hash')
@@ -389,9 +389,9 @@ def create_app():
return jsonify({"error": "from_hash and to_hash required"}), 400
try:
with LocalLibraryDB(STORAGE_PATH) as db:
from_path = db.search_by_hash(from_hash)
to_path = db.search_by_hash(to_hash)
with FolderDB(STORAGE_PATH) as db:
from_path = db.search_hash(from_hash)
to_path = db.search_hash(to_hash)
if not from_path or not to_path:
return jsonify({"error": "File not found"}), 404
@@ -406,49 +406,49 @@ def create_app():
# URL OPERATIONS
# ========================================================================
@app.route('/urls/<file_hash>', methods=['GET'])
@app.route('/url/<file_hash>', methods=['GET'])
@require_auth()
@require_storage()
def get_urls(file_hash: str):
"""Get known URLs for a file."""
from helper.local_library import LocalLibraryDB
def get_url(file_hash: str):
"""Get known url for a file."""
from helper.folder_store import FolderDB
try:
with LocalLibraryDB(STORAGE_PATH) as db:
file_path = db.search_by_hash(file_hash)
with FolderDB(STORAGE_PATH) as db:
file_path = db.search_hash(file_hash)
if not file_path:
return jsonify({"error": "File not found"}), 404
metadata = db.get_metadata(file_path)
urls = metadata.get('known_urls', []) if metadata else []
return jsonify({"hash": file_hash, "urls": urls}), 200
url = metadata.get('url', []) if metadata else []
return jsonify({"hash": file_hash, "url": url}), 200
except Exception as e:
logger.error(f"Get URLs error: {e}", exc_info=True)
logger.error(f"Get url error: {e}", exc_info=True)
return jsonify({"error": f"Failed: {str(e)}"}), 500
@app.route('/urls/<file_hash>', methods=['POST'])
@app.route('/url/<file_hash>', methods=['POST'])
@require_auth()
@require_storage()
def add_urls(file_hash: str):
"""Add URLs to a file."""
from helper.local_library import LocalLibraryDB
def add_url(file_hash: str):
"""Add url to a file."""
from helper.folder_store import FolderDB
data = request.get_json() or {}
urls = data.get('urls', [])
url = data.get('url', [])
if not urls:
return jsonify({"error": "URLs required"}), 400
if not url:
return jsonify({"error": "url required"}), 400
try:
with LocalLibraryDB(STORAGE_PATH) as db:
file_path = db.search_by_hash(file_hash)
with FolderDB(STORAGE_PATH) as db:
file_path = db.search_hash(file_hash)
if not file_path:
return jsonify({"error": "File not found"}), 404
db.add_known_urls(file_path, urls)
return jsonify({"hash": file_hash, "urls_added": len(urls)}), 200
db.add_url(file_path, url)
return jsonify({"hash": file_hash, "url_added": len(url)}), 200
except Exception as e:
logger.error(f"Add URLs error: {e}", exc_info=True)
logger.error(f"Add url error: {e}", exc_info=True)
return jsonify({"error": f"Failed: {str(e)}"}), 500
return app
@@ -509,8 +509,8 @@ def main():
print(f"\n{'='*70}\n")
try:
from helper.local_library import LocalLibraryDB
with LocalLibraryDB(STORAGE_PATH) as db:
from helper.folder_store import FolderDB
with FolderDB(STORAGE_PATH) as db:
logger.info("Database initialized successfully")
except Exception as e:
logger.error(f"Failed to initialize database: {e}")

File diff suppressed because it is too large Load Diff

2268
helper/store.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -555,7 +555,7 @@ class UnifiedBookDownloader:
This follows the exact process from archive_client.py:
1. Login with credentials
2. Call loan() to create 14-day borrow
3. Get book info (extract page URLs)
3. Get book info (extract page url)
4. Download all pages as images
5. Merge images into searchable PDF
@@ -576,10 +576,10 @@ class UnifiedBookDownloader:
# If we get here, borrowing succeeded
logger.info(f"[UnifiedBookDownloader] Successfully borrowed book: {book_id}")
# Now get the book info (page URLs and metadata)
# Now get the book info (page url and metadata)
logger.info(f"[UnifiedBookDownloader] Extracting book page information...")
# Try both URL formats: with /borrow and without
book_urls = [
book_url = [
f"https://archive.org/borrow/{book_id}", # Try borrow page first (for borrowed books)
f"https://archive.org/details/{book_id}" # Fallback to details page
]
@@ -589,7 +589,7 @@ class UnifiedBookDownloader:
metadata = None
last_error = None
for book_url in book_urls:
for book_url in book_url:
try:
logger.debug(f"[UnifiedBookDownloader] Trying to get book info from: {book_url}")
response = session.get(book_url, timeout=10)
@@ -611,7 +611,7 @@ class UnifiedBookDownloader:
continue
if links is None:
logger.error(f"[UnifiedBookDownloader] Failed to get book info from all URLs: {last_error}")
logger.error(f"[UnifiedBookDownloader] Failed to get book info from all url: {last_error}")
# Borrow extraction failed - return False
return False, "Could not extract borrowed book pages"

View File

@@ -308,7 +308,7 @@ def format_metadata_value(key: str, value) -> str:
# ============================================================================
# Link Utilities - Consolidated from link_utils.py
# ============================================================================
"""Link utilities - Extract and process URLs from various sources."""
"""Link utilities - Extract and process url from various sources."""
def extract_link_from_args(args: Iterable[str]) -> Any | None:

View File

@@ -77,3 +77,26 @@ mime_maps = {
"csv": { "ext": ".csv", "mimes": ["text/csv"] }
}
}
def get_type_from_ext(ext: str) -> str:
    """Map a file extension to the ``mime_maps`` category that lists it.

    Args:
        ext: File extension, with or without leading dot(s); matched
            case-insensitively (e.g. ``'jpg'`` or ``'.JPG'``).

    Returns:
        The first top-level ``mime_maps`` key whose mapping contains the
        normalized extension, or ``'other'`` when ``ext`` is empty or unknown.
    """
    if not ext:
        return 'other'

    # Normalize the same way the table keys are written: no dot, lower case.
    wanted = ext.lstrip('.').lower()
    return next(
        (category for category, known in mime_maps.items() if wanted in known),
        'other',
    )

View File

@@ -11,7 +11,7 @@ from datetime import datetime
from threading import Thread, Lock
import time
from .local_library import LocalLibraryDB
from .folder_store import FolderDB
from helper.logger import log
logger = logging.getLogger(__name__)
@@ -140,7 +140,7 @@ class Worker:
class WorkerLoggingHandler(logging.StreamHandler):
"""Custom logging handler that captures logs for a worker."""
def __init__(self, worker_id: str, db: LocalLibraryDB,
def __init__(self, worker_id: str, db: FolderDB,
manager: Optional['WorkerManager'] = None,
buffer_size: int = 50):
"""Initialize the handler.
@@ -235,7 +235,7 @@ class WorkerManager:
auto_refresh_interval: Seconds between auto-refresh checks (0 = disabled)
"""
self.library_root = Path(library_root)
self.db = LocalLibraryDB(library_root)
self.db = FolderDB(library_root)
self.auto_refresh_interval = auto_refresh_interval
self.refresh_callbacks: List[Callable] = []
self.refresh_thread: Optional[Thread] = None
@@ -244,6 +244,22 @@ class WorkerManager:
self.worker_handlers: Dict[str, WorkerLoggingHandler] = {} # Track active handlers
self._worker_last_step: Dict[str, str] = {}
def close(self) -> None:
    """Close the database handle held in ``self.db``, ignoring any error.

    Does nothing when ``self.db`` is falsy.
    """
    if not self.db:
        return
    try:
        self.db.close()
    except Exception:
        # Best-effort cleanup: a failing close must not propagate to the caller.
        pass
def __enter__(self):
    """Enter the ``with`` block and hand back this same object."""
    return self
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
    """Leave the ``with`` block by calling ``close()``.

    The exception arguments are ignored and nothing is returned, so an
    exception raised inside the block is not suppressed.
    """
    self.close()
def add_refresh_callback(self, callback: Callable[[List[Dict[str, Any]]], None]) -> None:
"""Register a callback to be called on worker updates.