This commit is contained in:
nose
2025-12-14 00:53:52 -08:00
parent 52a79b0086
commit a03eb0d1be
24 changed files with 2785 additions and 1868 deletions

@@ -1,14 +1,16 @@
from __future__ import annotations
from typing import Any, Dict, Optional, Sequence, Tuple, List, Union
from typing import Any, Dict, Optional, Sequence, Tuple, List
from pathlib import Path
import sys
import shutil
import tempfile
import models
import pipeline as ctx
from API import HydrusNetwork as hydrus_wrapper
from SYS.logger import log, debug
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
from Store import Store
from ._shared import (
Cmdlet, CmdletArg, parse_cmdlet_args, SharedArgs,
@@ -20,8 +22,8 @@ from API.folder import read_sidecar, find_sidecar, write_sidecar, API_folder_sto
from SYS.utils import sha256_file, unique_path
from metadata import write_metadata
# Use official Hydrus supported filetypes from hydrus_wrapper
SUPPORTED_MEDIA_EXTENSIONS = hydrus_wrapper.ALL_SUPPORTED_EXTENSIONS
# Canonical supported filetypes for all stores/cmdlets
SUPPORTED_MEDIA_EXTENSIONS = ALL_SUPPORTED_EXTENSIONS
class Add_File(Cmdlet):
"""Add file into the DB"""
@@ -53,93 +55,210 @@ class Add_File(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Main execution entry point."""
# Parse arguments
parsed = parse_cmdlet_args(args, self)
# Initialize state
path_arg = parsed.get("path")
location = parsed.get("store") # Fixed: was "storage", should be "store"
location = parsed.get("store")
provider_name = parsed.get("provider")
delete_after = parsed.get("delete", False)
# Coerce result to PipeObject; if result is a list, prefer the first element
effective_result = result
if isinstance(result, list) and result:
first_item = result[0]
# Prefer the first item when it's a dict payload
if isinstance(first_item, dict):
effective_result = first_item
pipe_obj = coerce_to_pipe_object(effective_result, path_arg)
stage_ctx = ctx.get_stage_context()
is_last_stage = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
# Decide which items to process.
# - If user provided -path, treat this invocation as single-item.
# - Otherwise, if piped input is a list, ingest each item.
if path_arg:
items_to_process: List[Any] = [result]
elif isinstance(result, list) and result:
items_to_process = list(result)
else:
items_to_process = [result]
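# Illustrative dispatch for the rule above (hypothetical pipelines):
#   add-file -path song.mp3        -> items_to_process == [result]      (single item)
#   search-store ... | add-file    -> items_to_process == list(result)  (fan out per row)
#   download-media ... | add-file  -> items_to_process == [result]      (single dict)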
# Debug: Log input result details
debug(f"[add-file] INPUT result type={type(result).__name__}")
if isinstance(result, list):
debug(f"[add-file] INPUT result is list with {len(result)} items")
if result and isinstance(result[0], dict):
first = result[0]
hash_val = first.get('hash')
hash_str = hash_val[:12] + "..." if hash_val else "N/A"
debug(f"[add-file] First item details: title={first.get('title')}, hash={hash_str}, store={first.get('store', 'N/A')}")
elif isinstance(result, dict):
hash_val = result.get('hash')
hash_str = hash_val[:12] + "..." if hash_val else "N/A"
debug(f"[add-file] INPUT result is dict: title={result.get('title')}, hash={hash_str}, store={result.get('store', 'N/A')}")
# Debug: Log parsed arguments
debug(f"[add-file] PARSED args: location={location}, provider={provider_name}, delete={delete_after}")
# Resolve source - returns (media_path_or_url, file_hash)
media_path_or_url, file_hash = self._resolve_source(result, path_arg, pipe_obj, config)
debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...")
if not media_path_or_url:
debug(f"[add-file] ERROR: Could not resolve source file/URL")
return 1
# Update pipe_obj with resolved path
pipe_obj.path = str(media_path_or_url)
# Check if it's a URL before validating as file
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
debug(f"Detected URL target, delegating to download-data: {media_path_or_url}")
return self._delegate_to_download_data(result, media_path_or_url, location, provider_name, args, config)
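# The URL sniffing above is a plain prefix test; a standalone sketch using the
# same prefixes (helper name illustrative, not part of this module):
# def _looks_like_url(target: str) -> bool:
#     return target.lower().startswith(("http://", "https://", "magnet:", "torrent:"))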
collected_payloads: List[Dict[str, Any]] = []
successes = 0
failures = 0
# Convert to Path and validate
media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url
# Validate source
if not self._validate_source(media_path):
debug(f"[add-file] ERROR: Source validation failed for {media_path}")
return 1
# Only run the search-store refresh when add-file is the last stage.
# In the middle of a pipeline, downstream cmdlets should receive the emitted
# storage payload directly (no need to re-search and risk duplicate emits).
auto_search_store_after_add = bool(is_last_stage) and len(items_to_process) == 1
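# Example: `download-media <url> | add-file -store local` ends on add-file with a
# single item, so the search-store refresh runs; in `... | add-file | get-tag` the
# refresh is skipped and get-tag receives the storage payload directly.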
# Debug: Log execution path decision
debug(f"[add-file] DECISION POINT: provider={provider_name}, location={location}")
debug(f" media_path={media_path}, exists={media_path.exists()}")
for item in items_to_process:
pipe_obj = coerce_to_pipe_object(item, path_arg)
# Execute transfer based on destination (using Store registry)
if provider_name:
debug(f"[add-file] ROUTE: file provider upload")
return self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after)
elif location:
# Check if location is a registered backend name
temp_dir_to_cleanup: Optional[Path] = None
delete_after_item = delete_after
try:
store = Store(config)
backends = store.list_backends()
if location in backends:
debug(f"[add-file] ROUTE: storage backend '{location}'")
return self._handle_storage_backend(media_path, location, pipe_obj, config, delete_after)
else:
# Treat as local export path
debug(f"[add-file] ROUTE: local export to path '{location}'")
return self._handle_local_export(media_path, location, pipe_obj, config, delete_after)
except Exception as exc:
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
log(f"Invalid location: {location}", file=sys.stderr)
return 1
else:
debug(f"[add-file] ERROR: No location or provider specified")
log(f"No storage location or provider specified", file=sys.stderr)
return 1
media_path_or_url, file_hash = self._resolve_source(item, path_arg, pipe_obj, config)
debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...")
if not media_path_or_url:
failures += 1
continue
# Update pipe_obj with resolved path
pipe_obj.path = str(media_path_or_url)
# URL targets: prefer provider-aware download for OpenLibrary selections.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
table = None
full_metadata = None
if isinstance(pipe_obj.extra, dict):
table = pipe_obj.extra.get("table")
full_metadata = pipe_obj.extra.get("full_metadata")
is_openlibrary = (str(table or "").lower() == "openlibrary") or ("openlibrary.org/books/" in media_path_or_url.lower())
if is_openlibrary:
# Enrich tags from OpenLibrary metadata so the stored file has book tags (author/pages/etc).
try:
from Provider.openlibrary import OpenLibrary as _OpenLibrary
olid = None
archive_id = None
if isinstance(full_metadata, dict):
olid = full_metadata.get("openlibrary_id") or full_metadata.get("openlibrary")
archive_id = full_metadata.get("archive_id")
if not olid:
import re
m = re.search(r"/books/(OL\d+M)", str(media_path_or_url), flags=re.IGNORECASE)
if m:
olid = m.group(1)
scraped_tags: List[str] = []
if olid:
scraped_tags.extend(_OpenLibrary.scrape_openlibrary_metadata(str(olid)) or [])
if archive_id:
scraped_tags.append(f"internet_archive:{archive_id}")
if scraped_tags:
existing = list(pipe_obj.tag or [])
pipe_obj.tag = merge_sequences(existing, scraped_tags, case_sensitive=False)
except Exception:
pass
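# Example of the enrichment above (assumed URL shape): a selection pointing at
# https://openlibrary.org/books/OL12345M/Some_Title yields olid == "OL12345M"
# via the regex fallback, and scrape_openlibrary_metadata("OL12345M") supplies
# book tags (author/pages/etc.) merged case-insensitively into pipe_obj.tag.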
from ProviderCore.registry import get_search_provider
from ProviderCore.base import SearchResult
provider = get_search_provider("openlibrary", config)
if provider is None:
log("[add-file] OpenLibrary provider not available", file=sys.stderr)
failures += 1
continue
temp_dir_to_cleanup = Path(tempfile.mkdtemp(prefix="medios_openlibrary_"))
sr = SearchResult(
table="openlibrary",
title=str(getattr(pipe_obj, "title", None) or "Unknown"),
path=str(media_path_or_url),
full_metadata=full_metadata if isinstance(full_metadata, dict) else {},
)
downloaded = provider.download(sr, temp_dir_to_cleanup)
if downloaded is None:
log("[add-file] OpenLibrary download failed", file=sys.stderr)
failures += 1
continue
downloaded_path = Path(downloaded)
if downloaded_path.exists() and downloaded_path.is_dir():
log(
"[add-file] OpenLibrary download produced a directory (missing img2pdf?). Cannot ingest.",
file=sys.stderr,
)
failures += 1
continue
media_path_or_url = str(downloaded_path)
pipe_obj.path = str(downloaded_path)
delete_after_item = True
# For non-provider URLs, or if still a URL after provider attempt, delegate to download-media.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
code = self._delegate_to_download_data(item, media_path_or_url, location, provider_name, args, config)
if code == 0:
successes += 1
else:
failures += 1
continue
media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url
if not self._validate_source(media_path):
failures += 1
continue
if provider_name:
code = self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after_item)
if code == 0:
successes += 1
else:
failures += 1
continue
if location:
try:
store = Store(config)
backends = store.list_backends()
if location in backends:
code = self._handle_storage_backend(
item,
media_path,
location,
pipe_obj,
config,
delete_after_item,
collect_payloads=collected_payloads,
suppress_last_stage_overlay=is_last_stage and len(items_to_process) > 1,
auto_search_store=auto_search_store_after_add,
)
else:
code = self._handle_local_export(media_path, location, pipe_obj, config, delete_after_item)
except Exception as exc:
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
log(f"Invalid location: {location}", file=sys.stderr)
failures += 1
continue
if code == 0:
successes += 1
else:
failures += 1
continue
log("No destination specified", file=sys.stderr)
failures += 1
finally:
if temp_dir_to_cleanup is not None:
try:
shutil.rmtree(temp_dir_to_cleanup, ignore_errors=True)
except Exception:
pass
# If we processed multiple storage ingests, present a single consolidated overlay table.
if is_last_stage and len(items_to_process) > 1 and collected_payloads:
try:
from result_table import ResultTable
table = ResultTable("Result")
for payload in collected_payloads:
table.add_result(payload)
# Make this the active selectable table so @.. returns here (and playlist table is kept in history).
ctx.set_last_result_table(table, collected_payloads, subject=collected_payloads)
except Exception:
pass
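# e.g. ingesting three piped items yields one consolidated "Result" table with
# three rows, selectable afterwards via @1..@3 just like a search-store table.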
if successes > 0:
return 0
return 1
@staticmethod
def _resolve_source(
@@ -149,10 +268,7 @@ class Add_File(Cmdlet):
config: Dict[str, Any],
) -> Tuple[Optional[Path | str], Optional[str]]:
"""Resolve the source file path from args or pipeline result.
PRIORITY: hash+store pattern is preferred over path-based resolution.
This ensures consistency when @N selections pass hash+store identifiers.
Returns (media_path_or_url, file_hash)
where media_path_or_url can be a Path object or a URL string.
"""
@@ -161,8 +277,9 @@ class Add_File(Cmdlet):
result_hash = result.get("hash")
result_store = result.get("store")
if result_hash and result_store:
debug(f"[add-file] Using hash+store from result: hash={result_hash[:12]}..., store={result_store}")
# Use get_file to retrieve from the specific store
debug(
f"[add-file] Using hash+store from result: hash={str(result_hash)[:12]}..., store={result_store}"
)
try:
store = Store(config)
if result_store in store.list_backends():
@@ -170,16 +287,15 @@ class Add_File(Cmdlet):
media_path = backend.get_file(result_hash)
if isinstance(media_path, Path) and media_path.exists():
pipe_obj.path = str(media_path)
debug(f"[add-file] Retrieved file from {result_store}: {media_path}")
return media_path, result_hash
if isinstance(media_path, str) and media_path.lower().startswith(("http://", "https://")):
return media_path, str(result_hash)
if isinstance(media_path, str) and media_path.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
pipe_obj.path = media_path
debug(f"[add-file] Retrieved URL from {result_store}: {media_path}")
return media_path, result_hash
return media_path, str(result_hash)
except Exception as exc:
debug(f"[add-file] Failed to retrieve via hash+store: {exc}")
# PRIORITY 2: Try explicit path argument
if path_arg:
media_path = Path(path_arg)
@@ -196,10 +312,9 @@ class Add_File(Cmdlet):
file_hash = pipe_path_str.split(":", 1)[1]
media_path, success = Add_File._fetch_hydrus_path(file_hash, config)
return media_path, file_hash if success else None
# Check if pipe_path is a URL - skip to URL handling below
if not pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
media_path = Path(pipe_path_str)
return media_path, None
if pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
return pipe_path_str, None
return Path(pipe_path_str), None
# PRIORITY 4: Try from pipe_obj.url (for streaming url without downloaded file)
pipe_url = getattr(pipe_obj, "url", None)
@@ -248,8 +363,9 @@ class Add_File(Cmdlet):
# Look for path or path-like keys
path_candidate = first_item.get("path") or first_item.get("filepath") or first_item.get("file")
# If the dict includes a 'paths' list (multi-part/section download), prefer the first file
if not path_candidate and isinstance(first_item.get("paths"), (list, tuple)) and first_item.get("paths"):
path_candidate = first_item.get("paths")[0]
paths_val = first_item.get("paths")
if not path_candidate and isinstance(paths_val, (list, tuple)) and paths_val:
path_candidate = paths_val[0]
if path_candidate:
debug(f"Resolved path from result dict: {path_candidate}")
try:
@@ -361,10 +477,12 @@ class Add_File(Cmdlet):
selection_args = result["_selection_args"]
if selection_args:
dl_args.extend(selection_args)
elif hasattr(result, 'extra') and isinstance(result.extra, dict) and "_selection_args" in result.extra:
selection_args = result.extra["_selection_args"]
if selection_args:
dl_args.extend(selection_args)
else:
extra_val = getattr(result, "extra", None)
if isinstance(extra_val, dict) and "_selection_args" in extra_val:
selection_args = extra_val["_selection_args"]
if selection_args:
dl_args.extend(selection_args)
# download-media doesn't support -storage flag
# It downloads to the configured directory, then add-file will handle storage
@@ -375,18 +493,32 @@ class Add_File(Cmdlet):
@staticmethod
def _get_url(result: Any, pipe_obj: models.PipeObject) -> List[str]:
url: List[str] = []
try:
if isinstance(pipe_obj.extra, dict):
url = list(pipe_obj.extra.get("url") or pipe_obj.extra.get("url") or [])
except Exception:
pass
from metadata import normalize_urls
if not url and isinstance(result, dict):
url = list(result.get("url") or result.get("url") or [])
if not url:
url = list(extract_url_from_result(result) or [])
return url
# Prefer explicit PipeObject.url if present
urls: List[str] = []
try:
urls = normalize_urls(getattr(pipe_obj, "url", None))
except Exception:
urls = []
# Then check extra.url
if not urls:
try:
if isinstance(pipe_obj.extra, dict):
urls = normalize_urls(pipe_obj.extra.get("url"))
except Exception:
pass
# Then check result dict
if not urls and isinstance(result, dict):
urls = normalize_urls(result.get("url"))
# Finally, try extractor helper
if not urls:
urls = normalize_urls(extract_url_from_result(result))
return urls
@staticmethod
def _get_relationships(result: Any, pipe_obj: models.PipeObject) -> Optional[Dict[str, Any]]:
@@ -405,10 +537,36 @@ class Add_File(Cmdlet):
@staticmethod
def _get_duration(result: Any, pipe_obj: models.PipeObject) -> Optional[float]:
if getattr(pipe_obj, "duration", None) is not None:
return pipe_obj.duration
def _parse_duration(value: Any) -> Optional[float]:
if value is None:
return None
if isinstance(value, (int, float)):
return float(value) if value > 0 else None
if isinstance(value, str):
s = value.strip()
if not s:
return None
try:
candidate = float(s)
return candidate if candidate > 0 else None
except ValueError:
pass
if ":" in s:
parts = [p.strip() for p in s.split(":") if p.strip()]
if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
nums = [int(p) for p in parts]
if len(nums) == 2:
minutes, seconds = nums
return float(minutes * 60 + seconds)
hours, minutes, seconds = nums
return float(hours * 3600 + minutes * 60 + seconds)
return None
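# Worked examples for _parse_duration (traceable through the branches above):
#   _parse_duration("90")      -> 90.0    (plain numeric string)
#   _parse_duration("3:25")    -> 205.0   (MM:SS -> 3*60 + 25)
#   _parse_duration("1:02:03") -> 3723.0  (HH:MM:SS)
#   _parse_duration(0)         -> None    (non-positive values are rejected)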
parsed = _parse_duration(getattr(pipe_obj, "duration", None))
if parsed is not None:
return parsed
try:
return extract_duration(result)
return _parse_duration(extract_duration(result))
except Exception:
return None
@@ -442,19 +600,20 @@ class Add_File(Cmdlet):
ctx.set_current_stage_table(None)
@staticmethod
def _emit_storage_result(payload: Dict[str, Any]) -> None:
def _emit_storage_result(payload: Dict[str, Any], *, overlay: bool = True, emit: bool = True) -> None:
"""Emit a storage-style result payload.
- Always emits the dict downstream (when in a pipeline).
- If this is the last stage (or not in a pipeline), prints a search-store-like table
and sets an overlay table/items for @N selection.
"""
# Always emit for downstream commands (no-op if not in a pipeline)
ctx.emit(payload)
# Emit for downstream commands (no-op if not in a pipeline)
if emit:
ctx.emit(payload)
stage_ctx = ctx.get_stage_context()
is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
if not is_last:
if not is_last or not overlay:
return
try:
@@ -470,6 +629,53 @@ class Add_File(Cmdlet):
except Exception:
pass
@staticmethod
def _try_emit_search_store_by_hash(*, store: str, hash_value: str, config: Dict[str, Any]) -> bool:
"""Run search-store for a single hash so the final table/payload is consistent.
Important: `add-file` is treated as an action command by the CLI, so the CLI only
prints tables for it when a display overlay exists. After running search-store,
this copies the resulting table into the display overlay (when this is the last
stage) so the canonical store table is what the user sees and can select from.
Returns True if search-store ran successfully, else False.
"""
try:
from cmdlet.search_store import CMDLET as search_store_cmdlet
args = ["-store", str(store), f"hash:{str(hash_value)}"]
log(f"[add-file] Refresh: search-store -store {store} \"hash:{hash_value}\"", file=sys.stderr)
# Run search-store under a temporary stage context so its ctx.emit() calls
# don't interfere with the outer add-file pipeline stage.
prev_ctx = ctx.get_stage_context()
temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, worker_id=getattr(prev_ctx, "worker_id", None))
ctx.set_stage_context(temp_ctx)
try:
code = search_store_cmdlet.run(None, args, config)
finally:
ctx.set_stage_context(prev_ctx)
if code != 0:
return False
# Promote the search-store result to a display overlay so the CLI prints it
# for action commands like add-file.
stage_ctx = ctx.get_stage_context()
is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
if is_last:
try:
table = ctx.get_last_result_table()
items = ctx.get_last_result_items()
if table is not None and items:
ctx.set_last_result_table_overlay(table, items, subject={"store": store, "hash": hash_value})
except Exception:
pass
return True
except Exception as exc:
debug(f"[add-file] Failed to run search-store after add-file: {type(exc).__name__}: {exc}")
return False
@staticmethod
def _prepare_metadata(
result: Any,
@@ -664,8 +870,9 @@ class Add_File(Cmdlet):
if not username or not filename:
debug(f"[add-file] ERROR: Could not extract soulseek metadata from result (type={type(result).__name__})")
if hasattr(result, "extra"):
debug(f"[add-file] Result extra keys: {list(result.extra.keys())}")
extra_val = getattr(result, "extra", None)
if isinstance(extra_val, dict):
debug(f"[add-file] Result extra keys: {list(extra_val.keys())}")
return None
if not username or not filename:
@@ -769,28 +976,55 @@ class Add_File(Cmdlet):
@staticmethod
def _handle_storage_backend(
result: Any,
media_path: Path,
backend_name: str,
pipe_obj: models.PipeObject,
config: Dict[str, Any],
delete_after: bool,
*,
collect_payloads: Optional[List[Dict[str, Any]]] = None,
suppress_last_stage_overlay: bool = False,
auto_search_store: bool = True,
) -> int:
"""Handle uploading to a registered storage backend (e.g., 'test' folder store, 'hydrus', etc.)."""
log(f"Adding file to storage backend '{backend_name}': {media_path.name}", file=sys.stderr)
delete_after_effective = bool(delete_after)
if not delete_after_effective:
# When download-media is piped into add-file, the downloaded artifact is a temp file.
# After it is persisted to a storage backend, delete the temp copy to avoid duplicates.
try:
if (
str(backend_name or "").strip().lower() != "temp"
and getattr(pipe_obj, "is_temp", False)
and getattr(pipe_obj, "action", None) == "cmdlet:download-media"
):
from config import resolve_output_dir
temp_dir = resolve_output_dir(config)
try:
if media_path.resolve().is_relative_to(temp_dir.expanduser().resolve()):
delete_after_effective = True
debug(f"[add-file] Auto-delete temp source after ingest: {media_path}")
except Exception:
# If path resolution fails, fall back to non-destructive behavior
pass
except Exception:
pass
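# Sketch of the containment test above: a download-media artifact saved under
# resolve_output_dir(config), e.g. <output_dir>/clip.mp4 (name hypothetical),
# satisfies is_relative_to() and is deleted after a successful ingest into any
# non-"temp" backend. Note Path.is_relative_to requires Python 3.9+.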
try:
store = Store(config)
backend = store[backend_name]
# Prepare metadata from pipe_obj and sidecars
tags, url, title, f_hash = Add_File._prepare_metadata(None, media_path, pipe_obj, config)
tags, url, title, f_hash = Add_File._prepare_metadata(result, media_path, pipe_obj, config)
# Call backend's add_file with full metadata
# Backend returns hash as identifier
file_identifier = backend.add_file(
media_path,
title=title,
tags=tags,
tag=tags,
url=url
)
log(f"✓ File added to '{backend_name}': {file_identifier}", file=sys.stderr)
@@ -822,6 +1056,14 @@ class Add_File(Cmdlet):
# Keep hash/store for downstream commands (get-tag, get-file, etc.).
resolved_hash = file_identifier if len(file_identifier) == 64 else (f_hash or file_identifier or "unknown")
# If we have url(s), ensure they get associated with the destination file.
# This mirrors `add-url` behavior but avoids emitting extra pipeline noise.
if url:
try:
backend.add_url(resolved_hash, list(url))
except Exception:
pass
meta: Dict[str, Any] = {}
try:
meta = backend.get_metadata(resolved_hash) or {}
@@ -865,9 +1107,30 @@ class Add_File(Cmdlet):
"tag": list(tags or []),
"url": list(url or []),
}
Add_File._emit_storage_result(payload)
if collect_payloads is not None:
try:
collect_payloads.append(payload)
except Exception:
pass
# Keep the add-file 1-row summary overlay (when last stage), then emit the
# canonical search-store payload/table for piping/selection consistency.
if auto_search_store and resolved_hash and resolved_hash != "unknown":
# Show the add-file summary (overlay only) but let search-store provide the downstream payload.
Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=False)
ok = Add_File._try_emit_search_store_by_hash(
store=backend_name,
hash_value=resolved_hash,
config=config,
)
if not ok:
# Fall back to emitting the add-file payload so downstream stages still receive an item.
ctx.emit(payload)
else:
Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=True)
Add_File._cleanup_after_success(media_path, delete_source=delete_after)
Add_File._cleanup_after_success(media_path, delete_source=delete_after_effective)
return 0
except Exception as exc: