nose
2025-12-14 00:53:52 -08:00
parent 52a79b0086
commit a03eb0d1be
24 changed files with 2785 additions and 1868 deletions

View File

@@ -5,10 +5,9 @@ from __future__ import annotations
import json
import sys
import inspect
from collections.abc import Iterable as IterableABC
from SYS.logger import log, debug
from SYS.logger import log
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set
from dataclasses import dataclass, field
@@ -690,7 +689,9 @@ def get_field(obj: Any, field: str, default: Optional[Any] = None) -> Any:
get_field(result, "table", "unknown") # With default
"""
# Handle lists by accessing the first element
if isinstance(obj, list) and obj:
if isinstance(obj, list):
if not obj:
return default
obj = obj[0]
if isinstance(obj, dict):
@@ -702,8 +703,9 @@ def get_field(obj: Any, field: str, default: Optional[Any] = None) -> Any:
return value
# For PipeObjects, also check the extra field
if hasattr(obj, 'extra') and isinstance(obj.extra, dict):
return obj.extra.get(field, default)
extra_val = getattr(obj, 'extra', None)
if isinstance(extra_val, dict):
return extra_val.get(field, default)
return default
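For reference, a minimal usage sketch of get_field across the shapes handled above (the PipeObject branch resolves through the object's extra dict in the same way):

get_field({"table": "openlibrary"}, "table", "unknown")   # -> "openlibrary"
get_field([{"hash": "abc"}], "hash")                      # -> "abc"     (list: first element)
get_field([], "table", "unknown")                         # -> "unknown" (empty list: default)
# Objects carrying an `extra` dict (PipeObject-style) fall back to extra[field] before the default.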
@@ -1118,7 +1120,7 @@ def create_pipe_object_result(
Returns:
Dict with all PipeObject fields for emission
"""
result = {
result: Dict[str, Any] = {
'source': source,
'id': identifier,
'path': file_path,
@@ -1546,14 +1548,11 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
extra = {k: v for k, v in value.items() if k not in known_keys}
# Extract URL: prefer direct url field, then url list
url_val = value.get("url")
if not url_val:
url = value.get("url") or value.get("url") or []
if url and isinstance(url, list) and len(url) > 0:
url_val = url[0]
# Preserve url in extra if multiple url exist
if url and len(url) > 1:
extra["url"] = url
from metadata import normalize_urls
url_list = normalize_urls(value.get("url"))
url_val = url_list[0] if url_list else None
if len(url_list) > 1:
extra["url"] = url_list
# Extract relationships
rels = value.get("relationships") or {}
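The URL handling above relies on metadata.normalize_urls, which is not part of this diff; judging from its call sites (a raw string, a list, or None in, a list of strings out), the assumed contract is roughly the sketch below, and the real helper may also canonicalise in ways this sketch omits.

# Assumed contract only; the real metadata.normalize_urls is not shown in this commit.
def _normalize_urls_sketch(value):
    if value is None:
        return []
    items = value if isinstance(value, (list, tuple)) else [value]
    out = []
    for item in items:
        text = str(item or "").strip()
        if text and text not in out:
            out.append(text)
    return out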

View File

@@ -1,14 +1,16 @@
from __future__ import annotations
from typing import Any, Dict, Optional, Sequence, Tuple, List, Union
from typing import Any, Dict, Optional, Sequence, Tuple, List
from pathlib import Path
import sys
import shutil
import tempfile
import models
import pipeline as ctx
from API import HydrusNetwork as hydrus_wrapper
from SYS.logger import log, debug
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
from Store import Store
from ._shared import (
Cmdlet, CmdletArg, parse_cmdlet_args, SharedArgs,
@@ -20,8 +22,8 @@ from API.folder import read_sidecar, find_sidecar, write_sidecar, API_folder_sto
from SYS.utils import sha256_file, unique_path
from metadata import write_metadata
# Use official Hydrus supported filetypes from hydrus_wrapper
SUPPORTED_MEDIA_EXTENSIONS = hydrus_wrapper.ALL_SUPPORTED_EXTENSIONS
# Canonical supported filetypes for all stores/cmdlets
SUPPORTED_MEDIA_EXTENSIONS = ALL_SUPPORTED_EXTENSIONS
class Add_File(Cmdlet):
"""Add file into the DB"""
@@ -53,93 +55,210 @@ class Add_File(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Main execution entry point."""
# Parse arguments
parsed = parse_cmdlet_args(args, self)
# Initialize state
path_arg = parsed.get("path")
location = parsed.get("store") # Fixed: was "storage", should be "store"
location = parsed.get("store")
provider_name = parsed.get("provider")
delete_after = parsed.get("delete", False)
# Coerce result to PipeObject; if result is a list, prefer the first element
effective_result = result
if isinstance(result, list) and result:
first_item = result[0]
# Prefer first item if it's a dict or PipeObject
if isinstance(first_item, (dict, )):
effective_result = first_item
pipe_obj = coerce_to_pipe_object(effective_result, path_arg)
stage_ctx = ctx.get_stage_context()
is_last_stage = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
# Decide which items to process.
# - If user provided -path, treat this invocation as single-item.
# - Otherwise, if piped input is a list, ingest each item.
if path_arg:
items_to_process: List[Any] = [result]
elif isinstance(result, list) and result:
items_to_process = list(result)
else:
items_to_process = [result]
# Debug: Log input result details
debug(f"[add-file] INPUT result type={type(result).__name__}")
if isinstance(result, list):
debug(f"[add-file] INPUT result is list with {len(result)} items")
if result and isinstance(result[0], dict):
first = result[0]
hash_val = first.get('hash')
hash_str = hash_val[:12] + "..." if hash_val else "N/A"
debug(f"[add-file] First item details: title={first.get('title')}, hash={hash_str}, store={first.get('store', 'N/A')}")
elif isinstance(result, dict):
hash_val = result.get('hash')
hash_str = hash_val[:12] + "..." if hash_val else "N/A"
debug(f"[add-file] INPUT result is dict: title={result.get('title')}, hash={hash_str}, store={result.get('store', 'N/A')}")
# Debug: Log parsed arguments
debug(f"[add-file] PARSED args: location={location}, provider={provider_name}, delete={delete_after}")
# Resolve source - returns (media_path_or_url, file_hash)
media_path_or_url, file_hash = self._resolve_source(result, path_arg, pipe_obj, config)
debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...")
if not media_path_or_url:
debug(f"[add-file] ERROR: Could not resolve source file/URL")
return 1
# Update pipe_obj with resolved path
pipe_obj.path = str(media_path_or_url) if isinstance(media_path_or_url, (str, Path)) else str(media_path_or_url)
# Check if it's a URL before validating as file
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
debug(f"Detected URL target, delegating to download-data: {media_path_or_url}")
return self._delegate_to_download_data(result, media_path_or_url, location, provider_name, args, config)
collected_payloads: List[Dict[str, Any]] = []
successes = 0
failures = 0
# Convert to Path and validate
media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url
# Validate source
if not self._validate_source(media_path):
debug(f"[add-file] ERROR: Source validation failed for {media_path}")
return 1
# Only run the search-store refresh when add-file is the last stage.
# In the middle of a pipeline, downstream cmdlets should receive the emitted
# storage payload directly (no need to re-search and risk duplicate emits).
auto_search_store_after_add = bool(is_last_stage) and len(items_to_process) == 1
# Debug: Log execution path decision
debug(f"[add-file] DECISION POINT: provider={provider_name}, location={location}")
debug(f" media_path={media_path}, exists={media_path.exists()}")
for item in items_to_process:
pipe_obj = coerce_to_pipe_object(item, path_arg)
# Execute transfer based on destination (using Store registry)
if provider_name:
debug(f"[add-file] ROUTE: file provider upload")
return self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after)
elif location:
# Check if location is a registered backend name
temp_dir_to_cleanup: Optional[Path] = None
delete_after_item = delete_after
try:
store = Store(config)
backends = store.list_backends()
if location in backends:
debug(f"[add-file] ROUTE: storage backend '{location}'")
return self._handle_storage_backend(media_path, location, pipe_obj, config, delete_after)
else:
# Treat as local export path
debug(f"[add-file] ROUTE: local export to path '{location}'")
return self._handle_local_export(media_path, location, pipe_obj, config, delete_after)
except Exception as exc:
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
log(f"Invalid location: {location}", file=sys.stderr)
return 1
else:
debug(f"[add-file] ERROR: No location or provider specified")
log(f"No storage location or provider specified", file=sys.stderr)
return 1
media_path_or_url, file_hash = self._resolve_source(item, path_arg, pipe_obj, config)
debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...")
if not media_path_or_url:
failures += 1
continue
# Update pipe_obj with resolved path
pipe_obj.path = str(media_path_or_url)
# URL targets: prefer provider-aware download for OpenLibrary selections.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
table = None
full_metadata = None
if isinstance(pipe_obj.extra, dict):
table = pipe_obj.extra.get("table")
full_metadata = pipe_obj.extra.get("full_metadata")
is_openlibrary = (str(table or "").lower() == "openlibrary") or ("openlibrary.org/books/" in media_path_or_url.lower())
if is_openlibrary:
# Enrich tags from OpenLibrary metadata so the stored file has book tags (author/pages/etc).
try:
from Provider.openlibrary import OpenLibrary as _OpenLibrary
olid = None
archive_id = None
if isinstance(full_metadata, dict):
olid = full_metadata.get("openlibrary_id") or full_metadata.get("openlibrary")
archive_id = full_metadata.get("archive_id")
if not olid:
import re
m = re.search(r"/books/(OL\d+M)", str(media_path_or_url), flags=re.IGNORECASE)
if m:
olid = m.group(1)
scraped_tags: List[str] = []
if olid:
scraped_tags.extend(_OpenLibrary.scrape_openlibrary_metadata(str(olid)) or [])
if archive_id:
scraped_tags.append(f"internet_archive:{archive_id}")
if scraped_tags:
existing = list(pipe_obj.tag or [])
pipe_obj.tag = merge_sequences(existing, scraped_tags, case_sensitive=False)
except Exception:
pass
from ProviderCore.registry import get_search_provider
from ProviderCore.base import SearchResult
provider = get_search_provider("openlibrary", config)
if provider is None:
log("[add-file] OpenLibrary provider not available", file=sys.stderr)
failures += 1
continue
temp_dir_to_cleanup = Path(tempfile.mkdtemp(prefix="medios_openlibrary_"))
sr = SearchResult(
table="openlibrary",
title=str(getattr(pipe_obj, "title", None) or "Unknown"),
path=str(media_path_or_url),
full_metadata=full_metadata if isinstance(full_metadata, dict) else {},
)
downloaded = provider.download(sr, temp_dir_to_cleanup)
if downloaded is None:
log("[add-file] OpenLibrary download failed", file=sys.stderr)
failures += 1
continue
downloaded_path = Path(downloaded)
if downloaded_path.exists() and downloaded_path.is_dir():
log(
"[add-file] OpenLibrary download produced a directory (missing img2pdf?). Cannot ingest.",
file=sys.stderr,
)
failures += 1
continue
media_path_or_url = str(downloaded_path)
pipe_obj.path = str(downloaded_path)
delete_after_item = True
# For non-provider URLs, or if still a URL after provider attempt, delegate to download-media.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
code = self._delegate_to_download_data(item, media_path_or_url, location, provider_name, args, config)
if code == 0:
successes += 1
else:
failures += 1
continue
media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url
if not self._validate_source(media_path):
failures += 1
continue
if provider_name:
code = self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after_item)
if code == 0:
successes += 1
else:
failures += 1
continue
if location:
try:
store = Store(config)
backends = store.list_backends()
if location in backends:
code = self._handle_storage_backend(
item,
media_path,
location,
pipe_obj,
config,
delete_after_item,
collect_payloads=collected_payloads,
suppress_last_stage_overlay=is_last_stage and len(items_to_process) > 1,
auto_search_store=auto_search_store_after_add,
)
else:
code = self._handle_local_export(media_path, location, pipe_obj, config, delete_after_item)
except Exception as exc:
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
log(f"Invalid location: {location}", file=sys.stderr)
failures += 1
continue
if code == 0:
successes += 1
else:
failures += 1
continue
log("No destination specified", file=sys.stderr)
failures += 1
finally:
if temp_dir_to_cleanup is not None:
try:
shutil.rmtree(temp_dir_to_cleanup, ignore_errors=True)
except Exception:
pass
# If we processed multiple storage ingests, present a single consolidated overlay table.
if is_last_stage and len(items_to_process) > 1 and collected_payloads:
try:
from result_table import ResultTable
table = ResultTable("Result")
for payload in collected_payloads:
table.add_result(payload)
# Make this the active selectable table so @.. returns here (and playlist table is kept in history).
ctx.set_last_result_table(table, collected_payloads, subject=collected_payloads)
except Exception:
pass
if successes > 0:
return 0
return 1
@staticmethod
def _resolve_source(
@@ -149,10 +268,7 @@ class Add_File(Cmdlet):
config: Dict[str, Any],
) -> Tuple[Optional[Path | str], Optional[str]]:
"""Resolve the source file path from args or pipeline result.
PRIORITY: hash+store pattern is preferred over path-based resolution.
This ensures consistency when @N selections pass hash+store identifiers.
Returns (media_path_or_url, file_hash)
where media_path_or_url can be a Path object or a URL string.
"""
@@ -161,8 +277,9 @@ class Add_File(Cmdlet):
result_hash = result.get("hash")
result_store = result.get("store")
if result_hash and result_store:
debug(f"[add-file] Using hash+store from result: hash={result_hash[:12]}..., store={result_store}")
# Use get_file to retrieve from the specific store
debug(
f"[add-file] Using hash+store from result: hash={str(result_hash)[:12]}..., store={result_store}"
)
try:
store = Store(config)
if result_store in store.list_backends():
@@ -170,16 +287,15 @@ class Add_File(Cmdlet):
media_path = backend.get_file(result_hash)
if isinstance(media_path, Path) and media_path.exists():
pipe_obj.path = str(media_path)
debug(f"[add-file] Retrieved file from {result_store}: {media_path}")
return media_path, result_hash
if isinstance(media_path, str) and media_path.lower().startswith(("http://", "https://")):
return media_path, str(result_hash)
if isinstance(media_path, str) and media_path.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
pipe_obj.path = media_path
debug(f"[add-file] Retrieved URL from {result_store}: {media_path}")
return media_path, result_hash
return media_path, str(result_hash)
except Exception as exc:
debug(f"[add-file] Failed to retrieve via hash+store: {exc}")
# PRIORITY 2: Try explicit path argument
if path_arg:
media_path = Path(path_arg)
@@ -196,10 +312,9 @@ class Add_File(Cmdlet):
file_hash = pipe_path_str.split(":", 1)[1]
media_path, success = Add_File._fetch_hydrus_path(file_hash, config)
return media_path, file_hash if success else None
# Check if pipe_path is a URL - skip to URL handling below
if not pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
media_path = Path(pipe_path_str)
return media_path, None
if pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
return pipe_path_str, None
return Path(pipe_path_str), None
# PRIORITY 4: Try from pipe_obj.url (for streaming url without downloaded file)
pipe_url = getattr(pipe_obj, "url", None)
@@ -248,8 +363,9 @@ class Add_File(Cmdlet):
# Look for path or path-like keys
path_candidate = first_item.get("path") or first_item.get("filepath") or first_item.get("file")
# If the dict includes a 'paths' list (multi-part/section download), prefer the first file
if not path_candidate and isinstance(first_item.get("paths"), (list, tuple)) and first_item.get("paths"):
path_candidate = first_item.get("paths")[0]
paths_val = first_item.get("paths")
if not path_candidate and isinstance(paths_val, (list, tuple)) and paths_val:
path_candidate = paths_val[0]
if path_candidate:
debug(f"Resolved path from result dict: {path_candidate}")
try:
@@ -361,10 +477,12 @@ class Add_File(Cmdlet):
selection_args = result["_selection_args"]
if selection_args:
dl_args.extend(selection_args)
elif hasattr(result, 'extra') and isinstance(result.extra, dict) and "_selection_args" in result.extra:
selection_args = result.extra["_selection_args"]
if selection_args:
dl_args.extend(selection_args)
else:
extra_val = getattr(result, "extra", None)
if isinstance(extra_val, dict) and "_selection_args" in extra_val:
selection_args = extra_val["_selection_args"]
if selection_args:
dl_args.extend(selection_args)
# download-media doesn't support -storage flag
# It downloads to the configured directory, then add-file will handle storage
@@ -375,18 +493,32 @@ class Add_File(Cmdlet):
@staticmethod
def _get_url(result: Any, pipe_obj: models.PipeObject) -> List[str]:
url: List[str] = []
try:
if isinstance(pipe_obj.extra, dict):
url = list(pipe_obj.extra.get("url") or pipe_obj.extra.get("url") or [])
except Exception:
pass
from metadata import normalize_urls
if not url and isinstance(result, dict):
url = list(result.get("url") or result.get("url") or [])
if not url:
url = list(extract_url_from_result(result) or [])
return url
# Prefer explicit PipeObject.url if present
urls: List[str] = []
try:
urls = normalize_urls(getattr(pipe_obj, "url", None))
except Exception:
urls = []
# Then check extra.url
if not urls:
try:
if isinstance(pipe_obj.extra, dict):
urls = normalize_urls(pipe_obj.extra.get("url"))
except Exception:
pass
# Then check result dict
if not urls and isinstance(result, dict):
urls = normalize_urls(result.get("url"))
# Finally, try extractor helper
if not urls:
urls = normalize_urls(extract_url_from_result(result))
return urls
@staticmethod
def _get_relationships(result: Any, pipe_obj: models.PipeObject) -> Optional[Dict[str, Any]]:
@@ -405,10 +537,36 @@ class Add_File(Cmdlet):
@staticmethod
def _get_duration(result: Any, pipe_obj: models.PipeObject) -> Optional[float]:
if getattr(pipe_obj, "duration", None) is not None:
return pipe_obj.duration
def _parse_duration(value: Any) -> Optional[float]:
if value is None:
return None
if isinstance(value, (int, float)):
return float(value) if value > 0 else None
if isinstance(value, str):
s = value.strip()
if not s:
return None
try:
candidate = float(s)
return candidate if candidate > 0 else None
except ValueError:
pass
if ":" in s:
parts = [p.strip() for p in s.split(":") if p.strip()]
if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
nums = [int(p) for p in parts]
if len(nums) == 2:
minutes, seconds = nums
return float(minutes * 60 + seconds)
hours, minutes, seconds = nums
return float(hours * 3600 + minutes * 60 + seconds)
return None
parsed = _parse_duration(getattr(pipe_obj, "duration", None))
if parsed is not None:
return parsed
try:
return extract_duration(result)
return _parse_duration(extract_duration(result))
except Exception:
return None
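Worked values for the duration coercion above:

# _parse_duration examples (derived from the rules shown):
#   90            -> 90.0
#   "90"          -> 90.0
#   "1:23"        -> 83.0     (MM:SS)
#   "01:02:03"    -> 3723.0   (HH:MM:SS)
#   0, "", "n/a"  -> None     (non-positive or unparseable values are dropped)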
@@ -442,19 +600,20 @@ class Add_File(Cmdlet):
ctx.set_current_stage_table(None)
@staticmethod
def _emit_storage_result(payload: Dict[str, Any]) -> None:
def _emit_storage_result(payload: Dict[str, Any], *, overlay: bool = True, emit: bool = True) -> None:
"""Emit a storage-style result payload.
- Always emits the dict downstream (when in a pipeline).
- If this is the last stage (or not in a pipeline), prints a search-store-like table
and sets an overlay table/items for @N selection.
"""
# Always emit for downstream commands (no-op if not in a pipeline)
ctx.emit(payload)
# Emit for downstream commands (no-op if not in a pipeline)
if emit:
ctx.emit(payload)
stage_ctx = ctx.get_stage_context()
is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
if not is_last:
if not is_last or not overlay:
return
try:
@@ -470,6 +629,53 @@ class Add_File(Cmdlet):
except Exception:
pass
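# The overlay/emit flags above are combined later in _handle_storage_backend:
#   _emit_storage_result(payload, overlay=..., emit=False) -> show only the 1-row summary table;
#                                                              search-store supplies the downstream payload
#   _emit_storage_result(payload, overlay=..., emit=True)  -> emit downstream and, on the last stage,
#                                                              show the overlay table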
@staticmethod
def _try_emit_search_store_by_hash(*, store: str, hash_value: str, config: Dict[str, Any]) -> bool:
"""Run search-store for a single hash so the final table/payload is consistent.
Important: `add-file` is treated as an action command by the CLI, so the CLI only
prints tables for it when a display overlay exists. After running search-store,
this copies the resulting table into the display overlay (when this is the last
stage) so the canonical store table is what the user sees and can select from.
Returns True if search-store ran successfully, else False.
"""
try:
from cmdlet.search_store import CMDLET as search_store_cmdlet
args = ["-store", str(store), f"hash:{str(hash_value)}"]
log(f"[add-file] Refresh: search-store -store {store} \"hash:{hash_value}\"", file=sys.stderr)
# Run search-store under a temporary stage context so its ctx.emit() calls
# don't interfere with the outer add-file pipeline stage.
prev_ctx = ctx.get_stage_context()
temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, worker_id=getattr(prev_ctx, "worker_id", None))
ctx.set_stage_context(temp_ctx)
try:
code = search_store_cmdlet.run(None, args, config)
finally:
ctx.set_stage_context(prev_ctx)
if code != 0:
return False
# Promote the search-store result to a display overlay so the CLI prints it
# for action commands like add-file.
stage_ctx = ctx.get_stage_context()
is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
if is_last:
try:
table = ctx.get_last_result_table()
items = ctx.get_last_result_items()
if table is not None and items:
ctx.set_last_result_table_overlay(table, items, subject={"store": store, "hash": hash_value})
except Exception:
pass
return True
except Exception as exc:
debug(f"[add-file] Failed to run search-store after add-file: {type(exc).__name__}: {exc}")
return False
@staticmethod
def _prepare_metadata(
result: Any,
@@ -664,8 +870,9 @@ class Add_File(Cmdlet):
if not username or not filename:
debug(f"[add-file] ERROR: Could not extract soulseek metadata from result (type={type(result).__name__})")
if hasattr(result, "extra"):
debug(f"[add-file] Result extra keys: {list(result.extra.keys())}")
extra_val = getattr(result, "extra", None)
if isinstance(extra_val, dict):
debug(f"[add-file] Result extra keys: {list(extra_val.keys())}")
return None
if not username or not filename:
@@ -769,28 +976,55 @@ class Add_File(Cmdlet):
@staticmethod
def _handle_storage_backend(
result: Any,
media_path: Path,
backend_name: str,
pipe_obj: models.PipeObject,
config: Dict[str, Any],
delete_after: bool,
*,
collect_payloads: Optional[List[Dict[str, Any]]] = None,
suppress_last_stage_overlay: bool = False,
auto_search_store: bool = True,
) -> int:
"""Handle uploading to a registered storage backend (e.g., 'test' folder store, 'hydrus', etc.)."""
log(f"Adding file to storage backend '{backend_name}': {media_path.name}", file=sys.stderr)
delete_after_effective = bool(delete_after)
if not delete_after_effective:
# When download-media is piped into add-file, the downloaded artifact is a temp file.
# After it is persisted to a storage backend, delete the temp copy to avoid duplicates.
try:
if (
str(backend_name or "").strip().lower() != "temp"
and getattr(pipe_obj, "is_temp", False)
and getattr(pipe_obj, "action", None) == "cmdlet:download-media"
):
from config import resolve_output_dir
temp_dir = resolve_output_dir(config)
try:
if media_path.resolve().is_relative_to(temp_dir.expanduser().resolve()):
delete_after_effective = True
debug(f"[add-file] Auto-delete temp source after ingest: {media_path}")
except Exception:
# If path resolution fails, fall back to non-destructive behavior
pass
except Exception:
pass
try:
store = Store(config)
backend = store[backend_name]
# Prepare metadata from pipe_obj and sidecars
tags, url, title, f_hash = Add_File._prepare_metadata(None, media_path, pipe_obj, config)
tags, url, title, f_hash = Add_File._prepare_metadata(result, media_path, pipe_obj, config)
# Call backend's add_file with full metadata
# Backend returns hash as identifier
file_identifier = backend.add_file(
media_path,
title=title,
tags=tags,
tag=tags,
url=url
)
log(f"✓ File added to '{backend_name}': {file_identifier}", file=sys.stderr)
@@ -822,6 +1056,14 @@ class Add_File(Cmdlet):
# Keep hash/store for downstream commands (get-tag, get-file, etc.).
resolved_hash = file_identifier if len(file_identifier) == 64 else (f_hash or file_identifier or "unknown")
# If we have url(s), ensure they get associated with the destination file.
# This mirrors `add-url` behavior but avoids emitting extra pipeline noise.
if url:
try:
backend.add_url(resolved_hash, list(url))
except Exception:
pass
meta: Dict[str, Any] = {}
try:
meta = backend.get_metadata(resolved_hash) or {}
@@ -865,9 +1107,30 @@ class Add_File(Cmdlet):
"tag": list(tags or []),
"url": list(url or []),
}
Add_File._emit_storage_result(payload)
if collect_payloads is not None:
try:
collect_payloads.append(payload)
except Exception:
pass
# Keep the add-file 1-row summary overlay (when last stage), then emit the
# canonical search-store payload/table for piping/selection consistency.
if auto_search_store and resolved_hash and resolved_hash != "unknown":
# Show the add-file summary (overlay only) but let search-store provide the downstream payload.
Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=False)
ok = Add_File._try_emit_search_store_by_hash(
store=backend_name,
hash_value=resolved_hash,
config=config,
)
if not ok:
# Fall back to emitting the add-file payload so downstream stages still receive an item.
ctx.emit(payload)
else:
Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=True)
Add_File._cleanup_after_success(media_path, delete_source=delete_after)
Add_File._cleanup_after_success(media_path, delete_source=delete_after_effective)
return 0
except Exception as exc:

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
from typing import Any, Dict, Sequence
import sys
from . import register
import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from SYS.logger import log
@@ -12,19 +11,24 @@ from Store import Store
class Add_Url(Cmdlet):
"""Add URL associations to files via hash+store."""
NAME = "add-url"
SUMMARY = "Associate a URL with a file"
USAGE = "@1 | add-url <url>"
ARGS = [
SharedArgs.HASH,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to associate"),
]
DETAIL = [
"- Associates URL with file identified by hash+store",
"- Multiple url can be comma-separated",
]
def __init__(self) -> None:
super().__init__(
name="add-url",
summary="Associate a URL with a file",
usage="@1 | add-url <url>",
arg=[
SharedArgs.HASH,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to associate"),
],
detail=[
"- Associates URL with file identified by hash+store",
"- Multiple url can be comma-separated",
],
exec=self.run,
)
self.register()
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Add URL to file via hash+store backend."""
@@ -78,8 +82,7 @@ class Add_Url(Cmdlet):
return 1
# Register cmdlet
register(["add-url", "add_url"])(Add_Url)
CMDLET = Add_Url()

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
from typing import Any, Dict, Sequence
import sys
from . import register
import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from SYS.logger import log
@@ -12,19 +11,24 @@ from Store import Store
class Delete_Url(Cmdlet):
"""Delete URL associations from files via hash+store."""
NAME = "delete-url"
SUMMARY = "Remove a URL association from a file"
USAGE = "@1 | delete-url <url>"
ARGS = [
SharedArgs.HASH,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to remove"),
]
DETAIL = [
"- Removes URL association from file identified by hash+store",
"- Multiple url can be comma-separated",
]
def __init__(self) -> None:
super().__init__(
name="delete-url",
summary="Remove a URL association from a file",
usage="@1 | delete-url <url>",
arg=[
SharedArgs.HASH,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to remove"),
],
detail=[
"- Removes URL association from file identified by hash+store",
"- Multiple url can be comma-separated",
],
exec=self.run,
)
self.register()
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Delete URL from file via hash+store backend."""
@@ -78,5 +82,4 @@ class Delete_Url(Cmdlet):
return 1
# Register cmdlet
register(["delete-url", "del-url", "delete_url"])(Delete_Url)
CMDLET = Delete_Url()

View File

@@ -190,9 +190,11 @@ class Download_File(Cmdlet):
# If this looks like a provider item and providers are available, prefer provider.download()
downloaded_path: Optional[Path] = None
attempted_provider_download = False
if table and get_search_provider and SearchResult:
provider = get_search_provider(str(table), config)
if provider is not None:
attempted_provider_download = True
sr = SearchResult(
table=str(table),
title=str(title or "Unknown"),
@@ -202,6 +204,19 @@ class Download_File(Cmdlet):
debug(f"[download-file] Downloading provider item via {table}: {sr.title}")
downloaded_path = provider.download(sr, final_output_dir)
# OpenLibrary: if provider download failed, do NOT try to download the OpenLibrary page HTML.
if downloaded_path is None and attempted_provider_download and str(table or "").lower() == "openlibrary":
availability = None
reason = None
if isinstance(full_metadata, dict):
availability = full_metadata.get("availability")
reason = full_metadata.get("availability_reason")
msg = "[download-file] OpenLibrary item not downloadable"
if availability or reason:
msg += f" (availability={availability or ''} reason={reason or ''})"
log(msg, file=sys.stderr)
continue
# Fallback: if we have a direct HTTP URL, download it directly
if downloaded_path is None and isinstance(target, str) and target.startswith("http"):
debug(f"[download-file] Provider item looks like direct URL, downloading: {target}")

View File

@@ -693,6 +693,7 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
return
# Extract relevant fields
webpage_url = info.get("webpage_url") or info.get("original_url") or info.get("url")
result_container[0] = {
"extractor": info.get("extractor", ""),
"title": info.get("title", ""),
@@ -700,7 +701,9 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
"duration": info.get("duration"),
"uploader": info.get("uploader"),
"description": info.get("description"),
"url": url,
# Keep both the requested and canonical URL forms; callers should prefer webpage_url.
"requested_url": url,
"webpage_url": webpage_url,
}
except Exception as exc:
log(f"Probe error for {url}: {exc}")
@@ -1220,9 +1223,359 @@ class Download_Media(Cmdlet):
log(f"Invalid clip format: {clip_spec}", file=sys.stderr)
return 1
quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False
storage = None
hydrus_available = True
try:
from Store import Store
storage = Store(config=config or {}, suppress_debug=True)
from API.HydrusNetwork import is_hydrus_available
hydrus_available = bool(is_hydrus_available(config or {}))
except Exception:
storage = None
def _preflight_url_duplicate(candidate_url: str, extra_urls: Optional[Sequence[str]] = None) -> bool:
# NOTE: download-media sets _quiet_background_output=True when running in a pipeline to
# reduce background noise. URL de-dup is interactive and must still run in pipelines.
if storage is None:
debug("Preflight URL check skipped: storage unavailable")
return True
debug(f"Preflight URL check: candidate={candidate_url}")
try:
from metadata import normalize_urls
except Exception:
normalize_urls = None # type: ignore[assignment]
needles: List[str] = []
if normalize_urls is not None:
for raw in [candidate_url, *(list(extra_urls) if extra_urls else [])]:
try:
needles.extend(normalize_urls(raw))
except Exception:
continue
# Fallback: always have at least one needle
if not needles:
needles = [str(candidate_url)]
# Deduplicate needles (preserve order)
seen_needles: List[str] = []
for needle in needles:
if needle and needle not in seen_needles:
seen_needles.append(needle)
needles = seen_needles
try:
debug(f"Preflight URL needles: {needles}")
except Exception:
pass
url_matches: List[Dict[str, Any]] = []
try:
from Store.HydrusNetwork import HydrusNetwork
# Avoid searching the temp/download directory backend during dedup.
# We only want to warn about duplicates in real stores.
backend_names_all = storage.list_searchable_backends()
backend_names: List[str] = []
skipped: List[str] = []
for backend_name in backend_names_all:
try:
backend = storage[backend_name]
except Exception:
continue
try:
if str(backend_name).strip().lower() == "temp":
skipped.append(backend_name)
continue
except Exception:
pass
# Heuristic: if a Folder backend points at the configured temp output dir, skip it.
try:
backend_location = getattr(backend, "_location", None)
if backend_location and final_output_dir:
backend_path = Path(str(backend_location)).expanduser().resolve()
temp_path = Path(str(final_output_dir)).expanduser().resolve()
if backend_path == temp_path:
skipped.append(backend_name)
continue
except Exception:
pass
backend_names.append(backend_name)
try:
if skipped:
debug(f"Preflight backends: {backend_names} (skipped temp: {skipped})")
else:
debug(f"Preflight backends: {backend_names}")
except Exception:
pass
for backend_name in backend_names:
backend = storage[backend_name]
if isinstance(backend, HydrusNetwork) and not hydrus_available:
continue
backend_hits: List[Dict[str, Any]] = []
for needle in needles:
try:
backend_hits = backend.search(f"url:{needle}", limit=25) or []
if backend_hits:
break
except Exception:
continue
if backend_hits:
url_matches.extend([dict(x) if isinstance(x, dict) else {"title": str(x)} for x in backend_hits])
if len(url_matches) >= 25:
url_matches = url_matches[:25]
break
except Exception:
url_matches = []
if not url_matches:
debug("Preflight URL check: no matches")
return True
table = ResultTable(f"URL already exists ({len(url_matches)} match(es))")
results_list: List[Dict[str, Any]] = []
for item in url_matches:
if "title" not in item:
item["title"] = item.get("name") or item.get("target") or item.get("path") or "Result"
table.add_result(item)
results_list.append(item)
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table(table, results_list)
print(f"\n{table}")
response = input("Continue anyway? (y/n): ").strip().lower()
if response not in {"y", "yes"}:
return False
return True
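# Illustrative: for a candidate such as "https://youtu.be/abc123", each surviving backend is
# probed with the same url: query syntax that search-store documents elsewhere in this commit:
#   backend.search("url:https://youtu.be/abc123", limit=25)
# The exact needle strings depend on metadata.normalize_urls, which is not shown here.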
def _canonicalize_url_for_storage(requested_url: str) -> str:
# Prefer yt-dlp's canonical webpage URL (e.g. strips timestamps/redirects).
# Fall back to the requested URL if probing fails.
# Important: when playlist item selection is used, avoid probing (can hang on large playlists).
if playlist_items:
return str(requested_url)
try:
pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15)
if isinstance(pr, dict):
for key in ("webpage_url", "original_url", "url", "requested_url"):
value = pr.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
except Exception:
pass
return str(requested_url)
# Check if we need to show format selection
playlist_items = str(parsed.get("item")) if parsed.get("item") else None
ytdl_format = parsed.get("format")
playlist_selection_handled = False
def _parse_at_selection(choice: str, *, max_index: int) -> Optional[List[int]]:
"""Parse @ selection syntax (@2, @2-5, @{1,3,5}, @2,5,7) into 1-based indices."""
raw = str(choice or "").strip()
if not raw:
return None
if raw.lower() in {"q", "quit", "cancel"}:
return None
if raw == "@*" or raw == "*":
return list(range(1, max_index + 1))
if raw.startswith("@"):
raw = raw[1:].strip()
if raw.startswith("{") and raw.endswith("}"):
raw = raw[1:-1].strip()
if not raw:
return None
indices: set[int] = set()
for part in raw.split(","):
part = part.strip()
if not part:
continue
if "-" in part:
left, right = [p.strip() for p in part.split("-", 1)]
if not left or not right:
return None
try:
start = int(left)
end = int(right)
except ValueError:
return None
if start < 1 or end < 1:
return None
if end < start:
start, end = end, start
for i in range(start, end + 1):
if 1 <= i <= max_index:
indices.add(i)
else:
try:
i = int(part)
except ValueError:
return None
if 1 <= i <= max_index:
indices.add(i)
if not indices:
return None
return sorted(indices)
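# Worked examples of the @ selection grammar above (1-based, clamped to max_index):
#   "@3"         -> [3]
#   "@2-5,8"     -> [2, 3, 4, 5, 8]
#   "@{1,3,5}"   -> [1, 3, 5]
#   "@*" / "*"   -> [1, 2, ..., max_index]
#   "q" / ""     -> None (treated as cancel by the caller)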
def _maybe_prompt_playlist_items(url: str) -> Optional[Dict[str, Any]]:
"""If URL appears to be a playlist/channel/collection, prompt user for @ selection.
Returns:
- None if URL is not a playlist-like multi-entry page (or probe fails)
- Dict with keys:
- cancel: bool
- playlist_items: Optional[str] (None means download all)
- selected_urls: Optional[List[str]] (expanded per-entry urls when available)
"""
try:
pr = probe_url(url, no_playlist=False, timeout_seconds=15)
except Exception:
pr = None
if not isinstance(pr, dict):
return None
entries = pr.get("entries")
if not isinstance(entries, list) or len(entries) <= 1:
return None
# Display table (limit rows to keep output reasonable)
max_rows = 200
display_entries = entries[:max_rows]
total = len(entries)
def _entry_to_url(entry: Any) -> Optional[str]:
if not isinstance(entry, dict):
return None
# Prefer explicit absolute URLs when present
for key in ("webpage_url", "original_url", "url"):
v = entry.get(key)
if isinstance(v, str) and v.strip():
s = v.strip()
try:
if urlparse(s).scheme in {"http", "https"}:
return s
except Exception:
return s
# Best-effort YouTube fallback from id
entry_id = entry.get("id")
if isinstance(entry_id, str) and entry_id.strip():
extractor_name = str(pr.get("extractor") or pr.get("extractor_key") or "").lower()
if "youtube" in extractor_name:
return f"https://www.youtube.com/watch?v={entry_id.strip()}"
return None
table = ResultTable()
table.title = f"Playlist items ({total}{' shown ' + str(len(display_entries)) if total > max_rows else ''})"
table.set_source_command("download-media", [url])
try:
table.set_preserve_order(True)
except Exception:
pass
results_list: List[Dict[str, Any]] = []
for idx, entry in enumerate(display_entries, 1):
title = None
uploader = None
duration = None
try:
if isinstance(entry, dict):
title = entry.get("title")
uploader = entry.get("uploader") or pr.get("uploader")
duration = entry.get("duration")
except Exception:
pass
row: Dict[str, Any] = {
"table": "download-media",
"title": str(title or f"Item {idx}"),
"detail": str(uploader or ""),
"media_kind": "playlist-item",
"playlist_index": idx,
"columns": [
("#", str(idx)),
("Title", str(title or "")),
("Duration", str(duration or "")),
("Uploader", str(uploader or "")),
],
}
results_list.append(row)
table.add_result(row)
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table(table, results_list)
print(f"\n{table}")
choice = input("Select items to download (@N, @2-5, @{1,3}, @*, or 'q' to cancel): ").strip()
if not choice or choice.lower() in {"q", "quit", "cancel"}:
return {"cancel": True, "playlist_items": None, "selected_urls": []}
if choice.strip() == "@*" or choice.strip() == "*":
# @* means all entries, not just displayed rows.
selected_urls: List[str] = []
for entry in entries:
u = _entry_to_url(entry)
if u and u not in selected_urls:
selected_urls.append(u)
# Only expand when we can derive URLs for all entries; otherwise fall back to yt-dlp playlist handling.
if len(selected_urls) == len(entries):
return {"cancel": False, "playlist_items": None, "selected_urls": selected_urls}
return {"cancel": False, "playlist_items": None, "selected_urls": []}
parsed_indices = _parse_at_selection(choice, max_index=len(display_entries))
if not parsed_indices:
log("Invalid selection. Use @N, @2-5, @{1,3}, or @*", file=sys.stderr)
return {"cancel": True, "playlist_items": None, "selected_urls": []}
selected_urls: List[str] = []
for i in parsed_indices:
try:
entry = display_entries[i - 1]
except Exception:
continue
u = _entry_to_url(entry)
if u and u not in selected_urls:
selected_urls.append(u)
# If we can expand per-entry URLs, return them.
if selected_urls and len(selected_urls) == len(parsed_indices):
return {"cancel": False, "playlist_items": None, "selected_urls": selected_urls}
# yt-dlp accepts comma-separated 1-based indices for playlist_items
return {"cancel": False, "playlist_items": ",".join(str(i) for i in parsed_indices), "selected_urls": []}
# Playlist/multi-entry detection: if the URL has multiple items and the user didn't
# specify -item, prompt for @ selection (supports @* for all).
if len(supported_url) == 1 and not playlist_items and not ytdl_format:
candidate_url = supported_url[0]
selection_info = _maybe_prompt_playlist_items(candidate_url)
if selection_info is not None:
playlist_selection_handled = True
if bool(selection_info.get("cancel")):
return 0
selected_urls = selection_info.get("selected_urls")
if isinstance(selected_urls, list) and selected_urls:
# Expand playlist/channel URL into per-entry URLs so that de-dup preflight
# and downloads operate per file.
supported_url = selected_urls
playlist_items = None
else:
playlist_items = selection_info.get("playlist_items")
# If no -item, no explicit -format specified, and single URL, show the format table.
# Do NOT stop to show formats when -audio is used (auto-pick) or when -clip is used.
@@ -1232,8 +1585,15 @@ class Download_Media(Cmdlet):
and not playlist_items
and not ytdl_format
and len(supported_url) == 1
and not playlist_selection_handled
):
url = supported_url[0]
canonical_url = _canonicalize_url_for_storage(url)
if not _preflight_url_duplicate(canonical_url, extra_urls=[url]):
log(f"Skipping download: {url}", file=sys.stderr)
return 0
formats = list_formats(url, no_playlist=False)
if formats and len(formats) > 1:
@@ -1379,12 +1739,18 @@ class Download_Media(Cmdlet):
# Download each URL
downloaded_count = 0
clip_sections_spec = self._build_clip_sections_spec(clip_range)
quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False
for url in supported_url:
try:
debug(f"Processing: {url}")
canonical_url = _canonicalize_url_for_storage(url)
# Preflight: warn if URL already exists in storage backends.
if not _preflight_url_duplicate(canonical_url, extra_urls=[url]):
log(f"Skipping download: {url}", file=sys.stderr)
continue
# If playlist_items is specified but looks like a format ID (e.g. from table selection),
# treat it as a format selector instead of playlist items.
# This handles the case where @N selection passes -item <format_id>
@@ -1532,24 +1898,17 @@ class Download_Media(Cmdlet):
if title and f"title:{title}" not in tag:
tag.insert(0, f"title:{title}")
# Build a single canonical URL field; prefer yt-dlp provided webpage_url or info.url,
# but fall back to the original requested URL. If multiple unique urls are available,
# join them into a comma-separated string.
urls_to_consider: List[str] = []
# Store the canonical URL for de-dup/search purposes.
# Prefer yt-dlp's webpage_url, and do not mix in the raw requested URL (which may contain timestamps).
final_url = None
try:
page_url = info.get("webpage_url") or info.get("url")
page_url = info.get("webpage_url") or info.get("original_url") or info.get("url")
if page_url:
urls_to_consider.append(str(page_url))
final_url = str(page_url)
except Exception:
pass
if url:
urls_to_consider.append(str(url))
seen_urls: List[str] = []
for u in urls_to_consider:
if u and u not in seen_urls:
seen_urls.append(u)
final_url = ",".join(seen_urls) if seen_urls else None
final_url = None
if not final_url and url:
final_url = str(url)
# Construct canonical PipeObject dict: hash, store, path, url, title, tags
# Prefer explicit backend names (storage_name/storage_location). If none, default to PATH
@@ -1561,6 +1920,7 @@ class Download_Media(Cmdlet):
"url": final_url,
"tag": tag,
"action": "cmdlet:download-media",
"is_temp": True,
# download_mode removed (deprecated), keep media_kind
"store": getattr(opts, "storage_name", None) or getattr(opts, "storage_location", None) or "PATH",
"media_kind": "video" if opts.mode == "video" else "audio",

View File

@@ -184,6 +184,32 @@ class Get_Metadata(Cmdlet):
mime_type = metadata.get("mime") or metadata.get("ext", "")
file_size = metadata.get("size")
duration_seconds = metadata.get("duration")
if duration_seconds is None:
duration_seconds = metadata.get("duration_seconds")
if duration_seconds is None:
duration_seconds = metadata.get("length")
if duration_seconds is None and isinstance(metadata.get("duration_ms"), (int, float)):
try:
duration_seconds = float(metadata["duration_ms"]) / 1000.0
except Exception:
duration_seconds = None
if isinstance(duration_seconds, str):
s = duration_seconds.strip()
if s:
try:
duration_seconds = float(s)
except ValueError:
if ":" in s:
parts = [p.strip() for p in s.split(":") if p.strip()]
if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
nums = [int(p) for p in parts]
if len(nums) == 2:
duration_seconds = float(nums[0] * 60 + nums[1])
else:
duration_seconds = float(nums[0] * 3600 + nums[1] * 60 + nums[2])
else:
duration_seconds = None
pages = metadata.get("pages")
url = metadata.get("url") or []
imported_ts = self._extract_imported_ts(metadata)
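Worked values for the duration fallbacks added above:

# {"duration": 90}          -> 90
# {"duration_seconds": 90}  -> 90      (used when "duration" is absent)
# {"duration_ms": 90500}    -> 90.5
# {"duration": "1:30"}      -> 90.0    (MM:SS)
# {"duration": "n/a"}       -> None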

View File

@@ -12,7 +12,13 @@ from __future__ import annotations
import sys
from SYS.logger import log, debug
try:
from Provider.openlibrary import OpenLibrary
_ol_scrape_isbn_metadata = OpenLibrary.scrape_isbn_metadata
_ol_scrape_openlibrary_metadata = OpenLibrary.scrape_openlibrary_metadata
except Exception:
_ol_scrape_isbn_metadata = None # type: ignore[assignment]
_ol_scrape_openlibrary_metadata = None # type: ignore[assignment]
from Provider.metadata_provider import get_metadata_provider, list_metadata_providers
import subprocess
from pathlib import Path
@@ -31,6 +37,10 @@ except ImportError:
extract_title = None
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]
@@ -691,249 +701,22 @@ def _extract_url_formats(formats: list) -> List[Tuple[str, str]]:
def _scrape_isbn_metadata(isbn: str) -> List[str]:
"""Scrape metadata for an ISBN using Open Library API."""
new_tags = []
if _ol_scrape_isbn_metadata is None:
log("OpenLibrary scraper unavailable", file=sys.stderr)
return []
try:
from ..API.HTTP import HTTPClient
import json as json_module
isbn_clean = isbn.replace('-', '').strip()
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode('utf-8'))
except Exception as e:
log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
return []
if not data:
log(f"No ISBN metadata found for: {isbn}")
return []
book_data = next(iter(data.values()), None)
if not book_data:
return []
if 'title' in book_data:
new_tags.append(f"title:{book_data['title']}")
if 'authors' in book_data and isinstance(book_data['authors'], list):
for author in book_data['authors'][:3]:
if 'name' in author:
new_tags.append(f"author:{author['name']}")
if 'publish_date' in book_data:
new_tags.append(f"publish_date:{book_data['publish_date']}")
if 'publishers' in book_data and isinstance(book_data['publishers'], list):
for pub in book_data['publishers'][:1]:
if 'name' in pub:
new_tags.append(f"publisher:{pub['name']}")
if 'description' in book_data:
desc = book_data['description']
if isinstance(desc, dict) and 'value' in desc:
desc = desc['value']
if desc:
desc_str = str(desc).strip()
# Include description if available (limit to 200 chars to keep it manageable)
if len(desc_str) > 0:
new_tags.append(f"description:{desc_str[:200]}")
if 'number_of_pages' in book_data:
page_count = book_data['number_of_pages']
if page_count and isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict):
identifiers = book_data['identifiers']
if 'openlibrary' in identifiers:
ol_ids = identifiers['openlibrary']
if isinstance(ol_ids, list) and ol_ids:
new_tags.append(f"openlibrary:{ol_ids[0]}")
elif isinstance(ol_ids, str):
new_tags.append(f"openlibrary:{ol_ids}")
if 'lccn' in identifiers:
lccn_list = identifiers['lccn']
if isinstance(lccn_list, list) and lccn_list:
new_tags.append(f"lccn:{lccn_list[0]}")
elif isinstance(lccn_list, str):
new_tags.append(f"lccn:{lccn_list}")
if 'oclc' in identifiers:
oclc_list = identifiers['oclc']
if isinstance(oclc_list, list) and oclc_list:
new_tags.append(f"oclc:{oclc_list[0]}")
elif isinstance(oclc_list, str):
new_tags.append(f"oclc:{oclc_list}")
if 'goodreads' in identifiers:
goodreads_list = identifiers['goodreads']
if isinstance(goodreads_list, list) and goodreads_list:
new_tags.append(f"goodreads:{goodreads_list[0]}")
elif isinstance(goodreads_list, str):
new_tags.append(f"goodreads:{goodreads_list}")
if 'librarything' in identifiers:
lt_list = identifiers['librarything']
if isinstance(lt_list, list) and lt_list:
new_tags.append(f"librarything:{lt_list[0]}")
elif isinstance(lt_list, str):
new_tags.append(f"librarything:{lt_list}")
if 'doi' in identifiers:
doi_list = identifiers['doi']
if isinstance(doi_list, list) and doi_list:
new_tags.append(f"doi:{doi_list[0]}")
elif isinstance(doi_list, str):
new_tags.append(f"doi:{doi_list}")
if 'internet_archive' in identifiers:
ia_list = identifiers['internet_archive']
if isinstance(ia_list, list) and ia_list:
new_tags.append(f"internet_archive:{ia_list[0]}")
elif isinstance(ia_list, str):
new_tags.append(f"internet_archive:{ia_list}")
log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
return new_tags
return list(_ol_scrape_isbn_metadata(isbn))
except Exception as e:
log(f"ISBN scraping error: {e}", file=sys.stderr)
return []
def _scrape_openlibrary_metadata(olid: str) -> List[str]:
"""Scrape metadata for an OpenLibrary ID using the .json API endpoint.
Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
- Title, authors, publish date, publishers
- Description
- Subjects as freeform tags (without namespace prefix)
- Identifiers (ISBN, LCCN, OCLC, etc.)
"""
new_tags = []
if _ol_scrape_openlibrary_metadata is None:
log("OpenLibrary scraper unavailable", file=sys.stderr)
return []
try:
from ..API.HTTP import HTTPClient
import json as json_module
# Format: OL9674499M or just 9674499M
olid_clean = olid.replace('OL', '').replace('M', '')
if not olid_clean.isdigit():
olid_clean = olid
# Ensure we have the full OLID format for the URL
if not olid.startswith('OL'):
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
else:
url = f"https://openlibrary.org/books/{olid}.json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode('utf-8'))
except Exception as e:
log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
return []
if not data:
log(f"No OpenLibrary metadata found for: {olid}")
return []
# Add title
if 'title' in data:
new_tags.append(f"title:{data['title']}")
# Add authors
if 'authors' in data and isinstance(data['authors'], list):
for author in data['authors'][:3]:
if isinstance(author, dict) and 'name' in author:
new_tags.append(f"author:{author['name']}")
elif isinstance(author, str):
new_tags.append(f"author:{author}")
# Add publish date
if 'publish_date' in data:
new_tags.append(f"publish_date:{data['publish_date']}")
# Add publishers
if 'publishers' in data and isinstance(data['publishers'], list):
for pub in data['publishers'][:1]:
if isinstance(pub, dict) and 'name' in pub:
new_tags.append(f"publisher:{pub['name']}")
elif isinstance(pub, str):
new_tags.append(f"publisher:{pub}")
# Add description
if 'description' in data:
desc = data['description']
if isinstance(desc, dict) and 'value' in desc:
desc = desc['value']
if desc:
desc_str = str(desc).strip()
if len(desc_str) > 0:
new_tags.append(f"description:{desc_str[:200]}")
# Add number of pages
if 'number_of_pages' in data:
page_count = data['number_of_pages']
if page_count and isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
# Add subjects as FREEFORM tags (no namespace prefix)
if 'subjects' in data and isinstance(data['subjects'], list):
for subject in data['subjects'][:10]:
if subject and isinstance(subject, str):
subject_clean = str(subject).strip()
if subject_clean and subject_clean not in new_tags:
new_tags.append(subject_clean)
# Add identifiers
if 'identifiers' in data and isinstance(data['identifiers'], dict):
identifiers = data['identifiers']
if 'isbn_10' in identifiers:
isbn_10_list = identifiers['isbn_10']
if isinstance(isbn_10_list, list) and isbn_10_list:
new_tags.append(f"isbn_10:{isbn_10_list[0]}")
elif isinstance(isbn_10_list, str):
new_tags.append(f"isbn_10:{isbn_10_list}")
if 'isbn_13' in identifiers:
isbn_13_list = identifiers['isbn_13']
if isinstance(isbn_13_list, list) and isbn_13_list:
new_tags.append(f"isbn_13:{isbn_13_list[0]}")
elif isinstance(isbn_13_list, str):
new_tags.append(f"isbn_13:{isbn_13_list}")
if 'lccn' in identifiers:
lccn_list = identifiers['lccn']
if isinstance(lccn_list, list) and lccn_list:
new_tags.append(f"lccn:{lccn_list[0]}")
elif isinstance(lccn_list, str):
new_tags.append(f"lccn:{lccn_list}")
if 'oclc_numbers' in identifiers:
oclc_list = identifiers['oclc_numbers']
if isinstance(oclc_list, list) and oclc_list:
new_tags.append(f"oclc:{oclc_list[0]}")
elif isinstance(oclc_list, str):
new_tags.append(f"oclc:{oclc_list}")
if 'goodreads' in identifiers:
goodreads_list = identifiers['goodreads']
if isinstance(goodreads_list, list) and goodreads_list:
new_tags.append(f"goodreads:{goodreads_list[0]}")
elif isinstance(goodreads_list, str):
new_tags.append(f"goodreads:{goodreads_list}")
log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
return new_tags
return list(_ol_scrape_openlibrary_metadata(olid))
except Exception as e:
log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
return []
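For reference, the scraper returns namespaced tag strings in the same form used elsewhere in this commit; a hypothetical example of its output:

#   ["title:The Hobbit", "author:J. R. R. Tolkien", "publish_date:1937", "pages:310",
#    "isbn_13:9780547928227", "Fantasy"]   # subjects are kept as freeform tags (no namespace)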

View File

@@ -1,28 +1,40 @@
from __future__ import annotations
from typing import Any, Dict, Sequence
from dataclasses import dataclass
from typing import Any, Dict, List, Sequence
import sys
from . import register
import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from ._shared import Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from SYS.logger import log
from Store import Store
@dataclass
class UrlItem:
url: str
hash: str
store: str
class Get_Url(Cmdlet):
"""Get url associated with files via hash+store."""
NAME = "get-url"
SUMMARY = "List url associated with a file"
USAGE = "@1 | get-url"
ARGS = [
SharedArgs.HASH,
SharedArgs.STORE,
]
DETAIL = [
"- Lists all url associated with file identified by hash+store",
]
def __init__(self) -> None:
super().__init__(
name="get-url",
summary="List url associated with a file",
usage="@1 | get-url",
arg=[
SharedArgs.HASH,
SharedArgs.STORE,
],
detail=[
"- Lists all url associated with file identified by hash+store",
],
exec=self.run,
)
self.register()
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Get url for file via hash+store backend."""
@@ -53,18 +65,34 @@ class Get_Url(Cmdlet):
urls = backend.get_url(file_hash)
if urls:
for u in urls:
# Emit rich object for pipeline compatibility
ctx.emit({
"url": u,
"hash": file_hash,
"store": store_name,
})
return 0
else:
ctx.emit("No url found")
return 0
from result_table import ResultTable
title = str(get_field(result, "title") or "").strip()
table_title = "Title"
if title:
table_title = f"Title: {title}"
table = ResultTable(table_title, max_columns=1).set_preserve_order(True)
table.set_source_command("get-url", [])
items: List[UrlItem] = []
for u in list(urls or []):
u = str(u or "").strip()
if not u:
continue
row = table.add_row()
row.add_column("Url", u)
item = UrlItem(url=u, hash=file_hash, store=str(store_name))
items.append(item)
ctx.emit(item)
# Make this a real result table so @.. / @,, can navigate it
ctx.set_last_result_table(table if items else None, items, subject=result)
if not items:
log("No url found", file=sys.stderr)
return 0
except KeyError:
log(f"Error: Storage backend '{store_name}' not configured")
@@ -74,7 +102,6 @@ class Get_Url(Cmdlet):
return 1
# Register cmdlet
register(["get-url", "get_url"])(Get_Url)
CMDLET = Get_Url()

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
from typing import Any, Dict, Sequence, List, Optional, Tuple
from pathlib import Path
from dataclasses import dataclass, field
from collections import OrderedDict
import re
import json
@@ -11,57 +10,9 @@ import sys
from SYS.logger import log, debug
from ._shared import Cmdlet, CmdletArg, get_field, should_show_help
from ._shared import Cmdlet, CmdletArg, get_field, should_show_help, normalize_hash, first_title_tag
import pipeline as ctx
# Optional dependencies
try:
import mutagen # type: ignore
except ImportError: # pragma: no cover
mutagen = None # type: ignore
try:
from config import get_hydrus_url, resolve_output_dir
except Exception: # pragma: no cover
get_hydrus_url = None # type: ignore
resolve_output_dir = None # type: ignore
try:
from API.HydrusNetwork import HydrusNetwork, HydrusRequestError
except ImportError: # pragma: no cover
HydrusNetwork = None # type: ignore
HydrusRequestError = RuntimeError # type: ignore
try:
from SYS.utils import sha256_file
except ImportError: # pragma: no cover
sha256_file = None # type: ignore
try:
from SYS.utils_constant import mime_maps
except ImportError: # pragma: no cover
mime_maps = {} # type: ignore
@dataclass(slots=True)
class SearchRecord:
path: str
size_bytes: int | None = None
duration_seconds: str | None = None
tag: str | None = None
hash: str | None = None
def as_dict(self) -> dict[str, str]:
payload: dict[str, str] = {"path": self.path}
if self.size_bytes is not None:
payload["size"] = str(self.size_bytes)
if self.duration_seconds:
payload["duration"] = self.duration_seconds
if self.tag:
payload["tag"] = self.tag
if self.hash:
payload["hash"] = self.hash
return payload
STORAGE_ORIGINS = {"local", "hydrus", "folder"}
@@ -86,12 +37,15 @@ class Search_Store(Cmdlet):
detail=[
"Search across storage backends: Folder stores and Hydrus instances",
"Use -store to search a specific backend by name",
"URL search: url:* (any URL) or url:<value> (URL substring)",
"Filter results by: tag, size, type, duration",
"Results include hash for downstream commands (get-file, add-tag, etc.)",
"Examples:",
"search-store foo # Search all storage backends",
"search-store -store home '*' # Search 'home' Hydrus instance",
"search-store -store test 'video' # Search 'test' folder store",
"search-store 'url:*' # Files that have any URL",
"search-store 'url:youtube.com' # Files whose URL contains substring",
"search-store song -type audio # Search for audio files",
"search-store movie -tag action # Search with tag filter",
],
@@ -100,6 +54,40 @@ class Search_Store(Cmdlet):
self.register()
# --- Helper methods -------------------------------------------------
@staticmethod
def _parse_hash_query(query: str) -> List[str]:
"""Parse a `hash:` query into a list of normalized 64-hex SHA256 hashes.
Supported forms (case-insensitive, comma- or whitespace-separated, optionally braced/bracketed):
- hash:<h1>,<h2>,<h3>
- Hash: <h1> <h2> <h3>
- hashes: [<h1> <h2>]
- hash:{<h1>, <h2>}
"""
q = str(query or "").strip()
if not q:
return []
m = re.match(r"^hash(?:es)?\s*:\s*(.+)$", q, flags=re.IGNORECASE)
if not m:
return []
rest = (m.group(1) or "").strip()
if rest.startswith("{") and rest.endswith("}"):
rest = rest[1:-1].strip()
if rest.startswith("[") and rest.endswith("]"):
rest = rest[1:-1].strip()
# Split on commas and whitespace.
raw_parts = [p.strip() for p in re.split(r"[\s,]+", rest) if p.strip()]
out: List[str] = []
for part in raw_parts:
h = normalize_hash(part)
if not h:
continue
if h not in out:
out.append(h)
return out
@staticmethod
def _normalize_extension(ext_value: Any) -> str:
"""Sanitize extension strings to alphanumerics and cap at 5 chars."""
@@ -150,10 +138,10 @@ class Search_Store(Cmdlet):
# Parse arguments
query = ""
tag_filters: List[str] = []
size_filter: Optional[Tuple[str, int]] = None
duration_filter: Optional[Tuple[str, float]] = None
type_filter: Optional[str] = None
_tag_filters: List[str] = []
_size_filter: Optional[Tuple[str, int]] = None
_duration_filter: Optional[Tuple[str, float]] = None
_type_filter: Optional[str] = None
storage_backend: Optional[str] = None
limit = 100
searched_backends: List[str] = []
@@ -166,7 +154,7 @@ class Search_Store(Cmdlet):
storage_backend = args_list[i + 1]
i += 2
elif low in {"-tag", "--tag"} and i + 1 < len(args_list):
tag_filters.append(args_list[i + 1])
_tag_filters.append(args_list[i + 1])
i += 2
elif low in {"-limit", "--limit"} and i + 1 < len(args_list):
try:
@@ -175,7 +163,7 @@ class Search_Store(Cmdlet):
limit = 100
i += 2
elif low in {"-type", "--type"} and i + 1 < len(args_list):
type_filter = args_list[i + 1].lower()
_type_filter = args_list[i + 1].lower()
i += 2
elif not arg.startswith("-"):
query = f"{query} {arg}".strip() if query else arg
@@ -195,6 +183,8 @@ class Search_Store(Cmdlet):
if store_filter and not storage_backend:
storage_backend = store_filter
hash_query = self._parse_hash_query(query)
if not query:
log("Provide a search query", file=sys.stderr)
return 1
@@ -230,12 +220,136 @@ class Search_Store(Cmdlet):
table_title += f" [{storage_backend}]"
table = ResultTable(table_title)
if hash_query:
try:
table.set_preserve_order(True)
except Exception:
pass
from Store import Store
storage = Store(config=config or {})
from Store._base import Store as BaseStore
backend_to_search = storage_backend or None
if hash_query:
# Explicit hash list search: build rows from backend metadata.
backends_to_try: List[str] = [backend_to_search] if backend_to_search else list(storage.list_backends())
found_any = False
for h in hash_query:
resolved_backend_name: Optional[str] = None
resolved_backend = None
resolved_meta: Optional[Dict[str, Any]] = None
for backend_name in backends_to_try:
try:
backend = storage[backend_name]
except Exception:
continue
try:
# If get_metadata works, consider it a hit; get_file can be optional (e.g. remote URL).
meta = backend.get_metadata(h)
if meta is None:
continue
resolved_backend_name = backend_name
resolved_backend = backend
resolved_meta = meta
break
except Exception:
continue
if resolved_backend_name is None or resolved_backend is None:
continue
found_any = True
searched_backends.append(resolved_backend_name)
# Resolve a path/URL string if possible
path_str: Optional[str] = None
try:
maybe_path = resolved_backend.get_file(h)
if isinstance(maybe_path, Path):
path_str = str(maybe_path)
elif isinstance(maybe_path, str) and maybe_path:
path_str = maybe_path
except Exception:
path_str = None
# Reuse the metadata fetched while resolving the backend instead of querying it a second time.
meta_obj: Dict[str, Any] = resolved_meta if isinstance(resolved_meta, dict) else {}
tags_list: List[str] = []
try:
tag_result = resolved_backend.get_tag(h)
if isinstance(tag_result, tuple) and tag_result:
maybe_tags = tag_result[0]
else:
maybe_tags = tag_result
if isinstance(maybe_tags, list):
tags_list = [str(t).strip() for t in maybe_tags if isinstance(t, str) and str(t).strip()]
except Exception:
tags_list = []
title_from_tag: Optional[str] = None
try:
title_tag = first_title_tag(tags_list)
if title_tag and ":" in title_tag:
title_from_tag = title_tag.split(":", 1)[1].strip()
except Exception:
title_from_tag = None
title = title_from_tag or meta_obj.get("title") or meta_obj.get("name")
if not title and path_str:
try:
title = Path(path_str).stem
except Exception:
title = path_str
ext_val = meta_obj.get("ext") or meta_obj.get("extension")
if not ext_val and path_str:
try:
ext_val = Path(path_str).suffix
except Exception:
ext_val = None
size_bytes = meta_obj.get("size")
if size_bytes is None:
size_bytes = meta_obj.get("size_bytes")
try:
size_bytes_int: Optional[int] = int(size_bytes) if size_bytes is not None else None
except Exception:
size_bytes_int = None
payload: Dict[str, Any] = {
"title": str(title or h),
"hash": h,
"store": resolved_backend_name,
"path": path_str,
"ext": self._normalize_extension(ext_val),
"size_bytes": size_bytes_int,
"tag": tags_list,
}
table.add_result(payload)
results_list.append(payload)
ctx.emit(payload)
if found_any:
ctx.set_last_result_table(table, results_list)
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
db.update_worker_status(worker_id, 'completed')
return 0
log("No results found", file=sys.stderr)
db.append_worker_stdout(worker_id, json.dumps([], indent=2))
db.update_worker_status(worker_id, 'completed')
return 0
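# No explicit hash query beyond this point: fall back to free-text search, either against the
# single backend selected via -store or fanned out across every available backend below.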
if backend_to_search:
searched_backends.append(backend_to_search)
target_backend = storage[backend_to_search]
@@ -243,7 +357,9 @@ class Search_Store(Cmdlet):
log(f"Backend '{backend_to_search}' does not support searching", file=sys.stderr)
db.update_worker_status(worker_id, 'error')
return 1
debug(f"[search-store] Searching '{backend_to_search}'")
results = target_backend.search(query, limit=limit)
debug(f"[search-store] '{backend_to_search}' -> {len(results or [])} result(s)")
else:
from API.HydrusNetwork import is_hydrus_available
hydrus_available = is_hydrus_available(config or {})
@@ -257,7 +373,9 @@ class Search_Store(Cmdlet):
continue
searched_backends.append(backend_name)
debug(f"[search-store] Searching '{backend_name}'")
backend_results = backend.search(query, limit=limit - len(all_results))
debug(f"[search-store] '{backend_name}' -> {len(backend_results or [])} result(s)")
if backend_results:
all_results.extend(backend_results)
if len(all_results) >= limit:
@@ -317,11 +435,6 @@ class Search_Store(Cmdlet):
results_list.append(normalized)
ctx.emit(normalized)
# Debug: Verify table rows match items list
debug(f"[search-store] Added {len(table.rows)} rows to table, {len(results_list)} items to results_list")
if len(table.rows) != len(results_list):
debug(f"[search-store] WARNING: Table/items mismatch! rows={len(table.rows)} items={len(results_list)}", file=sys.stderr)
ctx.set_last_result_table(table, results_list)
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
else: