df

2025-12-27 14:50:59 -08:00
parent 22af776ee2
commit fcdd507d00
12 changed files with 1004 additions and 66 deletions


@@ -373,6 +373,29 @@ class HydrusNetwork:
body = {"hashes": hash_list} body = {"hashes": hash_list}
return self._post("/add_files/undelete_files", data=body) return self._post("/add_files/undelete_files", data=body)
def delete_files(self, hashes: Union[str, Iterable[str]], *, reason: str | None = None) -> dict[str, Any]:
"""Delete files in Hydrus.
Hydrus Client API: POST /add_files/delete_files
Required JSON args: {"hashes": [<sha256 hex>, ...]}
Optional JSON args: {"reason": "..."}
"""
hash_list = self._ensure_hashes(hashes)
body: dict[str, Any] = {"hashes": hash_list}
if isinstance(reason, str) and reason.strip():
body["reason"] = reason.strip()
return self._post("/add_files/delete_files", data=body)
def clear_file_deletion_record(self, hashes: Union[str, Iterable[str]]) -> dict[str, Any]:
"""Clear Hydrus's file deletion record for the provided hashes.
Hydrus Client API: POST /add_files/clear_file_deletion_record
Required JSON args: {"hashes": [<sha256 hex>, ...]}
"""
hash_list = self._ensure_hashes(hashes)
body = {"hashes": hash_list}
return self._post("/add_files/clear_file_deletion_record", data=body)
def add_tag(self, hash: Union[str, Iterable[str]], tags: Iterable[str], service_name: str) -> dict[str, Any]: def add_tag(self, hash: Union[str, Iterable[str]], tags: Iterable[str], service_name: str) -> dict[str, Any]:
hash = self._ensure_hashes(hash) hash = self._ensure_hashes(hash)
body = {"hashes": hash, "service_names_to_tags": {service_name: list(tags)}} body = {"hashes": hash, "service_names_to_tags": {service_name: list(tags)}}

CLI.py

@@ -1012,6 +1012,14 @@ class CmdletExecutor:
         ensure_registry_loaded()

+        # REPL guard: stage-local selection tables should not leak across independent
+        # commands. @ selection can always re-seed from the last result table.
+        try:
+            if hasattr(ctx, "set_current_stage_table"):
+                ctx.set_current_stage_table(None)
+        except Exception:
+            pass
         cmd_fn = REGISTRY.get(cmd_name)
         if not cmd_fn:
             # Lazy-import module and register its CMDLET.
@@ -1451,6 +1459,13 @@ class CmdletExecutor:
             ctx.set_live_progress(None)
         except Exception:
             pass
+        # Do not keep stage tables around after a single command; it can cause
+        # later @ selections to bind to stale tables (e.g. old add-file scans).
+        try:
+            if hasattr(ctx, "set_current_stage_table"):
+                ctx.set_current_stage_table(None)
+        except Exception:
+            pass
         try:
             if hasattr(ctx, "clear_current_cmdlet_name"):
                 ctx.clear_current_cmdlet_name()
@@ -2027,6 +2042,9 @@ class PipelineExecutor:
elif table_type == "bandcamp": elif table_type == "bandcamp":
print("Auto-running Bandcamp selection via download-media") print("Auto-running Bandcamp selection via download-media")
stages.append(["download-media"]) stages.append(["download-media"])
elif table_type == "internetarchive":
print("Auto-loading Internet Archive item via download-data")
stages.append(["download-data"])
elif table_type in {"soulseek", "openlibrary", "libgen"}: elif table_type in {"soulseek", "openlibrary", "libgen"}:
print("Auto-piping selection to download-file") print("Auto-piping selection to download-file")
stages.append(["download-file"]) stages.append(["download-file"])
@@ -2056,6 +2074,16 @@ class PipelineExecutor:
             ):
                 print("Auto-inserting download-media after Bandcamp selection")
                 stages.insert(0, ["download-media"])
+            if table_type == "internetarchive" and first_cmd not in (
+                "download-data",
+                "download_data",
+                "download-file",
+                "download-media",
+                "download_media",
+                ".pipe",
+            ):
+                debug("Auto-inserting download-data after Internet Archive selection")
+                stages.insert(0, ["download-data"])
             if table_type == "libgen" and first_cmd not in (
                 "download-file",
                 "download-media",
@@ -2166,6 +2194,14 @@ class PipelineExecutor:
         try:
             self._try_clear_pipeline_stop(ctx)
+            # REPL guard: stage-local tables should not persist across independent
+            # commands. Selection stages can always seed from last/display tables.
+            try:
+                if hasattr(ctx, "set_current_stage_table"):
+                    ctx.set_current_stage_table(None)
+            except Exception:
+                pass
             # Preflight (URL-duplicate prompts, etc.) should be cached within a single
             # pipeline run, not across independent pipelines.
             try:
@@ -2615,11 +2651,13 @@ class PipelineExecutor:
             if (
                 (not stage_is_last)
                 and (not emits)
-                and cmd_name in {"download-media", "download_media"}
+                and cmd_name in {"download-media", "download_media", "download-data", "download_data"}
                 and stage_table is not None
                 and (
                     stage_table_type in {"ytdlp.formatlist", "download-media", "download_media", "bandcamp", "youtube"}
                     or stage_table_source in {"download-media", "download_media"}
+                    or stage_table_type in {"internetarchive.formats"}
+                    or stage_table_source in {"download-file"}
                 )
             ):
                 try:
@@ -2812,6 +2850,13 @@ class PipelineExecutor:
                 _pipeline_ctx.set_live_progress(None)
             except Exception:
                 pass
+            # End-of-command cleanup: avoid leaking current stage tables into
+            # the next REPL command (causes stale @ selection sources).
+            try:
+                if hasattr(ctx, "set_current_stage_table"):
+                    ctx.set_current_stage_table(None)
+            except Exception:
+                pass
             if pipeline_session:
                 pipeline_session.close(status=pipeline_status, error_msg=pipeline_error)
         except Exception as exc:


@@ -220,7 +220,6 @@ class InternetArchive(Provider):
"mediatype", "mediatype",
"creator", "creator",
"date", "date",
"downloads",
"collection", "collection",
] ]
@@ -243,7 +242,11 @@ class InternetArchive(Provider):
title = str(row.get("title") or identifier).strip() or identifier title = str(row.get("title") or identifier).strip() or identifier
mediatype = str(row.get("mediatype") or "").strip() mediatype = str(row.get("mediatype") or "").strip()
creator = str(row.get("creator") or "").strip() creator_raw = row.get("creator")
if isinstance(creator_raw, list):
creator = ", ".join(str(x) for x in creator_raw if x)
else:
creator = str(creator_raw or "").strip()
date = str(row.get("date") or "").strip() date = str(row.get("date") or "").strip()
annotations: List[str] = [] annotations: List[str] = []
@@ -272,9 +275,10 @@ class InternetArchive(Provider):
             size_bytes=None,
             tag=set(),
             columns=[
-                ("identifier", identifier),
+                ("title", title),
                 ("mediatype", mediatype),
                 ("date", date),
+                ("creator", creator),
             ],
             full_metadata=dict(row),
         )
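Internet Archive returns `creator` either as a single string or as a list; the normalization above flattens both (values here are illustrative):

    row = {"creator": ["Miles Davis", "Gil Evans"]}  # -> creator == "Miles Davis, Gil Evans"
    row = {"creator": "Miles Davis"}                 # -> creator == "Miles Davis"
    row = {"creator": None}                          # -> creator == ""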


@@ -15,6 +15,115 @@ from ProviderCore.base import Provider
 _MATRIX_INIT_CHECK_CACHE: Dict[str, Tuple[bool, Optional[str]]] = {}

+def _sniff_mime_from_header(path: Path) -> Optional[str]:
+    """Best-effort MIME sniffing from file headers.
+
+    Used when the file has no/unknown extension (common for exported/temp files).
+    Keeps dependencies to stdlib only.
+    """
+    try:
+        if not path.exists() or not path.is_file():
+            return None
+        with open(path, "rb") as handle:
+            header = handle.read(512)
+        if not header:
+            return None
+        # Images
+        if header.startswith(b"\xFF\xD8\xFF"):
+            return "image/jpeg"
+        if header.startswith(b"\x89PNG\r\n\x1a\n"):
+            return "image/png"
+        if header.startswith(b"GIF87a") or header.startswith(b"GIF89a"):
+            return "image/gif"
+        if header.startswith(b"BM"):
+            return "image/bmp"
+        if header.startswith(b"RIFF") and len(header) >= 12 and header[8:12] == b"WEBP":
+            return "image/webp"
+        # Audio
+        if header.startswith(b"fLaC"):
+            return "audio/flac"
+        if header.startswith(b"OggS"):
+            # Could be audio or video; treat as audio unless extension suggests video.
+            return "audio/ogg"
+        if header.startswith(b"ID3"):
+            return "audio/mpeg"
+        if len(header) >= 2 and header[0] == 0xFF and (header[1] & 0xE0) == 0xE0:
+            return "audio/mpeg"
+        if header.startswith(b"RIFF") and len(header) >= 12 and header[8:12] == b"WAVE":
+            return "audio/wav"
+        # Video
+        if header.startswith(b"RIFF") and len(header) >= 12 and header[8:12] == b"AVI ":
+            return "video/x-msvideo"
+        if header.startswith(b"\x1A\x45\xDF\xA3"):
+            # EBML container: Matroska/WebM.
+            return "video/x-matroska"
+        if len(header) >= 12 and header[4:8] == b"ftyp":
+            # ISO BMFF: mp4/mov/m4a. Default to mp4; extension can refine.
+            return "video/mp4"
+        # MPEG-TS / M2TS (sync byte every 188 bytes)
+        try:
+            if path.stat().st_size >= 188 * 2 and header[0] == 0x47:
+                with open(path, "rb") as handle:
+                    handle.seek(188)
+                    b = handle.read(1)
+                if b == b"\x47":
+                    return "video/mp2t"
+        except Exception:
+            pass
+        return None
+    except Exception:
+        return None
+
+def _classify_matrix_upload(path: Path, *, explicit_mime_type: Optional[str] = None) -> Tuple[str, str]:
+    """Return (mime_type, msgtype) for Matrix uploads."""
+    mime_type = str(explicit_mime_type or "").strip() or None
+    if not mime_type:
+        # `mimetypes.guess_type` expects a string/URL; Path can return None on some platforms.
+        mime_type, _ = mimetypes.guess_type(str(path))
+    if not mime_type:
+        mime_type = _sniff_mime_from_header(path)
+    # Refinements based on extension for ambiguous containers.
+    ext = path.suffix.lower()
+    if ext in {".m4a", ".aac"}:
+        mime_type = mime_type or "audio/mp4"
+    if ext in {".mkv", ".webm"}:
+        mime_type = mime_type or "video/x-matroska"
+    if ext in {".ogv"}:
+        mime_type = mime_type or "video/ogg"
+    msgtype = "m.file"
+    if mime_type:
+        mt = mime_type.casefold()
+        if mt.startswith("image/"):
+            msgtype = "m.image"
+        elif mt.startswith("audio/"):
+            msgtype = "m.audio"
+        elif mt.startswith("video/"):
+            msgtype = "m.video"
+    # Final fallback for unknown MIME types.
+    if msgtype == "m.file":
+        audio_exts = {".mp3", ".flac", ".wav", ".m4a", ".aac", ".ogg", ".opus", ".wma", ".mka", ".alac"}
+        video_exts = {".mp4", ".mkv", ".webm", ".mov", ".avi", ".flv", ".mpg", ".mpeg", ".ts", ".m4v", ".wmv", ".m2ts", ".mts", ".3gp", ".ogv"}
+        image_exts = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"}
+        if ext in audio_exts:
+            msgtype = "m.audio"
+        elif ext in video_exts:
+            msgtype = "m.video"
+        elif ext in image_exts:
+            msgtype = "m.image"
+    return (mime_type or "application/octet-stream"), msgtype
+
 def _normalize_homeserver(value: str) -> str:
     text = str(value or "").strip()
     if not text:
@@ -189,9 +298,8 @@ class Matrix(Provider):
"Content-Type": "application/octet-stream", "Content-Type": "application/octet-stream",
} }
mime_type, _ = mimetypes.guess_type(path) mime_type, msgtype = _classify_matrix_upload(path, explicit_mime_type=kwargs.get("mime_type"))
if mime_type: headers["Content-Type"] = mime_type
headers["Content-Type"] = mime_type
filename = path.name filename = path.name
@@ -222,19 +330,6 @@ class Matrix(Provider):
         except Exception:
             download_url_for_store = ""

-        # Determine message type
-        msgtype = "m.file"
-        ext = path.suffix.lower()
-        audio_exts = {".mp3", ".flac", ".wav", ".m4a", ".aac", ".ogg", ".opus", ".wma", ".mka", ".alac"}
-        video_exts = {".mp4", ".mkv", ".webm", ".mov", ".avi", ".flv", ".mpg", ".mpeg", ".ts", ".m4v", ".wmv"}
-        image_exts = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"}
-        if ext in audio_exts:
-            msgtype = "m.audio"
-        elif ext in video_exts:
-            msgtype = "m.video"
-        elif ext in image_exts:
-            msgtype = "m.image"
         info = {"mimetype": mime_type, "size": path.stat().st_size}
         payload = {"msgtype": msgtype, "body": filename, "url": content_uri, "info": info}


@@ -955,6 +955,39 @@ class HydrusNetwork(Store):
debug(f"{self._log_prefix()} get_file: url={browser_url}") debug(f"{self._log_prefix()} get_file: url={browser_url}")
return browser_url return browser_url
def delete_file(self, file_identifier: str, **kwargs: Any) -> bool:
"""Delete a file from Hydrus, then clear the deletion record.
This is used by the delete-file cmdlet when the item belongs to a HydrusNetwork store.
"""
try:
client = self._client
if client is None:
debug(f"{self._log_prefix()} delete_file: client unavailable")
return False
file_hash = str(file_identifier or "").strip().lower()
if len(file_hash) != 64 or not all(ch in "0123456789abcdef" for ch in file_hash):
debug(f"{self._log_prefix()} delete_file: invalid file hash '{file_identifier}'")
return False
reason = kwargs.get("reason")
reason_text = str(reason).strip() if isinstance(reason, str) and reason.strip() else None
# 1) Delete file
client.delete_files([file_hash], reason=reason_text)
# 2) Clear deletion record (best-effort)
try:
client.clear_file_deletion_record([file_hash])
except Exception as exc:
debug(f"{self._log_prefix()} delete_file: clear_file_deletion_record failed: {exc}")
return True
except Exception as exc:
debug(f"{self._log_prefix()} delete_file failed: {exc}")
return False
def get_metadata(self, file_hash: str, **kwargs: Any) -> Optional[Dict[str, Any]]: def get_metadata(self, file_hash: str, **kwargs: Any) -> Optional[Dict[str, Any]]:
"""Get metadata for a file from Hydrus by hash. """Get metadata for a file from Hydrus by hash.


@@ -47,6 +47,13 @@ def _normalize_title_for_extract(text: str) -> str:
s = s.replace("\u2011", "-") # non-breaking hyphen s = s.replace("\u2011", "-") # non-breaking hyphen
s = s.replace("\u2012", "-") # figure dash s = s.replace("\u2012", "-") # figure dash
s = s.replace("\u2015", "-") # horizontal bar s = s.replace("\u2015", "-") # horizontal bar
# Collapse any whitespace runs (including newlines/tabs) to a single space.
# Some sources wrap the artist name or title across lines.
try:
s = re.sub(r"\s+", " ", s).strip()
except Exception:
s = " ".join(s.split())
return s return s
@@ -70,7 +77,7 @@ def _literal_to_title_pattern_regex(literal: str) -> str:
         if ch.isspace():
             while i < len(literal) and literal[i].isspace():
                 i += 1
-            out.append(r"\\s*")
+            out.append(r"\s*")
             continue
         out.append(re.escape(ch))
         i += 1
@@ -95,7 +102,7 @@ def _compile_extract_template(template: str) -> tuple[re.Pattern[str], List[str]
raise ValueError("extract template must contain at least one (field)") raise ValueError("extract template must contain at least one (field)")
field_names: List[str] = [] field_names: List[str] = []
parts: List[str] = [r"^\\s*"] parts: List[str] = [r"^\s*"]
last_end = 0 last_end = 0
for idx, m in enumerate(matches): for idx, m in enumerate(matches):
@@ -108,18 +115,24 @@ def _compile_extract_template(template: str) -> tuple[re.Pattern[str], List[str]
raise ValueError(f"invalid field name '{raw_name}' (use A-Z, 0-9, underscore)") raise ValueError(f"invalid field name '{raw_name}' (use A-Z, 0-9, underscore)")
field_names.append(raw_name) field_names.append(raw_name)
name_lower = raw_name.lower()
is_last = idx == (len(matches) - 1) is_last = idx == (len(matches) - 1)
if is_last: if is_last:
parts.append(fr"(?P<{raw_name}>.+)") parts.append(fr"(?P<{raw_name}>.+)")
else: else:
parts.append(fr"(?P<{raw_name}>.+?)") # Heuristic: common numeric fields should capture full digit runs.
# This avoids ambiguous splits like track='2', title='3 ...'.
if name_lower in {"disk", "disc", "cd", "track", "trk", "episode", "ep", "season", "year"}:
parts.append(fr"(?P<{raw_name}>\d+)")
else:
parts.append(fr"(?P<{raw_name}>.+?)")
last_end = m.end() last_end = m.end()
tail = tpl[last_end:] tail = tpl[last_end:]
if tail: if tail:
parts.append(_literal_to_title_pattern_regex(tail)) parts.append(_literal_to_title_pattern_regex(tail))
parts.append(r"\\s*$") parts.append(r"\s*$")
rx = "".join(parts) rx = "".join(parts)
return re.compile(rx, flags=re.IGNORECASE), field_names return re.compile(rx, flags=re.IGNORECASE), field_names
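A worked example of the new digit-run capture (template and input are illustrative):

    rx, fields = _compile_extract_template("(track) (title)")
    # old pattern: ^\s*(?P<track>.+?)\s*(?P<title>.+)\s*$
    #   "23 99 Problems" -> track='2', title='3 99 Problems'
    #   (the literal space compiles to \s*, which can match empty)
    # new pattern: ^\s*(?P<track>\d+)\s*(?P<title>.+)\s*$
    #   "23 99 Problems" -> track='23', title='99 Problems'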

cmdlet/archive_file.py (new file)

@@ -0,0 +1,426 @@
"""Create a single .tar.zst archive from piped file selections."""
from __future__ import annotations
import re
import sys
import tarfile
import tempfile
import time
import uuid
from pathlib import Path
from typing import Any, Dict, List, Sequence, Set
from urllib.parse import parse_qs, urlparse
from SYS.logger import log
import pipeline as ctx
from config import resolve_output_dir
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
coerce_to_pipe_object = sh.coerce_to_pipe_object
create_pipe_object_result = sh.create_pipe_object_result
parse_cmdlet_args = sh.parse_cmdlet_args
should_show_help = sh.should_show_help
_SHA256_RE = re.compile(r"^[0-9a-fA-F]{64}$")
def _extract_sha256_hex(item: Any) -> str:
try:
if isinstance(item, dict):
h = item.get("hash")
else:
h = getattr(item, "hash", None)
if isinstance(h, str) and _SHA256_RE.fullmatch(h.strip()):
return h.strip().lower()
except Exception:
pass
return ""
def _extract_store_name(item: Any) -> str:
try:
if isinstance(item, dict):
s = item.get("store")
else:
s = getattr(item, "store", None)
return str(s or "").strip()
except Exception:
return ""
def _extract_url(item: Any) -> str:
try:
u = sh.get_field(item, "url") or sh.get_field(item, "target")
if isinstance(u, str) and u.strip().lower().startswith(("http://", "https://")):
return u.strip()
except Exception:
pass
return ""
def _extract_hash_from_hydrus_file_url(url: str) -> str:
try:
parsed = urlparse(str(url))
if not (parsed.path or "").endswith("/get_files/file"):
return ""
qs = parse_qs(parsed.query or "")
h = (qs.get("hash") or [""])[0]
if isinstance(h, str) and _SHA256_RE.fullmatch(h.strip()):
return h.strip().lower()
except Exception:
pass
return ""
def _hydrus_instance_names(config: Dict[str, Any]) -> Set[str]:
instances: Set[str] = set()
try:
store_cfg = config.get("store") if isinstance(config, dict) else None
if isinstance(store_cfg, dict):
hydrus_cfg = store_cfg.get("hydrusnetwork")
if isinstance(hydrus_cfg, dict):
instances = {str(k).strip().lower() for k in hydrus_cfg.keys() if str(k).strip()}
except Exception:
instances = set()
return instances
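The instance lookup assumes the config nests Hydrus instances under store.hydrusnetwork (the instance names here are illustrative):

    config = {"store": {"hydrusnetwork": {"home": {...}, "backup": {...}}}}
    _hydrus_instance_names(config)  # -> {"home", "backup"}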
def _maybe_download_hydrus_item(item: Any, config: Dict[str, Any], output_dir: Path) -> Path | None:
"""Download a Hydrus-backed item to a local temp path (best-effort).
This is intentionally side-effect free except for writing the local temp file.
"""
try:
from config import get_hydrus_access_key, get_hydrus_url
from API.HydrusNetwork import HydrusNetwork as HydrusClient, download_hydrus_file
except Exception:
return None
store_name = _extract_store_name(item)
store_lower = store_name.lower()
hydrus_instances = _hydrus_instance_names(config)
store_hint = store_lower in {"hydrus", "hydrusnetwork"} or (store_lower in hydrus_instances)
url = _extract_url(item)
file_hash = _extract_sha256_hex(item) or (_extract_hash_from_hydrus_file_url(url) if url else "")
if not file_hash:
return None
# Only treat it as Hydrus when we have an explicit Hydrus file URL OR the store suggests it.
is_hydrus_url = False
if url:
try:
parsed = urlparse(url)
is_hydrus_url = (parsed.path or "").endswith("/get_files/file") and _extract_hash_from_hydrus_file_url(url) == file_hash
except Exception:
is_hydrus_url = False
if not (is_hydrus_url or store_hint):
return None
# Prefer store name as instance key; fall back to "home".
access_key = None
hydrus_url = None
for inst in [s for s in [store_lower, "home"] if s]:
try:
access_key = (get_hydrus_access_key(config, inst) or "").strip() or None
hydrus_url = (get_hydrus_url(config, inst) or "").strip() or None
if access_key and hydrus_url:
break
except Exception:
access_key = None
hydrus_url = None
if not access_key or not hydrus_url:
return None
client = HydrusClient(url=hydrus_url, access_key=access_key, timeout=60.0)
file_url = url if (url and is_hydrus_url) else client.file_url(file_hash)
# Best-effort extension from Hydrus metadata.
suffix = ".hydrus"
try:
meta_response = client.fetch_file_metadata(hashes=[file_hash], include_mime=True)
entries = meta_response.get("metadata") if isinstance(meta_response, dict) else None
if isinstance(entries, list) and entries:
entry = entries[0]
if isinstance(entry, dict):
ext = entry.get("ext")
if isinstance(ext, str) and ext.strip():
cleaned = ext.strip()
if not cleaned.startswith("."):
cleaned = "." + cleaned.lstrip(".")
if len(cleaned) <= 12:
suffix = cleaned
except Exception:
pass
try:
output_dir.mkdir(parents=True, exist_ok=True)
except Exception:
pass
dest = output_dir / f"{file_hash}{suffix}"
if dest.exists():
dest = output_dir / f"{file_hash}_{uuid.uuid4().hex[:10]}{suffix}"
headers = {"Hydrus-Client-API-Access-Key": access_key}
download_hydrus_file(file_url, headers, dest, timeout=60.0)
try:
if dest.exists() and dest.is_file():
return dest
except Exception:
return None
return None
def _resolve_existing_or_fetch_path(item: Any, config: Dict[str, Any]) -> tuple[Path | None, Path | None]:
"""Return (path, temp_path) where temp_path is non-None only for files we downloaded."""
# 1) Direct local path
try:
po = coerce_to_pipe_object(item, None)
raw_path = getattr(po, "path", None) or getattr(po, "target", None) or sh.get_pipe_object_path(item)
if raw_path:
p = Path(str(raw_path)).expanduser()
if p.exists():
return p, None
except Exception:
pass
# 2) Store-backed path
file_hash = _extract_sha256_hex(item)
store_name = _extract_store_name(item)
if file_hash and store_name:
try:
from Store import Store
store = Store(config)
backend = store[store_name]
src = backend.get_file(file_hash)
if isinstance(src, Path):
if src.exists():
return src, None
elif isinstance(src, str) and src.strip():
cand = Path(src).expanduser()
if cand.exists():
return cand, None
# If the backend returns a URL (HydrusNetwork), download it.
if src.strip().lower().startswith(("http://", "https://")):
tmp_base = None
try:
tmp_base = config.get("temp") if isinstance(config, dict) else None
except Exception:
tmp_base = None
out_dir = Path(str(tmp_base)).expanduser() if tmp_base else (Path(tempfile.gettempdir()) / "Medios-Macina")
out_dir = out_dir / "archive" / "hydrus"
downloaded = _maybe_download_hydrus_item({"hash": file_hash, "store": store_name, "url": src.strip()}, config, out_dir)
if downloaded is not None:
return downloaded, downloaded
except Exception:
pass
# 3) Hydrus-backed items without backend.get_file path.
try:
tmp_base = config.get("temp") if isinstance(config, dict) else None
except Exception:
tmp_base = None
out_dir = Path(str(tmp_base)).expanduser() if tmp_base else (Path(tempfile.gettempdir()) / "Medios-Macina")
out_dir = out_dir / "archive" / "hydrus"
downloaded = _maybe_download_hydrus_item(item, config, out_dir)
if downloaded is not None:
return downloaded, downloaded
return None, None
def _unique_arcname(name: str, seen: Set[str]) -> str:
base = str(name or "").replace("\\", "/")
base = base.lstrip("/")
if not base:
base = "file"
if base not in seen:
seen.add(base)
return base
stem = base
suffix = ""
if "/" not in base:
p = Path(base)
stem = p.stem
suffix = p.suffix
n = 2
while True:
candidate = f"{stem} ({n}){suffix}" if stem else f"file ({n}){suffix}"
if candidate not in seen:
seen.add(candidate)
return candidate
n += 1
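The arcname helper deduplicates basenames inside the archive; an illustrative run:

    seen: Set[str] = set()
    _unique_arcname("song.mp3", seen)  # -> "song.mp3"
    _unique_arcname("song.mp3", seen)  # -> "song (2).mp3"
    _unique_arcname("song.mp3", seen)  # -> "song (3).mp3"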
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if should_show_help(args):
log(f"Cmdlet: {CMDLET.name}\nSummary: {CMDLET.summary}\nUsage: {CMDLET.usage}")
return 0
parsed = parse_cmdlet_args(args, CMDLET)
level_raw = parsed.get("level")
try:
level = int(level_raw) if level_raw is not None else 11
except Exception:
level = 11
if level < 1:
level = 1
if level > 22:
level = 22
# Output destination is controlled by the shared -path behavior in the pipeline runner.
# This cmdlet always creates the archive in the configured output directory and emits it.
# Collect piped items; archive-file is a batch command (single output).
items: List[Any] = []
if isinstance(result, list):
items = list(result)
elif result is not None:
items = [result]
if not items:
log("No piped items provided to archive-file", file=sys.stderr)
return 1
temp_downloads: List[Path] = []
try:
paths: List[Path] = []
for it in items:
p, tmp = _resolve_existing_or_fetch_path(it, config)
if p is None:
continue
paths.append(p)
if tmp is not None:
temp_downloads.append(tmp)
# Keep stable order, remove duplicates.
uniq: List[Path] = []
seen_paths: Set[str] = set()
for p in paths:
key = str(p.resolve()) if p.exists() else str(p)
if key in seen_paths:
continue
seen_paths.add(key)
uniq.append(p)
paths = uniq
if not paths:
log("No existing file paths found in piped items", file=sys.stderr)
return 1
out_dir = resolve_output_dir(config)
try:
out_dir.mkdir(parents=True, exist_ok=True)
except Exception:
pass
stamp = time.strftime("%Y%m%d_%H%M%S")
out_path = out_dir / f"archive_{stamp}.tar.zst"
try:
out_path = sh._unique_destination_path(out_path) # type: ignore[attr-defined]
except Exception:
pass
try:
out_path.parent.mkdir(parents=True, exist_ok=True)
except Exception as exc:
log(f"Failed to create output directory: {out_path.parent} ({exc})", file=sys.stderr)
return 1
# Import zstandard lazily so the rest of the CLI still runs without it.
try:
import zstandard as zstd # type: ignore
except Exception:
log("Missing dependency: zstandard (pip install zstandard)", file=sys.stderr)
return 1
# Write tar stream into zstd stream.
try:
with open(out_path, "wb") as out_handle:
cctx = zstd.ZstdCompressor(level=level)
with cctx.stream_writer(out_handle) as compressor:
with tarfile.open(fileobj=compressor, mode="w|", format=tarfile.PAX_FORMAT) as tf:
seen_names: Set[str] = set()
for p in paths:
arcname = _unique_arcname(p.name, seen_names)
# For directories, tarfile will include contents when recursive=True.
try:
tf.add(str(p), arcname=arcname, recursive=True)
except Exception as exc:
log(f"Failed to add to archive: {p} ({exc})", file=sys.stderr)
except Exception as exc:
log(f"Archive creation failed: {exc}", file=sys.stderr)
return 1
# Emit a single artifact downstream.
hash_value = None
try:
from SYS.utils import sha256_file
hash_value = sha256_file(out_path)
except Exception:
hash_value = None
pipe_obj = create_pipe_object_result(
source="archive",
identifier=out_path.stem,
file_path=str(out_path),
cmdlet_name="archive-file",
title=out_path.name,
hash_value=hash_value,
is_temp=True,
store="PATH",
extra={
"target": str(out_path),
"archive_format": "tar.zst",
"compression": "zstd",
"level": level,
"source_count": len(paths),
"source_paths": [str(p) for p in paths],
},
)
ctx.emit(pipe_obj)
return 0
finally:
# Best-effort cleanup of any temp Hydrus downloads we created.
for tmp in temp_downloads:
try:
tmp.unlink(missing_ok=True) # type: ignore[arg-type]
except TypeError:
try:
if tmp.exists():
tmp.unlink()
except Exception:
pass
except Exception:
pass
CMDLET = Cmdlet(
name="archive-file",
summary="Archive piped files into a single .tar.zst.",
usage="@N | archive-file [-level <1-22>] [-path <path>]",
arg=[
CmdletArg("-level", type="integer", description="Zstandard compression level (default: 11)."),
SharedArgs.PATH,
],
detail=[
"- Example: @1-5 | archive-file",
"- Default zstd level is 11.",
"- Emits one output item (the archive) for downstream piping.",
],
)
CMDLET.exec = _run
CMDLET.register()
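Since the output is a standard zstd-compressed tar stream, it can be listed back with the same two libraries (a minimal sketch; the filename is illustrative):

    import tarfile
    import zstandard as zstd

    with open("archive_20251227_145059.tar.zst", "rb") as fh:
        with zstd.ZstdDecompressor().stream_reader(fh) as reader:
            with tarfile.open(fileobj=reader, mode="r|") as tf:
                for member in tf:
                    print(member.name, member.size)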


@@ -297,51 +297,81 @@ class Delete_File(sh.Cmdlet):
         should_try_hydrus = False
         if should_try_hydrus and hash_hex:
-            client = None
-            if store:
-                # Store specified: do not fall back to a global/default Hydrus client.
-                try:
-                    registry = Store(config)
-                    backend = registry[str(store)]
-                    candidate = getattr(backend, "_client", None)
-                    if candidate is not None and hasattr(candidate, "_post"):
-                        client = candidate
-                except Exception as exc:
-                    if not local_deleted:
-                        log(f"Hydrus client unavailable for store '{store}': {exc}", file=sys.stderr)
-                        return False
-                if client is None:
-                    if not local_deleted:
-                        log(f"Hydrus client unavailable for store '{store}'", file=sys.stderr)
-                        return False
-            else:
-                # No store context; use default Hydrus client.
-                try:
-                    client = hydrus_wrapper.get_client(config)
-                except Exception as exc:
-                    if not local_deleted:
-                        log(f"Hydrus client unavailable: {exc}", file=sys.stderr)
-                        return False
-                if client is None:
-                    if not local_deleted:
-                        log("Hydrus client unavailable", file=sys.stderr)
-                        return False
-            payload: Dict[str, Any] = {"hashes": [hash_hex]}
-            if reason:
-                payload["reason"] = reason
-            try:
-                client._post("/add_files/delete_files", data=payload)  # type: ignore[attr-defined]
-                hydrus_deleted = True
-                title_str = str(title_val).strip() if title_val else ""
-                if title_str:
-                    debug(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex}", file=sys.stderr)
-                else:
-                    debug(f"{hydrus_prefix} Deleted hash:{hash_hex}", file=sys.stderr)
-            except Exception:
-                # If it's not in Hydrus (e.g. 404 or similar), that's fine
-                if not local_deleted:
-                    return []
+            # Prefer deleting via the resolved store backend when it is a HydrusNetwork store.
+            # This ensures store-specific post-delete hooks run (e.g., clearing Hydrus deletion records).
+            did_backend_delete = False
+            try:
+                if backend is not None:
+                    deleter = getattr(backend, "delete_file", None)
+                    if callable(deleter):
+                        did_backend_delete = bool(deleter(hash_hex, reason=reason))
+            except Exception:
+                did_backend_delete = False
+            if did_backend_delete:
+                hydrus_deleted = True
+                title_str = str(title_val).strip() if title_val else ""
+                if title_str:
+                    debug(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex}", file=sys.stderr)
+                else:
+                    debug(f"{hydrus_prefix} Deleted hash:{hash_hex}", file=sys.stderr)
+            else:
+                # Fallback to direct client calls.
+                client = None
+                if store:
+                    # Store specified: do not fall back to a global/default Hydrus client.
+                    try:
+                        registry = Store(config)
+                        backend = registry[str(store)]
+                        candidate = getattr(backend, "_client", None)
+                        if candidate is not None and hasattr(candidate, "_post"):
+                            client = candidate
+                    except Exception as exc:
+                        if not local_deleted:
+                            log(f"Hydrus client unavailable for store '{store}': {exc}", file=sys.stderr)
+                            return False
+                    if client is None:
+                        if not local_deleted:
+                            log(f"Hydrus client unavailable for store '{store}'", file=sys.stderr)
+                            return False
+                else:
+                    # No store context; use default Hydrus client.
+                    try:
+                        client = hydrus_wrapper.get_client(config)
+                    except Exception as exc:
+                        if not local_deleted:
+                            log(f"Hydrus client unavailable: {exc}", file=sys.stderr)
+                            return False
+                    if client is None:
+                        if not local_deleted:
+                            log("Hydrus client unavailable", file=sys.stderr)
+                            return False
+                payload: Dict[str, Any] = {"hashes": [hash_hex]}
+                if reason:
+                    payload["reason"] = reason
+                try:
+                    client._post("/add_files/delete_files", data=payload)  # type: ignore[attr-defined]
+                    # Best-effort clear deletion record if supported by this client.
+                    try:
+                        clearer = getattr(client, "clear_file_deletion_record", None)
+                        if callable(clearer):
+                            clearer([hash_hex])
+                        else:
+                            client._post("/add_files/clear_file_deletion_record", data={"hashes": [hash_hex]})  # type: ignore[attr-defined]
+                    except Exception:
+                        pass
+                    hydrus_deleted = True
+                    title_str = str(title_val).strip() if title_val else ""
+                    if title_str:
+                        debug(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex}", file=sys.stderr)
+                    else:
+                        debug(f"{hydrus_prefix} Deleted hash:{hash_hex}", file=sys.stderr)
+                except Exception:
+                    # If it's not in Hydrus (e.g. 404 or similar), that's fine
+                    if not local_deleted:
+                        return []
         if hydrus_deleted and hash_hex:
             size_hint = None

cmdlet/download_data.py (new file)

@@ -0,0 +1,267 @@
"""Smart downloader front-door.
Currently focused on Internet Archive item pages:
- Takes a piped InternetArchive search-provider row (table=internetarchive) or an archive.org details URL
- Displays a selectable table of available files/formats (PDF/ZIP/OCR/etc)
- Selecting a row via @N expands to download-file <direct-url>
This enables:
search-provider -provider internetarchive "..."
@3 # shows formats table
@2 | add-file ... # downloads selected file then pipes to add-file
"""
from __future__ import annotations
import re
import sys
from typing import Any, Dict, List, Sequence, cast
from urllib.parse import quote
from SYS.logger import log, debug
import pipeline as pipeline_context
from result_table import ResultTable
from . import _shared as sh
Cmdlet = sh.Cmdlet
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
get_field = sh.get_field
def _extract_ia_identifier(text: str) -> str:
s = str(text or "").strip()
if not s:
return ""
# https://archive.org/details/<identifier>
m = re.search(r"archive\.org/(?:details|download)/([^/?#\s]+)", s, flags=re.IGNORECASE)
if m:
return str(m.group(1) or "").strip()
# internetarchive:<identifier>
if s.lower().startswith("internetarchive:"):
return s.split(":", 1)[-1].strip()
return ""
class Download_Data(Cmdlet):
def __init__(self) -> None:
super().__init__(
name="download-data",
summary="List downloadable files/formats for provider items (e.g., Internet Archive)",
usage="download-data <url> OR @N | download-data (provider item), then select a file with @N",
alias=[],
arg=[SharedArgs.URL],
detail=[
"For Internet Archive item pages, shows a selectable list of available files (PDF/ZIP/OCR/etc).",
"Select a file row with @N to run download-file on that direct URL.",
],
exec=self.run,
)
self.register()
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
try:
# parse_cmdlet_args typing varies across cmdlets; keep runtime behavior.
parsed = parse_cmdlet_args(args, cast(Any, self))
except Exception:
parsed = {}
raw_urls = parsed.get("url", [])
if isinstance(raw_urls, str):
raw_urls = [raw_urls]
url_arg = str(raw_urls[0]).strip() if raw_urls else ""
piped_items: List[Any] = []
if isinstance(result, list):
piped_items = list(result)
elif result is not None:
piped_items = [result]
# Prefer piped item target if present.
target = ""
if piped_items:
target = str(get_field(piped_items[0], "path") or get_field(piped_items[0], "url") or "").strip()
if not target:
target = url_arg
table_name = ""
try:
table_name = str(get_field(piped_items[0], "table") or "").strip().lower() if piped_items else ""
except Exception:
table_name = ""
identifier = ""
if piped_items:
md = get_field(piped_items[0], "full_metadata")
if isinstance(md, dict):
identifier = str(md.get("identifier") or "").strip()
if not identifier:
identifier = _extract_ia_identifier(target)
if table_name == "internetarchive" or ("archive.org" in target.lower() and identifier):
return self._run_internetarchive(piped_items[0] if piped_items else None, identifier=identifier)
log("download-data: unsupported target (currently only Internet Archive item pages are supported)", file=sys.stderr)
return 1
@staticmethod
def _run_internetarchive(item: Any, *, identifier: str) -> int:
try:
from Provider.internetarchive import _ia as _ia_loader
except Exception as exc:
log(f"download-data: Internet Archive provider unavailable: {exc}", file=sys.stderr)
return 1
def _is_ia_metadata_file(f: Dict[str, Any]) -> bool:
try:
source = str(f.get("source") or "").strip().lower()
fmt = str(f.get("format") or "").strip().lower()
except Exception:
source = ""
fmt = ""
if source == "metadata":
return True
if fmt in {"metadata", "archive bittorrent"}:
return True
if fmt.startswith("thumbnail"):
return True
return False
ia = None
try:
ia = _ia_loader()
except Exception as exc:
log(f"download-data: Internet Archive module unavailable: {exc}", file=sys.stderr)
return 1
try:
get_item = getattr(ia, "get_item", None)
if not callable(get_item):
raise Exception("internetarchive.get_item is not available")
ia_item = cast(Any, get_item(str(identifier)))
except Exception as exc:
log(f"download-data: Internet Archive item lookup failed: {exc}", file=sys.stderr)
return 1
files: List[Dict[str, Any]] = []
try:
raw_files = getattr(ia_item, "files", None)
if isinstance(raw_files, list):
for f in raw_files:
if isinstance(f, dict):
files.append(f)
except Exception:
files = []
if not files:
try:
for f in ia_item.get_files():
name = getattr(f, "name", None)
if not name and isinstance(f, dict):
name = f.get("name")
if not name:
continue
files.append(
{
"name": str(name),
"size": getattr(f, "size", None),
"format": getattr(f, "format", None),
"source": getattr(f, "source", None),
}
)
except Exception:
files = []
if not files:
log("download-data: Internet Archive item has no files", file=sys.stderr)
return 1
# Prefer non-metadata files for the picker.
candidates = [f for f in files if not _is_ia_metadata_file(f)]
if not candidates:
candidates = list(files)
def _key(f: Dict[str, Any]) -> tuple[str, str]:
fmt = str(f.get("format") or "").strip().lower()
name = str(f.get("name") or "").strip().lower()
return (fmt, name)
candidates.sort(key=_key)
title = ""
try:
title = str(get_field(item, "title") or "").strip()
except Exception:
title = ""
table_title = f"Internet Archive: {title}".strip().rstrip(":")
if not title:
table_title = f"Internet Archive: {identifier}".strip().rstrip(":")
table = ResultTable(table_title).set_preserve_order(True)
table.set_table("internetarchive.formats")
# Selecting a row should expand to `download-file <direct-url>`.
table.set_source_command("download-file", [])
rows: List[Dict[str, Any]] = []
for f in candidates:
name = str(f.get("name") or "").strip()
if not name:
continue
fmt = str(f.get("format") or "").strip()
src = str(f.get("source") or "").strip()
size_val: Any = f.get("size")
try:
size_val = int(size_val) if size_val not in (None, "") else ""
except Exception:
# Keep as-is; ResultTable will stringify.
pass
direct_url = f"https://archive.org/download/{identifier}/{quote(name, safe='')}"
row_item: Dict[str, Any] = {
"table": "internetarchive",
"title": fmt or name,
"path": direct_url,
"url": direct_url,
"columns": [
("Format", fmt),
("Name", name),
("Size", size_val),
("Source", src),
],
# Used by @N expansion: download-file <direct-url>
"_selection_args": [direct_url],
"full_metadata": {
"identifier": identifier,
"name": name,
"format": fmt,
"source": src,
"size": f.get("size"),
},
}
rows.append(row_item)
table.add_result(row_item)
if not rows:
log("download-data: no downloadable files found for this item", file=sys.stderr)
return 1
try:
pipeline_context.set_last_result_table(table, rows, subject=item)
pipeline_context.set_current_stage_table(table)
except Exception as exc:
debug(f"[download-data] Failed to register result table: {exc}")
return 0
CMDLET = Download_Data()


@@ -16,6 +16,7 @@ from . import _shared as sh
 Cmdlet = sh.Cmdlet
 CmdletArg = sh.CmdletArg
+SharedArgs = sh.SharedArgs
 create_pipe_object_result = sh.create_pipe_object_result
 get_field = sh.get_field
 get_pipe_object_hash = sh.get_pipe_object_hash
@@ -37,7 +38,6 @@ except ImportError:
 try:
     from metadata import (
         read_tags_from_file,
-        dedup_tags_by_namespace,
         merge_multiple_tag_lists,
     )
     HAS_METADATA_API = True
@@ -87,7 +87,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
     delete_after = parsed.get("delete", False)

     output_override: Optional[Path] = None
-    output_arg = parsed.get("output")
+    output_arg = parsed.get("path")
     if output_arg:
         try:
             output_override = Path(str(output_arg)).expanduser()
@@ -928,10 +928,10 @@ def _merge_pdf(files: List[Path], output: Path) -> bool:
 CMDLET = Cmdlet(
     name="merge-file",
     summary="Merge multiple files into a single output file. Supports audio, video, PDF, and text merging with optional cleanup.",
-    usage="merge-file [-delete] [-output <path>] [-format <auto|mka|m4a|m4b|mp3|aac|opus|mp4|mkv|pdf|txt>]",
+    usage="merge-file [-delete] [-path <path>] [-format <auto|mka|m4a|m4b|mp3|aac|opus|mp4|mkv|pdf|txt>]",
     arg=[
         CmdletArg("-delete", type="flag", description="Delete source files after successful merge."),
-        CmdletArg("-output", description="Override output file path."),
+        SharedArgs.PATH,
         CmdletArg("-format", description="Output format (auto/mka/m4a/m4b/mp3/aac/opus/mp4/mkv/pdf/txt). Default: auto-detect from first file."),
     ],
     detail=[


@@ -44,6 +44,7 @@ dependencies = [
"pypdf>=3.0.0", "pypdf>=3.0.0",
"mutagen>=1.46.0", "mutagen>=1.46.0",
"cbor2>=4.0", "cbor2>=4.0",
"zstandard>=0.23.0",
# Image and media support # Image and media support
"Pillow>=10.0.0", "Pillow>=10.0.0",


@@ -15,6 +15,7 @@ internetarchive>=4.1.0
 pypdf>=3.0.0
 mutagen>=1.46.0
 cbor2>=4.0
+zstandard>=0.23.0

 # Image and media support
 Pillow>=10.0.0