2026-01-05 07:51:19 -08:00
parent 8545367e28
commit 1f765cffda
32 changed files with 3447 additions and 3250 deletions

View File

@@ -10,10 +10,24 @@ Provides synchronous and asynchronous HTTP operations with:
import httpx
import asyncio
import sys
import time
import traceback
import re
from typing import Optional, Dict, Any, Callable, BinaryIO, List, Iterable, Set
from pathlib import Path
from urllib.parse import unquote, urlparse, parse_qs
import logging
from SYS.logger import debug, log
from SYS.models import DebugLogger, DownloadError, DownloadMediaResult, ProgressBar
from SYS.utils import ensure_directory, sha256_file
try: # Optional; used for metadata extraction when available
from SYS.metadata import extract_ytdlp_tags
except Exception: # pragma: no cover - optional dependency
extract_ytdlp_tags = None # type: ignore[assignment]
logger = logging.getLogger(__name__)
# Default configuration
@@ -366,6 +380,359 @@ class HTTPClient:
return self._client.stream(method, url, **kwargs)
def download_direct_file(
url: str,
output_dir: Path,
debug_logger: Optional[DebugLogger] = None,
quiet: bool = False,
suggested_filename: Optional[str] = None,
pipeline_progress: Optional[Any] = None,
) -> DownloadMediaResult:
"""Download a direct file (PDF, image, document, etc.) with guardrails and metadata hooks."""
ensure_directory(output_dir)
def _sanitize_filename(name: str) -> str:
# Windows-safe filename sanitization.
text = str(name or "").strip()
if not text:
return ""
text = text.replace("/", "\\")
text = text.split("\\")[-1]
invalid = set('<>:"/\\|?*')
cleaned_chars: List[str] = []
for ch in text:
o = ord(ch)
if o < 32 or ch in invalid:
cleaned_chars.append(" ")
continue
cleaned_chars.append(ch)
cleaned = " ".join("".join(cleaned_chars).split()).strip()
cleaned = cleaned.rstrip(" .")
return cleaned
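# Behavior sketch (illustrative, not in the original): path separators are
# stripped down to the final component and invalid characters collapse to
# single spaces, e.g. _sanitize_filename("dir/na*me.txt") -> "na me.txt".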
def _unique_path(path: Path) -> Path:
if not path.exists():
return path
stem = path.stem
suffix = path.suffix
parent = path.parent
for i in range(1, 10_000):
candidate = parent / f"{stem} ({i}){suffix}"
if not candidate.exists():
return candidate
return parent / f"{stem} ({int(time.time())}){suffix}"
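# Naming sketch (illustrative): if "report.pdf" exists, later downloads become
# "report (1).pdf", "report (2).pdf", ...; after 9,999 collisions the current
# Unix timestamp is appended instead.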
parsed_url = urlparse(url)
url_path = parsed_url.path
filename: Optional[str] = None
if parsed_url.query:
query_params = parse_qs(parsed_url.query)
for param_name in ("filename", "download", "file", "name"):
if param_name in query_params and query_params[param_name]:
filename = query_params[param_name][0]
filename = unquote(filename)
break
if not filename or not filename.strip():
filename = url_path.split("/")[-1] if url_path else ""
filename = unquote(filename)
if "?" in filename:
filename = filename.split("?")[0]
content_type = ""
try:
with HTTPClient(timeout=10.0) as client:
response = client._request("HEAD", url, follow_redirects=True)
content_disposition = response.headers.get("content-disposition", "")
try:
content_type = str(response.headers.get("content-type", "") or "").strip().lower()
except Exception:
content_type = ""
if content_disposition:
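# Note: for RFC 5987 "filename*=" values the charset prefix (e.g. UTF-8'')
# survives this regex; only percent-encoding is undone by unquote() below.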
match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition)
if match:
extracted_name = match.group(1) or match.group(2)
if extracted_name:
filename = unquote(extracted_name)
if not quiet:
debug(f"Filename from Content-Disposition: {filename}")
except Exception as exc:
if not quiet:
log(f"Could not get filename from headers: {exc}", file=sys.stderr)
try:
page_like_exts = {".php", ".asp", ".aspx", ".jsp", ".cgi"}
ext = ""
try:
ext = Path(str(filename or "")).suffix.lower()
except Exception:
ext = ""
ct0 = (content_type or "").split(";", 1)[0].strip().lower()
must_probe = bool(ct0.startswith("text/html") or ext in page_like_exts)
if must_probe:
with HTTPClient(timeout=10.0) as client:
with client._request_stream("GET", url, follow_redirects=True) as resp:
resp.raise_for_status()
ct = (
str(resp.headers.get("content-type", "") or "")
.split(";", 1)[0]
.strip()
.lower()
)
if ct.startswith("text/html"):
raise DownloadError("URL appears to be an HTML page, not a direct file")
except DownloadError:
raise
except Exception:
pass
suggested = _sanitize_filename(suggested_filename) if suggested_filename else ""
if suggested:
suggested_path = Path(suggested)
if suggested_path.suffix:
filename = suggested
else:
detected_ext = ""
try:
detected_ext = Path(str(filename)).suffix
except Exception:
detected_ext = ""
filename = suggested + detected_ext if detected_ext else suggested
try:
has_ext = bool(filename and Path(str(filename)).suffix)
except Exception:
has_ext = False
if filename and (not has_ext):
ct = (content_type or "").split(";", 1)[0].strip().lower()
ext_by_ct = {
"application/pdf": ".pdf",
"application/epub+zip": ".epub",
"application/x-mobipocket-ebook": ".mobi",
"image/jpeg": ".jpg",
"image/png": ".png",
"image/webp": ".webp",
"image/gif": ".gif",
"text/plain": ".txt",
"application/zip": ".zip",
}
if ct in ext_by_ct:
filename = f"{filename}{ext_by_ct[ct]}"
elif ct.startswith("text/html"):
raise DownloadError("URL appears to be an HTML page, not a direct file")
if not filename or not str(filename).strip():
raise DownloadError(
"Could not determine filename for URL (no Content-Disposition and no path filename)"
)
file_path = _unique_path(output_dir / str(filename))
use_pipeline_transfer = False
try:
if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"):
ui = None
if hasattr(pipeline_progress, "ui_and_pipe_index"):
ui, _ = pipeline_progress.ui_and_pipe_index() # type: ignore[attr-defined]
use_pipeline_transfer = ui is not None
except Exception:
use_pipeline_transfer = False
progress_bar: Optional[ProgressBar] = None
if (not quiet) and (not use_pipeline_transfer):
progress_bar = ProgressBar()
transfer_started = [False]
if not quiet:
debug(f"Direct download: {filename}")
try:
start_time = time.time()
downloaded_bytes = [0]
transfer_started[0] = False
def _maybe_begin_transfer(content_length: int) -> None:
if pipeline_progress is None or transfer_started[0]:
return
try:
total_val: Optional[int] = (
int(content_length)
if isinstance(content_length, int) and content_length > 0
else None
)
except Exception:
total_val = None
try:
if hasattr(pipeline_progress, "begin_transfer"):
pipeline_progress.begin_transfer(
label=str(filename or "download"),
total=total_val,
)
transfer_started[0] = True
except Exception:
return
def progress_callback(bytes_downloaded: int, content_length: int) -> None:
downloaded_bytes[0] = int(bytes_downloaded or 0)
try:
if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"):
_maybe_begin_transfer(content_length)
total_val: Optional[int] = (
int(content_length)
if isinstance(content_length, int) and content_length > 0
else None
)
pipeline_progress.update_transfer(
label=str(filename or "download"),
completed=int(bytes_downloaded or 0),
total=total_val,
)
except Exception:
pass
if progress_bar is not None:
progress_bar.update(
downloaded=int(bytes_downloaded or 0),
total=int(content_length) if content_length and content_length > 0 else None,
label=str(filename or "download"),
file=sys.stderr,
)
with HTTPClient(timeout=30.0) as client:
client.download(url, str(file_path), progress_callback=progress_callback)
elapsed = time.time() - start_time
try:
if progress_bar is not None:
progress_bar.finish()
except Exception:
pass
try:
if pipeline_progress is not None and transfer_started[0] and hasattr(
pipeline_progress, "finish_transfer"
):
pipeline_progress.finish_transfer(label=str(filename or "download"))
except Exception:
pass
if not quiet:
debug(f"✓ Downloaded in {elapsed:.1f}s")
ext_out = ""
try:
ext_out = Path(str(filename)).suffix.lstrip(".")
except Exception:
ext_out = ""
info: Dict[str, Any] = {
"id": str(filename).rsplit(".", 1)[0] if "." in str(filename) else str(filename),
"ext": ext_out,
"webpage_url": url,
}
hash_value = None
try:
hash_value = sha256_file(file_path)
except Exception:
pass
tags: List[str] = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(info)
except Exception as exc:
log(f"Error extracting tags: {exc}", file=sys.stderr)
if not any(str(t).startswith("title:") for t in tags):
info["title"] = str(filename)
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(info)
except Exception as exc:
log(f"Error extracting tags with filename: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"direct-file-downloaded",
{"url": url, "path": str(file_path), "hash": hash_value},
)
return DownloadMediaResult(
path=file_path,
info=info,
tag=tags,
source_url=url,
hash_value=hash_value,
)
except httpx.HTTPError as exc:  # RequestError and HTTPStatusError both derive from HTTPError
try:
if progress_bar is not None:
progress_bar.finish()
except Exception:
pass
try:
if pipeline_progress is not None and transfer_started[0] and hasattr(
pipeline_progress, "finish_transfer"
):
pipeline_progress.finish_transfer(label=str(filename or "download"))
except Exception:
pass
log(f"Download error: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{"phase": "direct-file", "url": url, "error": str(exc)},
)
raise DownloadError(f"Failed to download {url}: {exc}") from exc
except Exception as exc:
try:
if progress_bar is not None:
progress_bar.finish()
except Exception:
pass
try:
if pipeline_progress is not None and transfer_started[0] and hasattr(
pipeline_progress, "finish_transfer"
):
pipeline_progress.finish_transfer(label=str(filename or "download"))
except Exception:
pass
log(f"Error downloading file: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "direct-file",
"url": url,
"error": str(exc),
"traceback": traceback.format_exc(),
},
)
raise DownloadError(f"Error downloading file: {exc}") from exc
# Back-compat alias
_download_direct_file = download_direct_file
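# Usage sketch (illustrative, not part of this commit): the URL and output
# directory below are assumptions for the example.
def _example_direct_download() -> None:
    result = download_direct_file(
        "https://example.com/files/report.pdf",  # assumed direct-file URL
        Path("downloads"),
        quiet=True,
    )
    # DownloadMediaResult carries the saved path, an info dict, derived
    # tags, and the file's SHA-256.
    print(result.path, result.hash_value)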
class AsyncHTTPClient:
"""Unified async HTTP client with asyncio support."""

View File

@@ -11,6 +11,7 @@ import shutil
import subprocess
import sys
import time
from collections import deque
from SYS.logger import log
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS as GLOBAL_SUPPORTED_EXTENSIONS
@@ -18,8 +19,8 @@ import tempfile
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Type, TypeVar, Union, cast
from urllib.parse import urlsplit, urlencode, quote, urlunsplit, unquote
import httpx
logger = logging.getLogger(__name__)
@@ -1828,3 +1829,742 @@ def download_hydrus_file(
print_final_progress(filename, file_size, elapsed)
return downloaded
# ============================================================================
# Hydrus metadata helpers (moved from SYS.metadata)
# ============================================================================
def _normalize_hash(value: Any) -> str:
candidate = str(value or "").strip().lower()
if not candidate:
raise ValueError("Hydrus hash is required")
if len(candidate) != 64 or any(ch not in "0123456789abcdef" for ch in candidate):
raise ValueError("Hydrus hash must be a 64-character hex string")
return candidate
def _normalize_tag(tag: Any) -> Optional[str]:
if tag is None:
return None
if isinstance(tag, str):
candidate = tag.strip()
else:
candidate = str(tag).strip()
return candidate or None
def _dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]:
if not tags:
return []
namespace_to_tags: Dict[Optional[str], List[Tuple[int, str]]] = {}
first_appearance: Dict[Optional[str], int] = {}
for idx, tag in enumerate(tags):
namespace: Optional[str] = tag.split(":", 1)[0] if ":" in tag else None
if namespace not in first_appearance:
first_appearance[namespace] = idx
if namespace not in namespace_to_tags:
namespace_to_tags[namespace] = []
namespace_to_tags[namespace].append((idx, tag))
result: List[Tuple[int, str]] = []
for namespace, tag_list in namespace_to_tags.items():
chosen_tag = tag_list[0][1] if keep_first else tag_list[-1][1]
result.append((first_appearance[namespace], chosen_tag))
result.sort(key=lambda x: x[0])
return [tag for _, tag in result]
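# Behavior sketch (illustrative): with keep_first=True the earliest tag per
# namespace wins and first-appearance order is preserved, e.g.
#   _dedup_tags_by_namespace(["title:a", "creator:x", "title:b"])
#   -> ["title:a", "creator:x"]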
def _extract_tag_services(entry: Dict[str, Any]) -> List[Dict[str, Any]]:
tags_section = entry.get("tags")
services: List[Dict[str, Any]] = []
if not isinstance(tags_section, dict):
return services
names_map = tags_section.get("service_keys_to_names")
if not isinstance(names_map, dict):
names_map = {}
def get_record(service_key: Optional[str], service_name: Optional[str]) -> Dict[str, Any]:
key_lower = service_key.lower() if isinstance(service_key, str) else None
name_lower = service_name.lower() if isinstance(service_name, str) else None
for record in services:
existing_key = record.get("service_key")
if key_lower and isinstance(existing_key, str) and existing_key.lower() == key_lower:
if service_name and not record.get("service_name"):
record["service_name"] = service_name
return record
existing_name = record.get("service_name")
if name_lower and isinstance(existing_name, str) and existing_name.lower() == name_lower:
if service_key and not record.get("service_key"):
record["service_key"] = service_key
return record
record = {
"service_key": service_key,
"service_name": service_name,
"tags": [],
}
services.append(record)
return record
def _iter_current_status_lists(container: Any) -> Iterable[List[Any]]:
if isinstance(container, dict):
for status_key, tags_list in container.items():
if str(status_key) != "0":
continue
if isinstance(tags_list, list):
yield tags_list
elif isinstance(container, list):
yield container
statuses_map = tags_section.get("service_keys_to_statuses_to_tags")
if isinstance(statuses_map, dict):
for service_key, status_map in statuses_map.items():
record = get_record(service_key if isinstance(service_key, str) else None, names_map.get(service_key))
for tags_list in _iter_current_status_lists(status_map):
for tag in tags_list:
normalized = _normalize_tag(tag)
if normalized:
record["tags"].append(normalized)
ignored_keys = {
"service_keys_to_statuses_to_tags",
"service_keys_to_statuses_to_display_tags",
"service_keys_to_display_friendly_tags",
"service_keys_to_names",
"tag_display_types_to_namespaces",
"namespace_display_string_lookup",
"tag_display_decoration_colour_lookup",
}
for key, service in tags_section.items():
if key in ignored_keys:
continue
if isinstance(service, dict):
service_key = service.get("service_key") or (key if isinstance(key, str) else None)
service_name = service.get("service_name") or service.get("name") or names_map.get(service_key)
record = get_record(service_key if isinstance(service_key, str) else None, service_name)
storage = service.get("storage_tags") or service.get("statuses_to_tags") or service.get("tags")
if isinstance(storage, dict):
for tags_list in _iter_current_status_lists(storage):
for tag in tags_list:
normalized = _normalize_tag(tag)
if normalized:
record["tags"].append(normalized)
elif isinstance(storage, list):
for tag in storage:
normalized = _normalize_tag(tag)
if normalized:
record["tags"].append(normalized)
for record in services:
record["tags"] = _dedup_tags_by_namespace(record["tags"], keep_first=True)
return services
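# Shape sketch (illustrative): a Hydrus metadata entry's tags section looks
# roughly like
#   {"service_keys_to_statuses_to_tags": {"<key>": {"0": ["title:x"], ...}},
#    "service_keys_to_names": {"<key>": "my tags"}}
# where status "0" means "current"; only current tags are collected above.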
def _select_primary_tags(
services: List[Dict[str, Any]],
aggregated: List[str],
prefer_service: Optional[str]
) -> Tuple[Optional[str], List[str]]:
prefer_lower = prefer_service.lower() if isinstance(prefer_service, str) else None
if prefer_lower:
for record in services:
name = record.get("service_name")
if isinstance(name, str) and name.lower() == prefer_lower and record["tags"]:
return record.get("service_key"), record["tags"]
for record in services:
if record["tags"]:
return record.get("service_key"), record["tags"]
return None, aggregated
def _derive_title(
tags_primary: List[str],
tags_aggregated: List[str],
entry: Dict[str, Any]
) -> Optional[str]:
for source in (tags_primary, tags_aggregated):
for tag in source:
namespace, sep, value = tag.partition(":")
if sep and namespace and namespace.lower() == "title":
cleaned = value.strip()
if cleaned:
return cleaned
for key in (
"title",
"display_name",
"pretty_name",
"original_display_filename",
"original_filename",
):
value = entry.get(key)
if isinstance(value, str):
cleaned = value.strip()
if cleaned:
return cleaned
return None
def _derive_clip_time(
tags_primary: List[str],
tags_aggregated: List[str],
entry: Dict[str, Any]
) -> Optional[str]:
namespaces = {"clip", "clip_time", "cliptime"}
for source in (tags_primary, tags_aggregated):
for tag in source:
namespace, sep, value = tag.partition(":")
if sep and namespace and namespace.lower() in namespaces:
cleaned = value.strip()
if cleaned:
return cleaned
clip_value = entry.get("clip_time")
if isinstance(clip_value, str):
cleaned_clip = clip_value.strip()
if cleaned_clip:
return cleaned_clip
return None
def _summarize_hydrus_entry(
entry: Dict[str, Any],
prefer_service: Optional[str]
) -> Tuple[Dict[str, Any], List[str], Optional[str], Optional[str], Optional[str]]:
services = _extract_tag_services(entry)
aggregated: List[str] = []
seen: Set[str] = set()
for record in services:
for tag in record["tags"]:
if tag not in seen:
seen.add(tag)
aggregated.append(tag)
service_key, primary_tags = _select_primary_tags(services, aggregated, prefer_service)
title = _derive_title(primary_tags, aggregated, entry)
clip_time = _derive_clip_time(primary_tags, aggregated, entry)
summary = dict(entry)
if title and not summary.get("title"):
summary["title"] = title
if clip_time and not summary.get("clip_time"):
summary["clip_time"] = clip_time
summary["tag_service_key"] = service_key
summary["has_current_file_service"] = _has_current_file_service(entry)
if "is_local" not in summary:
summary["is_local"] = bool(entry.get("is_local"))
return summary, primary_tags, service_key, title, clip_time
def _looks_like_hash(value: Any) -> bool:
if not isinstance(value, str):
return False
candidate = value.strip().lower()
return len(candidate) == 64 and all(ch in "0123456789abcdef" for ch in candidate)
def _collect_relationship_hashes(payload: Any, accumulator: Set[str]) -> None:
if isinstance(payload, dict):
for value in payload.values():
_collect_relationship_hashes(value, accumulator)
elif isinstance(payload, (list, tuple, set)):
for value in payload:
_collect_relationship_hashes(value, accumulator)
elif isinstance(payload, str) and _looks_like_hash(payload):
accumulator.add(payload)
def _generate_hydrus_url_variants(url: str) -> List[str]:
seen: Set[str] = set()
variants: List[str] = []
def push(candidate: Optional[str]) -> None:
if not candidate:
return
text = candidate.strip()
if not text or text in seen:
return
seen.add(text)
variants.append(text)
push(url)
try:
parsed = urlsplit(url)
except Exception:
return variants
if parsed.scheme in {"http", "https"}:
alternate_scheme = "https" if parsed.scheme == "http" else "http"
push(urlunsplit((alternate_scheme, parsed.netloc, parsed.path, parsed.query, parsed.fragment)))
normalised_netloc = parsed.netloc.lower()
if normalised_netloc and normalised_netloc != parsed.netloc:
push(urlunsplit((parsed.scheme, normalised_netloc, parsed.path, parsed.query, parsed.fragment)))
if parsed.path:
trimmed_path = parsed.path.rstrip("/")
if trimmed_path != parsed.path:
push(urlunsplit((parsed.scheme, parsed.netloc, trimmed_path, parsed.query, parsed.fragment)))
else:
push(urlunsplit((parsed.scheme, parsed.netloc, parsed.path + "/", parsed.query, parsed.fragment)))
unquoted_path = unquote(parsed.path)
if unquoted_path != parsed.path:
push(urlunsplit((parsed.scheme, parsed.netloc, unquoted_path, parsed.query, parsed.fragment)))
if parsed.query or parsed.fragment:
push(urlunsplit((parsed.scheme, parsed.netloc, parsed.path, "", "")))
if parsed.path:
unquoted_path = unquote(parsed.path)
push(urlunsplit((parsed.scheme, parsed.netloc, unquoted_path, "", "")))
return variants
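# Variant sketch (illustrative): for "http://Example.com/files/a%20b/" the
# generator yields, in order: the original URL, its https:// counterpart,
# the lowercased-host form, the trailing-slash toggle, and the
# percent-decoded path form; the `seen` set suppresses duplicates.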
def _build_hydrus_query(
hashes: Optional[Sequence[str]],
file_ids: Optional[Sequence[int]],
include_relationships: bool,
minimal: bool,
) -> Dict[str, str]:
query: Dict[str, str] = {}
if hashes:
query["hashes"] = json.dumps([_normalize_hash(h) for h in hashes])
if file_ids:
query["file_ids"] = json.dumps([int(fid) for fid in file_ids])
if not query:
raise ValueError("hashes or file_ids must be provided")
query["include_service_keys_to_tags"] = json.dumps(True)
query["include_tag_services"] = json.dumps(True)
query["include_file_services"] = json.dumps(True)
if include_relationships:
query["include_file_relationships"] = json.dumps(True)
if not minimal:
extras = (
"include_url",
"include_size",
"include_width",
"include_height",
"include_duration",
"include_mime",
"include_has_audio",
"include_is_trashed",
)
for key in extras:
query[key] = json.dumps(True)
return query
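# Query sketch (illustrative): for a single hash with minimal=True this
# returns JSON-encoded string values, as the Client API expects:
#   {"hashes": '["<64-hex>"]',
#    "include_service_keys_to_tags": "true",
#    "include_tag_services": "true",
#    "include_file_services": "true"}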
def _fetch_hydrus_entries(
client: "HydrusNetwork",
hashes: Optional[Sequence[str]],
file_ids: Optional[Sequence[int]],
include_relationships: bool,
minimal: bool,
) -> List[Dict[str, Any]]:
if not hashes and not file_ids:
return []
spec = HydrusRequestSpec(
method="GET",
endpoint="/get_files/file_metadata",
query=_build_hydrus_query(hashes, file_ids, include_relationships, minimal),
)
response = client._perform_request(spec)
metadata = response.get("metadata") if isinstance(response, dict) else None
if isinstance(metadata, list):
return [entry for entry in metadata if isinstance(entry, dict)]
return []
def _has_current_file_service(entry: Dict[str, Any]) -> bool:
services = entry.get("file_services")
if not isinstance(services, dict):
return False
current = services.get("current")
if isinstance(current, dict):
for value in current.values():
if value:
return True
return False
if isinstance(current, list):
return len(current) > 0
return False
def _compute_file_flags(entry: Dict[str, Any]) -> Tuple[bool, bool, bool]:
mime = entry.get("mime")
mime_lower = mime.lower() if isinstance(mime, str) else ""
is_video = mime_lower.startswith("video/")
is_audio = mime_lower.startswith("audio/")
is_deleted = bool(entry.get("is_trashed"))
file_services = entry.get("file_services")
if not is_deleted and isinstance(file_services, dict):
deleted = file_services.get("deleted")
if isinstance(deleted, dict) and deleted:
is_deleted = True
return is_video, is_audio, is_deleted
def fetch_hydrus_metadata(payload: Dict[str, Any]) -> Dict[str, Any]:
hash_hex = None
raw_hash_value = payload.get("hash")
if raw_hash_value is not None:
hash_hex = _normalize_hash(raw_hash_value)
file_ids: List[int] = []
raw_file_ids = payload.get("file_ids")
if isinstance(raw_file_ids, (list, tuple, set)):
for value in raw_file_ids:
try:
file_ids.append(int(value))
except (TypeError, ValueError):
continue
elif raw_file_ids is not None:
try:
file_ids.append(int(raw_file_ids))
except (TypeError, ValueError):
file_ids = []
raw_file_id = payload.get("file_id")
if raw_file_id is not None:
try:
coerced = int(raw_file_id)
except (TypeError, ValueError):
coerced = None
if coerced is not None and coerced not in file_ids:
file_ids.append(coerced)
base_url = str(payload.get("api_url") or "").strip()
if not base_url:
raise ValueError("Hydrus api_url is required")
access_key = str(payload.get("access_key") or "").strip()
options_raw = payload.get("options")
options = options_raw if isinstance(options_raw, dict) else {}
prefer_service = options.get("prefer_service_name")
if isinstance(prefer_service, str):
prefer_service = prefer_service.strip()
else:
prefer_service = None
include_relationships = bool(options.get("include_relationships"))
minimal = bool(options.get("minimal"))
timeout = float(options.get("timeout") or 60.0)
client = HydrusNetwork(base_url, access_key, timeout)
hashes: Optional[List[str]] = None
if hash_hex:
hashes = [hash_hex]
if not hashes and not file_ids:
raise ValueError("Hydrus hash or file id is required")
try:
entries = _fetch_hydrus_entries(
client,
hashes,
file_ids or None,
include_relationships,
minimal
)
except HydrusRequestError as exc:
raise RuntimeError(str(exc)) from exc
if not entries:
response: Dict[str, Any] = {
"hash": hash_hex,
"metadata": {},
"tags": [],
"warnings": [f"No Hydrus metadata for {hash_hex or file_ids}"],
"error": "not_found",
}
if file_ids:
response["file_id"] = file_ids[0]
return response
entry = entries[0]
if not hash_hex:
entry_hash = entry.get("hash")
if isinstance(entry_hash, str) and entry_hash:
hash_hex = entry_hash
hashes = [hash_hex]
summary, primary_tags, service_key, title, clip_time = _summarize_hydrus_entry(entry, prefer_service)
is_video, is_audio, is_deleted = _compute_file_flags(entry)
has_current_file_service = _has_current_file_service(entry)
is_local = bool(entry.get("is_local"))
size_bytes = entry.get("size") or entry.get("file_size")
filesize_mb = None
if isinstance(size_bytes, (int, float)) and size_bytes > 0:
filesize_mb = float(size_bytes) / (1024.0 * 1024.0)
duration = entry.get("duration")
if duration is None and isinstance(entry.get("duration_ms"), (int, float)):
duration = float(entry["duration_ms"]) / 1000.0
warnings_list: List[str] = []
if not primary_tags:
warnings_list.append("No tags returned for preferred service")
relationships = None
relationship_metadata: Dict[str, Dict[str, Any]] = {}
if include_relationships and hash_hex:
try:
rel_spec = HydrusRequestSpec(
method="GET",
endpoint="/manage_file_relationships/get_file_relationships",
query={"hash": hash_hex},
)
relationships = client._perform_request(rel_spec)
except HydrusRequestError as exc:
warnings_list.append(f"Relationship lookup failed: {exc}")
relationships = None
if isinstance(relationships, dict):
related_hashes: Set[str] = set()
_collect_relationship_hashes(relationships, related_hashes)
related_hashes.discard(hash_hex)
if related_hashes:
try:
related_entries = _fetch_hydrus_entries(
client,
sorted(related_hashes),
None,
False,
True
)
except HydrusRequestError as exc:
warnings_list.append(f"Relationship metadata fetch failed: {exc}")
else:
for rel_entry in related_entries:
rel_hash = rel_entry.get("hash")
if not isinstance(rel_hash, str):
continue
rel_summary, rel_tags, _, rel_title, rel_clip = _summarize_hydrus_entry(rel_entry, prefer_service)
rel_summary["tags"] = rel_tags
if rel_title:
rel_summary["title"] = rel_title
if rel_clip:
rel_summary["clip_time"] = rel_clip
relationship_metadata[rel_hash] = rel_summary
result: Dict[str, Any] = {
"hash": entry.get("hash") or hash_hex,
"metadata": summary,
"tags": primary_tags,
"tag_service_key": service_key,
"title": title,
"clip_time": clip_time,
"duration": duration,
"filesize_mb": filesize_mb,
"is_video": is_video,
"is_audio": is_audio,
"is_deleted": is_deleted,
"is_local": is_local,
"has_current_file_service": has_current_file_service,
"matched_hash": entry.get("hash") or hash_hex,
"swap_recommended": False,
}
file_id_value = entry.get("file_id")
if isinstance(file_id_value, (int, float)):
result["file_id"] = int(file_id_value)
if relationships is not None:
result["relationships"] = relationships
if relationship_metadata:
result["relationship_metadata"] = relationship_metadata
if warnings_list:
result["warnings"] = warnings_list
return result
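# Usage sketch (illustrative, not part of this commit): the api_url below is
# the Hydrus client API default port; the hash and access key are placeholders.
def _example_fetch_metadata() -> None:
    payload = {
        "hash": "0" * 64,  # placeholder 64-char hex hash
        "api_url": "http://127.0.0.1:45869",
        "access_key": "<access key>",
        "options": {"minimal": True, "prefer_service_name": "my tags"},
    }
    result = fetch_hydrus_metadata(payload)
    print(result.get("title"), result["tags"], result.get("warnings"))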
def fetch_hydrus_metadata_by_url(payload: Dict[str, Any]) -> Dict[str, Any]:
raw_url = payload.get("url") or payload.get("source_url")
url = str(raw_url or "").strip()
if not url:
raise ValueError("URL is required to fetch Hydrus metadata by URL")
base_url = str(payload.get("api_url") or "").strip()
if not base_url:
raise ValueError("Hydrus api_url is required")
access_key = str(payload.get("access_key") or "").strip()
options_raw = payload.get("options")
options = options_raw if isinstance(options_raw, dict) else {}
timeout = float(options.get("timeout") or 60.0)
client = HydrusNetwork(base_url, access_key, timeout)
hashes: Optional[List[str]] = None
file_ids: Optional[List[int]] = None
matched_url = None
normalised_reported = None
seen: Set[str] = set()
queue = deque()
for variant in _generate_hydrus_url_variants(url):
queue.append(variant)
if not queue:
queue.append(url)
tried_variants: List[str] = []
while queue:
candidate = queue.popleft()
candidate = str(candidate or "").strip()
if not candidate or candidate in seen:
continue
seen.add(candidate)
tried_variants.append(candidate)
spec = HydrusRequestSpec(
method="GET",
endpoint="/add_urls/get_url_files",
query={"url": candidate},
)
try:
response = client._perform_request(spec)
except HydrusRequestError as exc:
raise RuntimeError(str(exc)) from exc
response_hashes_list: List[str] = []
response_file_ids_list: List[int] = []
if isinstance(response, dict):
normalised_value = response.get("normalised_url")
if isinstance(normalised_value, str):
trimmed = normalised_value.strip()
if trimmed:
normalised_reported = normalised_reported or trimmed
if trimmed not in seen:
queue.append(trimmed)
for redirect_key in ("redirect_url", "url"):
redirect_value = response.get(redirect_key)
if isinstance(redirect_value, str):
redirect_trimmed = redirect_value.strip()
if redirect_trimmed and redirect_trimmed not in seen:
queue.append(redirect_trimmed)
raw_hashes = response.get("hashes") or response.get("file_hashes")
if isinstance(raw_hashes, list):
for item in raw_hashes:
try:
normalized = _normalize_hash(item)
except ValueError:
continue
if normalized:
response_hashes_list.append(normalized)
raw_ids = response.get("file_ids") or response.get("file_id")
if isinstance(raw_ids, list):
for item in raw_ids:
try:
response_file_ids_list.append(int(item))
except (TypeError, ValueError):
continue
elif raw_ids is not None:
try:
response_file_ids_list.append(int(raw_ids))
except (TypeError, ValueError):
pass
statuses = response.get("url_file_statuses")
if isinstance(statuses, list):
for entry in statuses:
if not isinstance(entry, dict):
continue
status_hash = entry.get("hash") or entry.get("file_hash")
if status_hash:
try:
normalized = _normalize_hash(status_hash)
except ValueError:
normalized = None
if normalized:
response_hashes_list.append(normalized)
status_id = entry.get("file_id") or entry.get("fileid")
if status_id is not None:
try:
response_file_ids_list.append(int(status_id))
except (TypeError, ValueError):
pass
if not hashes and response_hashes_list:
hashes = response_hashes_list
if not file_ids and response_file_ids_list:
file_ids = response_file_ids_list
if hashes or file_ids:
matched_url = candidate
break
if not hashes and not file_ids:
raise RuntimeError(
"No Hydrus matches for URL variants: "
+ ", ".join(tried_variants)
)
followup_payload = {
"api_url": base_url,
"access_key": access_key,
"hash": hashes[0] if hashes else None,
"file_ids": file_ids,
"options": {"timeout": timeout, "minimal": True},
}
result = fetch_hydrus_metadata(followup_payload)
result["matched_url"] = matched_url or url
result["normalised_url"] = normalised_reported or matched_url or url
result["tried_urls"] = tried_variants
return result
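# Flow sketch (illustrative): each URL variant is tried against
# /add_urls/get_url_files; any normalised_url or redirect_url Hydrus reports
# is queued as a further candidate, and the first hash or file-id match is
# delegated to fetch_hydrus_metadata, with matched_url, normalised_url and
# tried_urls annotated on the result.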
def _build_hydrus_context(payload: Dict[str, Any]) -> Tuple["HydrusNetwork", str, str, float, Optional[str]]:
base_url = str(payload.get("api_url") or "").strip()
if not base_url:
raise ValueError("Hydrus api_url is required")
access_key = str(payload.get("access_key") or "").strip()
options_raw = payload.get("options")
options = options_raw if isinstance(options_raw, dict) else {}
timeout = float(options.get("timeout") or payload.get("timeout") or 60.0)
prefer_service = payload.get("prefer_service_name") or options.get("prefer_service_name")
if isinstance(prefer_service, str):
prefer_service = prefer_service.strip() or None
else:
prefer_service = None
client = HydrusNetwork(base_url, access_key, timeout)
return client, base_url, access_key, timeout, prefer_service
def _refetch_hydrus_summary(
base_url: str,
access_key: str,
hash_hex: str,
timeout: float,
prefer_service: Optional[str]
) -> Dict[str, Any]:
payload: Dict[str, Any] = {
"hash": hash_hex,
"api_url": base_url,
"access_key": access_key,
"options": {
"minimal": True,
"include_relationships": False,
"timeout": timeout,
},
}
if prefer_service:
payload["options"]["prefer_service_name"] = prefer_service
return fetch_hydrus_metadata(payload)
def apply_hydrus_tag_mutation(
payload: Dict[str, Any],
add: Iterable[Any],
remove: Iterable[Any]
) -> Dict[str, Any]:
client, base_url, access_key, timeout, prefer_service = _build_hydrus_context(payload)
hash_hex = _normalize_hash(payload.get("hash"))
add_list = [t for t in (_normalize_tag(tag) for tag in add) if t]
remove_list = [t for t in (_normalize_tag(tag) for tag in remove) if t]
if not add_list and not remove_list:
raise ValueError("No tag changes supplied")
service_key = payload.get("service_key") or payload.get("tag_service_key")
summary = None
if not service_key:
summary = _refetch_hydrus_summary(base_url, access_key, hash_hex, timeout, prefer_service)
service_key = summary.get("tag_service_key")
if not isinstance(service_key, str) or not service_key:
raise RuntimeError("Unable to determine Hydrus tag service key")
actions: Dict[str, List[str]] = {}
if add_list:
    actions["0"] = add_list  # Hydrus content action 0 = add tags
if remove_list:
    actions["1"] = remove_list  # Hydrus content action 1 = delete tags
if not actions:
raise ValueError("Tag mutation produced no actionable changes")
request_payload = {
"hashes": [hash_hex],
"service_keys_to_actions_to_tags": {
service_key: actions,
},
}
try:
tag_spec = HydrusRequestSpec(
method="POST",
endpoint="/add_tags/add_tags",
data=request_payload,
)
client._perform_request(tag_spec)
except HydrusRequestError as exc:
raise RuntimeError(str(exc)) from exc
summary_after = _refetch_hydrus_summary(base_url, access_key, hash_hex, timeout, prefer_service)
result = dict(summary_after)
result["added_tags"] = actions.get("0", [])
result["removed_tags"] = actions.get("1", [])
result["tag_service_key"] = summary_after.get("tag_service_key")
return result
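# Usage sketch (illustrative, not part of this commit): rename a title tag in
# one call; connection values are placeholders.
def _example_tag_mutation() -> None:
    payload = {
        "hash": "0" * 64,
        "api_url": "http://127.0.0.1:45869",
        "access_key": "<access key>",
    }
    result = apply_hydrus_tag_mutation(
        payload,
        add=["title:new name"],
        remove=["title:old name"],
    )
    print(result["added_tags"], result["removed_tags"])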

API/cmdlet.py (new file, 220 lines)

@@ -0,0 +1,220 @@
from __future__ import annotations
import contextlib
import io
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Sequence
from SYS import pipeline as ctx
from SYS.models import PipelineStageContext
from SYS.rich_display import capture_rich_output
CmdletCallable = Callable[[Any, Sequence[str], Dict[str, Any]], int]
@dataclass(slots=True)
class CmdletRunResult:
"""Programmatic result for a single cmdlet invocation."""
name: str
args: Sequence[str]
exit_code: int = 0
emitted: List[Any] = field(default_factory=list)
# Best-effort: cmdlets can publish tables/items via pipeline state even when
# they don't emit pipeline items.
result_table: Optional[Any] = None
result_items: List[Any] = field(default_factory=list)
result_subject: Optional[Any] = None
stdout: str = ""
stderr: str = ""
error: Optional[str] = None
def _normalize_cmd_name(name: str) -> str:
return str(name or "").replace("_", "-").strip().lower()
def resolve_cmdlet(cmd_name: str) -> Optional[CmdletCallable]:
"""Resolve a cmdlet callable by name from the registry (aliases supported)."""
try:
from SYS.cmdlet_catalog import ensure_registry_loaded
ensure_registry_loaded()
except Exception:
pass
try:
import cmdlet as cmdlet_pkg
return cmdlet_pkg.get(cmd_name)
except Exception:
return None
def run_cmdlet(
cmd: str | CmdletCallable,
args: Sequence[str] | None,
config: Dict[str, Any],
*,
piped: Any = None,
isolate: bool = True,
capture_output: bool = True,
stage_index: int = 0,
total_stages: int = 1,
pipe_index: Optional[int] = None,
worker_id: Optional[str] = None,
) -> CmdletRunResult:
"""Run a single cmdlet programmatically and return structured results.
This is intended for TUI/webapp consumers that want cmdlet behavior without
going through the interactive CLI loop.
Notes:
- When `isolate=True` (default) this runs inside `ctx.new_pipeline_state()` so
global CLI pipeline state is not mutated.
- Output capturing covers both normal `print()` and Rich output via
`capture_rich_output()`.
"""
normalized_args: Sequence[str] = list(args or [])
if isinstance(cmd, str):
name = _normalize_cmd_name(cmd)
cmd_fn = resolve_cmdlet(name)
else:
name = getattr(cmd, "__name__", "cmdlet")
cmd_fn = cmd
result = CmdletRunResult(name=name, args=normalized_args)
if not callable(cmd_fn):
result.exit_code = 1
result.error = f"Unknown command: {name}"
result.stderr = result.error
return result
stage_ctx = PipelineStageContext(
stage_index=int(stage_index),
total_stages=int(total_stages),
pipe_index=pipe_index,
worker_id=worker_id,
)
stdout_buffer = io.StringIO()
stderr_buffer = io.StringIO()
stage_text = " ".join([name, *list(normalized_args)]).strip()
state_cm = ctx.new_pipeline_state() if isolate else contextlib.nullcontext()
with state_cm:
# Keep behavior predictable: start from a clean slate.
try:
ctx.reset()
except Exception:
pass
try:
ctx.set_stage_context(stage_ctx)
except Exception:
pass
try:
ctx.set_current_cmdlet_name(name)
except Exception:
pass
try:
ctx.set_current_stage_text(stage_text)
except Exception:
pass
try:
ctx.set_current_command_text(stage_text)
except Exception:
pass
try:
run_cm = (
capture_rich_output(stdout=stdout_buffer, stderr=stderr_buffer)
if capture_output
else contextlib.nullcontext()
)
with run_cm:
with (
contextlib.redirect_stdout(stdout_buffer)
if capture_output
else contextlib.nullcontext()
):
with (
contextlib.redirect_stderr(stderr_buffer)
if capture_output
else contextlib.nullcontext()
):
result.exit_code = int(cmd_fn(piped, list(normalized_args), config))
except Exception as exc:
result.exit_code = 1
result.error = f"{type(exc).__name__}: {exc}"
finally:
result.stdout = stdout_buffer.getvalue()
result.stderr = stderr_buffer.getvalue()
# Prefer cmdlet emits (pipeline semantics).
try:
result.emitted = list(stage_ctx.emits or [])
except Exception:
result.emitted = []
# Mirror CLI behavior: if cmdlet emitted items and there is no overlay table,
# make emitted items the last result items for downstream consumers.
try:
has_overlay = bool(ctx.get_display_table())
except Exception:
has_overlay = False
if result.emitted and not has_overlay:
try:
ctx.set_last_result_items_only(list(result.emitted))
except Exception:
pass
# Best-effort snapshot of visible results.
try:
result.result_table = (
ctx.get_display_table() or ctx.get_current_stage_table() or ctx.get_last_result_table()
)
except Exception:
result.result_table = None
try:
result.result_items = list(ctx.get_last_result_items() or [])
except Exception:
result.result_items = []
try:
result.result_subject = ctx.get_last_result_subject()
except Exception:
result.result_subject = None
# Cleanup stage-local markers.
try:
ctx.clear_current_stage_text()
except Exception:
pass
try:
ctx.clear_current_cmdlet_name()
except Exception:
pass
try:
ctx.clear_current_command_text()
except Exception:
pass
try:
ctx.set_stage_context(None)
except Exception:
pass
return result
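# Usage sketch (illustrative, not part of this commit): the cmdlet name and
# arguments are assumptions for the example.
def _example_run_cmdlet() -> None:
    res = run_cmdlet("download-file", ["--help"], config={})
    if res.error:
        print(res.exit_code, res.stderr)
    else:
        print(res.exit_code, res.stdout)
        for item in res.emitted:
            print("emitted:", item)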