This commit is contained in:
2026-01-05 07:51:19 -08:00
parent 8545367e28
commit 1f765cffda
32 changed files with 3447 additions and 3250 deletions

View File

@@ -11,6 +11,7 @@ import shutil
import subprocess
import sys
import time
from collections import deque
from SYS.logger import log
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS as GLOBAL_SUPPORTED_EXTENSIONS
@@ -18,8 +19,8 @@ import tempfile
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Iterable, Optional, Sequence, Type, TypeVar, Union, cast
from urllib.parse import urlsplit, urlencode, quote
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Type, TypeVar, Union, cast
from urllib.parse import urlsplit, urlencode, quote, urlunsplit, unquote
import httpx
logger = logging.getLogger(__name__)
@@ -1828,3 +1829,742 @@ def download_hydrus_file(
print_final_progress(filename, file_size, elapsed)
return downloaded
# ============================================================================
# Hydrus metadata helpers (moved from SYS.metadata)
# ============================================================================
def _normalize_hash(value: Any) -> str:
candidate = str(value or "").strip().lower()
if not candidate:
raise ValueError("Hydrus hash is required")
if len(candidate) != 64 or any(ch not in "0123456789abcdef" for ch in candidate):
raise ValueError("Hydrus hash must be a 64-character hex string")
return candidate
def _normalize_tag(tag: Any) -> Optional[str]:
if tag is None:
return None
if isinstance(tag, str):
candidate = tag.strip()
else:
candidate = str(tag).strip()
return candidate or None
def _dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]:
if not tags:
return []
namespace_to_tags: Dict[Optional[str], List[Tuple[int, str]]] = {}
first_appearance: Dict[Optional[str], int] = {}
for idx, tag in enumerate(tags):
namespace: Optional[str] = tag.split(":", 1)[0] if ":" in tag else None
if namespace not in first_appearance:
first_appearance[namespace] = idx
if namespace not in namespace_to_tags:
namespace_to_tags[namespace] = []
namespace_to_tags[namespace].append((idx, tag))
result: List[Tuple[int, str]] = []
for namespace, tag_list in namespace_to_tags.items():
chosen_tag = tag_list[0][1] if keep_first else tag_list[-1][1]
result.append((first_appearance[namespace], chosen_tag))
result.sort(key=lambda x: x[0])
return [tag for _, tag in result]
def _extract_tag_services(entry: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect per-service "current" tags from a Hydrus metadata entry.

    Handles both the canonical ``service_keys_to_statuses_to_tags`` layout and
    looser per-service dicts found directly under ``entry["tags"]``.  Each
    returned record has ``service_key``, ``service_name``, and a deduplicated
    ``tags`` list (one tag per namespace, first occurrence wins).
    """
    tags_section = entry.get("tags")
    services: List[Dict[str, Any]] = []
    if not isinstance(tags_section, dict):
        return services
    names_map = tags_section.get("service_keys_to_names")
    if not isinstance(names_map, dict):
        names_map = {}
    def get_record(service_key: Optional[str], service_name: Optional[str]) -> Dict[str, Any]:
        # Find (or create) the record for this service, matching on key or
        # name case-insensitively; backfill whichever identifier was missing.
        key_lower = service_key.lower() if isinstance(service_key, str) else None
        name_lower = service_name.lower() if isinstance(service_name, str) else None
        for record in services:
            existing_key = record.get("service_key")
            if key_lower and isinstance(existing_key, str) and existing_key.lower() == key_lower:
                if service_name and not record.get("service_name"):
                    record["service_name"] = service_name
                return record
            existing_name = record.get("service_name")
            if name_lower and isinstance(existing_name, str) and existing_name.lower() == name_lower:
                if service_key and not record.get("service_key"):
                    record["service_key"] = service_key
                return record
        record = {
            "service_key": service_key,
            "service_name": service_name,
            "tags": [],
        }
        services.append(record)
        return record
    def _iter_current_status_lists(container: Any) -> Iterable[List[Any]]:
        # Yield only status "0" (current) tag lists from a statuses mapping;
        # a bare list is assumed to already be the current tags.
        if isinstance(container, dict):
            for status_key, tags_list in container.items():
                if str(status_key) != "0":
                    continue
                if isinstance(tags_list, list):
                    yield tags_list
        elif isinstance(container, list):
            yield container
    # Canonical layout: service key -> status -> tag list.
    statuses_map = tags_section.get("service_keys_to_statuses_to_tags")
    if isinstance(statuses_map, dict):
        for service_key, status_map in statuses_map.items():
            record = get_record(service_key if isinstance(service_key, str) else None, names_map.get(service_key))
            for tags_list in _iter_current_status_lists(status_map):
                for tag in tags_list:
                    normalized = _normalize_tag(tag)
                    if normalized:
                        record["tags"].append(normalized)
    # Known structural keys that are not per-service tag payloads.
    ignored_keys = {
        "service_keys_to_statuses_to_tags",
        "service_keys_to_statuses_to_display_tags",
        "service_keys_to_display_friendly_tags",
        "service_keys_to_names",
        "tag_display_types_to_namespaces",
        "namespace_display_string_lookup",
        "tag_display_decoration_colour_lookup",
    }
    # Looser layout: any remaining dict value is treated as a service payload.
    for key, service in tags_section.items():
        if key in ignored_keys:
            continue
        if isinstance(service, dict):
            service_key = service.get("service_key") or (key if isinstance(key, str) else None)
            service_name = service.get("service_name") or service.get("name") or names_map.get(service_key)
            record = get_record(service_key if isinstance(service_key, str) else None, service_name)
            # Tag storage may live under several keys and be a dict or a list.
            storage = service.get("storage_tags") or service.get("statuses_to_tags") or service.get("tags")
            if isinstance(storage, dict):
                for tags_list in _iter_current_status_lists(storage):
                    for tag in tags_list:
                        normalized = _normalize_tag(tag)
                        if normalized:
                            record["tags"].append(normalized)
            elif isinstance(storage, list):
                for tag in storage:
                    normalized = _normalize_tag(tag)
                    if normalized:
                        record["tags"].append(normalized)
    # Collapse to one tag per namespace, keeping first occurrences.
    for record in services:
        record["tags"] = _dedup_tags_by_namespace(record["tags"], keep_first=True)
    return services
def _select_primary_tags(
services: List[Dict[str, Any]],
aggregated: List[str],
prefer_service: Optional[str]
) -> Tuple[Optional[str], List[str]]:
prefer_lower = prefer_service.lower() if isinstance(prefer_service, str) else None
if prefer_lower:
for record in services:
name = record.get("service_name")
if isinstance(name, str) and name.lower() == prefer_lower and record["tags"]:
return record.get("service_key"), record["tags"]
for record in services:
if record["tags"]:
return record.get("service_key"), record["tags"]
return None, aggregated
def _derive_title(
tags_primary: List[str],
tags_aggregated: List[str],
entry: Dict[str, Any]
) -> Optional[str]:
for source in (tags_primary, tags_aggregated):
for tag in source:
namespace, sep, value = tag.partition(":")
if sep and namespace and namespace.lower() == "title":
cleaned = value.strip()
if cleaned:
return cleaned
for key in (
"title",
"display_name",
"pretty_name",
"original_display_filename",
"original_filename",
):
value = entry.get(key)
if isinstance(value, str):
cleaned = value.strip()
if cleaned:
return cleaned
return None
def _derive_clip_time(
tags_primary: List[str],
tags_aggregated: List[str],
entry: Dict[str, Any]
) -> Optional[str]:
namespaces = {"clip", "clip_time", "cliptime"}
for source in (tags_primary, tags_aggregated):
for tag in source:
namespace, sep, value = tag.partition(":")
if sep and namespace and namespace.lower() in namespaces:
cleaned = value.strip()
if cleaned:
return cleaned
clip_value = entry.get("clip_time")
if isinstance(clip_value, str):
cleaned_clip = clip_value.strip()
if cleaned_clip:
return cleaned_clip
return None
def _summarize_hydrus_entry(
    entry: Dict[str, Any],
    prefer_service: Optional[str]
) -> Tuple[Dict[str, Any], List[str], Optional[str], Optional[str], Optional[str]]:
    """Build a shallow summary dict plus derived tag/title/clip-time data.

    Returns (summary, primary_tags, tag_service_key, title, clip_time).
    """
    services = _extract_tag_services(entry)
    # Flatten all service tags in first-seen order, without duplicates.
    aggregated: List[str] = []
    seen_tags: Set[str] = set()
    for svc in services:
        for tag in svc["tags"]:
            if tag in seen_tags:
                continue
            seen_tags.add(tag)
            aggregated.append(tag)
    service_key, primary_tags = _select_primary_tags(services, aggregated, prefer_service)
    title = _derive_title(primary_tags, aggregated, entry)
    clip_time = _derive_clip_time(primary_tags, aggregated, entry)
    summary = dict(entry)
    # Derived values never clobber explicit entry fields.
    if title and not summary.get("title"):
        summary["title"] = title
    if clip_time and not summary.get("clip_time"):
        summary["clip_time"] = clip_time
    summary["tag_service_key"] = service_key
    summary["has_current_file_service"] = _has_current_file_service(entry)
    summary.setdefault("is_local", bool(entry.get("is_local")))
    return summary, primary_tags, service_key, title, clip_time
def _looks_like_hash(value: Any) -> bool:
if not isinstance(value, str):
return False
candidate = value.strip().lower()
return len(candidate) == 64 and all(ch in "0123456789abcdef" for ch in candidate)
def _collect_relationship_hashes(payload: Any, accumulator: Set[str]) -> None:
    """Recursively gather every hash-shaped string in *payload* into *accumulator*."""
    if isinstance(payload, dict):
        payload = list(payload.values())
    if isinstance(payload, (list, tuple, set)):
        for item in payload:
            _collect_relationship_hashes(item, accumulator)
        return
    if isinstance(payload, str) and _looks_like_hash(payload):
        accumulator.add(payload)
def _generate_hydrus_url_variants(url: str) -> List[str]:
seen: Set[str] = set()
variants: List[str] = []
def push(candidate: Optional[str]) -> None:
if not candidate:
return
text = candidate.strip()
if not text or text in seen:
return
seen.add(text)
variants.append(text)
push(url)
try:
parsed = urlsplit(url)
except Exception:
return variants
if parsed.scheme in {"http", "https"}:
alternate_scheme = "https" if parsed.scheme == "http" else "http"
push(urlunsplit((alternate_scheme, parsed.netloc, parsed.path, parsed.query, parsed.fragment)))
normalised_netloc = parsed.netloc.lower()
if normalised_netloc and normalised_netloc != parsed.netloc:
push(urlunsplit((parsed.scheme, normalised_netloc, parsed.path, parsed.query, parsed.fragment)))
if parsed.path:
trimmed_path = parsed.path.rstrip("/")
if trimmed_path != parsed.path:
push(urlunsplit((parsed.scheme, parsed.netloc, trimmed_path, parsed.query, parsed.fragment)))
else:
push(urlunsplit((parsed.scheme, parsed.netloc, parsed.path + "/", parsed.query, parsed.fragment)))
unquoted_path = unquote(parsed.path)
if unquoted_path != parsed.path:
push(urlunsplit((parsed.scheme, parsed.netloc, unquoted_path, parsed.query, parsed.fragment)))
if parsed.query or parsed.fragment:
push(urlunsplit((parsed.scheme, parsed.netloc, parsed.path, "", "")))
if parsed.path:
unquoted_path = unquote(parsed.path)
push(urlunsplit((parsed.scheme, parsed.netloc, unquoted_path, "", "")))
return variants
def _build_hydrus_query(
hashes: Optional[Sequence[str]],
file_ids: Optional[Sequence[int]],
include_relationships: bool,
minimal: bool,
) -> Dict[str, str]:
query: Dict[str, str] = {}
if hashes:
query["hashes"] = json.dumps([_normalize_hash(h) for h in hashes])
if file_ids:
query["file_ids"] = json.dumps([int(fid) for fid in file_ids])
if not query:
raise ValueError("hashes or file_ids must be provided")
query["include_service_keys_to_tags"] = json.dumps(True)
query["include_tag_services"] = json.dumps(True)
query["include_file_services"] = json.dumps(True)
if include_relationships:
query["include_file_relationships"] = json.dumps(True)
if not minimal:
extras = (
"include_url",
"include_size",
"include_width",
"include_height",
"include_duration",
"include_mime",
"include_has_audio",
"include_is_trashed",
)
for key in extras:
query[key] = json.dumps(True)
return query
def _fetch_hydrus_entries(
    client: "HydrusNetwork",
    hashes: Optional[Sequence[str]],
    file_ids: Optional[Sequence[int]],
    include_relationships: bool,
    minimal: bool,
) -> List[Dict[str, Any]]:
    """Fetch file metadata entries from Hydrus, keeping only dict entries.

    Returns an empty list when no identifiers are given or the response is
    not in the expected shape.
    """
    if not hashes and not file_ids:
        return []
    request = HydrusRequestSpec(
        method="GET",
        endpoint="/get_files/file_metadata",
        query=_build_hydrus_query(hashes, file_ids, include_relationships, minimal),
    )
    response = client._perform_request(request)
    if not isinstance(response, dict):
        return []
    raw_entries = response.get("metadata")
    if not isinstance(raw_entries, list):
        return []
    return [item for item in raw_entries if isinstance(item, dict)]
def _has_current_file_service(entry: Dict[str, Any]) -> bool:
services = entry.get("file_services")
if not isinstance(services, dict):
return False
current = services.get("current")
if isinstance(current, dict):
for value in current.values():
if value:
return True
return False
if isinstance(current, list):
return len(current) > 0
return False
def _compute_file_flags(entry: Dict[str, Any]) -> Tuple[bool, bool, bool]:
mime = entry.get("mime")
mime_lower = mime.lower() if isinstance(mime, str) else ""
is_video = mime_lower.startswith("video/")
is_audio = mime_lower.startswith("audio/")
is_deleted = bool(entry.get("is_trashed"))
file_services = entry.get("file_services")
if not is_deleted and isinstance(file_services, dict):
deleted = file_services.get("deleted")
if isinstance(deleted, dict) and deleted:
is_deleted = True
return is_video, is_audio, is_deleted
def fetch_hydrus_metadata(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Fetch and summarize Hydrus metadata for a single file.

    ``payload`` keys:
        hash: 64-char hex hash (optional when file id(s) are given).
        file_ids / file_id: numeric Hydrus file identifiers.
        api_url: Hydrus client API base URL (required).
        access_key: Hydrus API access key.
        options: optional dict — prefer_service_name, include_relationships,
            minimal, timeout.

    Returns a dict with the summarized metadata, primary tags, derived
    title/clip_time, size/duration, file flags, and (optionally) relationship
    data.  Raises ValueError for invalid input and RuntimeError on API failure.
    """
    # Normalize the requested hash, when one was supplied.
    hash_hex = None
    raw_hash_value = payload.get("hash")
    if raw_hash_value is not None:
        hash_hex = _normalize_hash(raw_hash_value)
    # Collect file ids from "file_ids" (scalar or sequence) and "file_id",
    # silently skipping values that cannot be coerced to int.
    file_ids: List[int] = []
    raw_file_ids = payload.get("file_ids")
    if isinstance(raw_file_ids, (list, tuple, set)):
        for value in raw_file_ids:
            try:
                file_ids.append(int(value))
            except (TypeError, ValueError):
                continue
    elif raw_file_ids is not None:
        try:
            file_ids.append(int(raw_file_ids))
        except (TypeError, ValueError):
            file_ids = []
    raw_file_id = payload.get("file_id")
    if raw_file_id is not None:
        try:
            coerced = int(raw_file_id)
        except (TypeError, ValueError):
            coerced = None
        if coerced is not None and coerced not in file_ids:
            file_ids.append(coerced)
    base_url = str(payload.get("api_url") or "").strip()
    if not base_url:
        raise ValueError("Hydrus api_url is required")
    access_key = str(payload.get("access_key") or "").strip()
    options_raw = payload.get("options")
    options = options_raw if isinstance(options_raw, dict) else {}
    prefer_service = options.get("prefer_service_name")
    if isinstance(prefer_service, str):
        prefer_service = prefer_service.strip()
    else:
        prefer_service = None
    include_relationships = bool(options.get("include_relationships"))
    minimal = bool(options.get("minimal"))
    timeout = float(options.get("timeout") or 60.0)
    client = HydrusNetwork(base_url, access_key, timeout)
    hashes: Optional[List[str]] = None
    if hash_hex:
        hashes = [hash_hex]
    if not hashes and not file_ids:
        raise ValueError("Hydrus hash or file id is required")
    try:
        entries = _fetch_hydrus_entries(
            client,
            hashes,
            file_ids or None,
            include_relationships,
            minimal
        )
    except HydrusRequestError as exc:
        raise RuntimeError(str(exc))
    # No match: return a structured "not_found" response rather than raising.
    if not entries:
        response: Dict[str, Any] = {
            "hash": hash_hex,
            "metadata": {},
            "tags": [],
            "warnings": [f"No Hydrus metadata for {hash_hex or file_ids}"],
            "error": "not_found",
        }
        if file_ids:
            response["file_id"] = file_ids[0]
        return response
    # Only the first matching entry is summarized.
    entry = entries[0]
    if not hash_hex:
        # Backfill the hash from the entry when the lookup was by file id.
        entry_hash = entry.get("hash")
        if isinstance(entry_hash, str) and entry_hash:
            hash_hex = entry_hash
            hashes = [hash_hex]
    summary, primary_tags, service_key, title, clip_time = _summarize_hydrus_entry(entry, prefer_service)
    is_video, is_audio, is_deleted = _compute_file_flags(entry)
    has_current_file_service = _has_current_file_service(entry)
    is_local = bool(entry.get("is_local"))
    size_bytes = entry.get("size") or entry.get("file_size")
    filesize_mb = None
    if isinstance(size_bytes, (int, float)) and size_bytes > 0:
        filesize_mb = float(size_bytes) / (1024.0 * 1024.0)
    duration = entry.get("duration")
    if duration is None and isinstance(entry.get("duration_ms"), (int, float)):
        # duration_ms is in milliseconds; convert to seconds.
        duration = float(entry["duration_ms"]) / 1000.0
    warnings_list: List[str] = []
    if not primary_tags:
        warnings_list.append("No tags returned for preferred service")
    # Optionally pull file relationships plus minimal metadata for relatives.
    relationships = None
    relationship_metadata: Dict[str, Dict[str, Any]] = {}
    if include_relationships and hash_hex:
        try:
            rel_spec = HydrusRequestSpec(
                method="GET",
                endpoint="/manage_file_relationships/get_file_relationships",
                query={"hash": hash_hex},
            )
            relationships = client._perform_request(rel_spec)
        except HydrusRequestError as exc:
            # Relationship lookup is best-effort: degrade to a warning.
            warnings_list.append(f"Relationship lookup failed: {exc}")
            relationships = None
        if isinstance(relationships, dict):
            related_hashes: Set[str] = set()
            _collect_relationship_hashes(relationships, related_hashes)
            related_hashes.discard(hash_hex)
            if related_hashes:
                try:
                    related_entries = _fetch_hydrus_entries(
                        client,
                        sorted(related_hashes),
                        None,
                        False,
                        True
                    )
                except HydrusRequestError as exc:
                    warnings_list.append(f"Relationship metadata fetch failed: {exc}")
                else:
                    # Summarize each related file keyed by its hash.
                    for rel_entry in related_entries:
                        rel_hash = rel_entry.get("hash")
                        if not isinstance(rel_hash, str):
                            continue
                        rel_summary, rel_tags, _, rel_title, rel_clip = _summarize_hydrus_entry(rel_entry, prefer_service)
                        rel_summary["tags"] = rel_tags
                        if rel_title:
                            rel_summary["title"] = rel_title
                        if rel_clip:
                            rel_summary["clip_time"] = rel_clip
                        relationship_metadata[rel_hash] = rel_summary
    result: Dict[str, Any] = {
        "hash": entry.get("hash") or hash_hex,
        "metadata": summary,
        "tags": primary_tags,
        "tag_service_key": service_key,
        "title": title,
        "clip_time": clip_time,
        "duration": duration,
        "filesize_mb": filesize_mb,
        "is_video": is_video,
        "is_audio": is_audio,
        "is_deleted": is_deleted,
        "is_local": is_local,
        "has_current_file_service": has_current_file_service,
        "matched_hash": entry.get("hash") or hash_hex,
        "swap_recommended": False,
    }
    file_id_value = entry.get("file_id")
    if isinstance(file_id_value, (int, float)):
        result["file_id"] = int(file_id_value)
    if relationships is not None:
        result["relationships"] = relationships
    if relationship_metadata:
        result["relationship_metadata"] = relationship_metadata
    if warnings_list:
        result["warnings"] = warnings_list
    return result
def fetch_hydrus_metadata_by_url(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Resolve a source URL to a Hydrus file and fetch its metadata.

    Tries URL variants (scheme/host/slash/quoting permutations) against
    /add_urls/get_url_files until one yields a hash or file id, following any
    normalised/redirect URLs Hydrus reports, then delegates the actual
    metadata fetch to fetch_hydrus_metadata.  Raises ValueError for bad
    input and RuntimeError when no variant matches or an API call fails.
    """
    raw_url = payload.get("url") or payload.get("source_url")
    url = str(raw_url or "").strip()
    if not url:
        raise ValueError("URL is required to fetch Hydrus metadata by URL")
    base_url = str(payload.get("api_url") or "").strip()
    if not base_url:
        raise ValueError("Hydrus api_url is required")
    access_key = str(payload.get("access_key") or "").strip()
    options_raw = payload.get("options")
    options = options_raw if isinstance(options_raw, dict) else {}
    timeout = float(options.get("timeout") or 60.0)
    client = HydrusNetwork(base_url, access_key, timeout)
    hashes: Optional[List[str]] = None
    file_ids: Optional[List[int]] = None
    matched_url = None
    normalised_reported = None
    # Breadth-first over URL variants; Hydrus-reported normalised/redirect
    # URLs are appended to the queue as they are discovered.
    seen: Set[str] = set()
    queue = deque()
    for variant in _generate_hydrus_url_variants(url):
        queue.append(variant)
    if not queue:
        queue.append(url)
    tried_variants: List[str] = []
    while queue:
        candidate = queue.popleft()
        candidate = str(candidate or "").strip()
        if not candidate or candidate in seen:
            continue
        seen.add(candidate)
        tried_variants.append(candidate)
        spec = HydrusRequestSpec(
            method="GET",
            endpoint="/add_urls/get_url_files",
            query={"url": candidate},
        )
        try:
            response = client._perform_request(spec)
        except HydrusRequestError as exc:
            raise RuntimeError(str(exc))
        response_hashes_list: List[str] = []
        response_file_ids_list: List[int] = []
        if isinstance(response, dict):
            # Record the first normalised URL Hydrus reports and enqueue it
            # (and any redirect URLs) as further candidates.
            normalised_value = response.get("normalised_url")
            if isinstance(normalised_value, str):
                trimmed = normalised_value.strip()
                if trimmed:
                    normalised_reported = normalised_reported or trimmed
                    if trimmed not in seen:
                        queue.append(trimmed)
            for redirect_key in ("redirect_url", "url"):
                redirect_value = response.get(redirect_key)
                if isinstance(redirect_value, str):
                    redirect_trimmed = redirect_value.strip()
                    if redirect_trimmed and redirect_trimmed not in seen:
                        queue.append(redirect_trimmed)
            # Hashes may arrive under several keys; skip malformed values.
            raw_hashes = response.get("hashes") or response.get("file_hashes")
            if isinstance(raw_hashes, list):
                for item in raw_hashes:
                    try:
                        normalized = _normalize_hash(item)
                    except ValueError:
                        continue
                    if normalized:
                        response_hashes_list.append(normalized)
            raw_ids = response.get("file_ids") or response.get("file_id")
            if isinstance(raw_ids, list):
                for item in raw_ids:
                    try:
                        response_file_ids_list.append(int(item))
                    except (TypeError, ValueError):
                        continue
            elif raw_ids is not None:
                try:
                    response_file_ids_list.append(int(raw_ids))
                except (TypeError, ValueError):
                    pass
            # url_file_statuses entries can also carry hashes / file ids.
            statuses = response.get("url_file_statuses")
            if isinstance(statuses, list):
                for entry in statuses:
                    if not isinstance(entry, dict):
                        continue
                    status_hash = entry.get("hash") or entry.get("file_hash")
                    if status_hash:
                        try:
                            normalized = _normalize_hash(status_hash)
                        except ValueError:
                            normalized = None
                        if normalized:
                            response_hashes_list.append(normalized)
                    status_id = entry.get("file_id") or entry.get("fileid")
                    if status_id is not None:
                        try:
                            response_file_ids_list.append(int(status_id))
                        except (TypeError, ValueError):
                            pass
        # First candidate with any identifiers wins.
        if not hashes and response_hashes_list:
            hashes = response_hashes_list
        if not file_ids and response_file_ids_list:
            file_ids = response_file_ids_list
        if hashes or file_ids:
            matched_url = candidate
            break
    if not hashes and not file_ids:
        raise RuntimeError(
            "No Hydrus matches for URL variants: "
            + ", ".join(tried_variants)
        )
    followup_payload = {
        "api_url": base_url,
        "access_key": access_key,
        "hash": hashes[0] if hashes else None,
        "file_ids": file_ids,
        "options": {"timeout": timeout, "minimal": True},
    }
    result = fetch_hydrus_metadata(followup_payload)
    result["matched_url"] = matched_url or url
    result["normalised_url"] = normalised_reported or matched_url or url
    result["tried_urls"] = tried_variants
    return result
def _build_hydrus_context(payload: Dict[str, Any]) -> Tuple["HydrusNetwork", str, str, float, Optional[str]]:
    """Extract connection settings from *payload* and build a Hydrus client.

    Returns (client, api_url, access_key, timeout, prefer_service_name).
    Raises ValueError when api_url is missing.
    """
    api_url = str(payload.get("api_url") or "").strip()
    if not api_url:
        raise ValueError("Hydrus api_url is required")
    key = str(payload.get("access_key") or "").strip()
    opts = payload.get("options")
    if not isinstance(opts, dict):
        opts = {}
    # Payload-level settings win over options-level ones for timeout/service.
    timeout = float(opts.get("timeout") or payload.get("timeout") or 60.0)
    preferred = payload.get("prefer_service_name") or opts.get("prefer_service_name")
    if isinstance(preferred, str):
        preferred = preferred.strip() or None
    else:
        preferred = None
    client = HydrusNetwork(api_url, key, timeout)
    return client, api_url, key, timeout, preferred
def _refetch_hydrus_summary(
    base_url: str,
    access_key: str,
    hash_hex: str,
    timeout: float,
    prefer_service: Optional[str]
) -> Dict[str, Any]:
    """Re-fetch a minimal, relationship-free metadata summary for *hash_hex*."""
    options: Dict[str, Any] = {
        "minimal": True,
        "include_relationships": False,
        "timeout": timeout,
    }
    if prefer_service:
        options["prefer_service_name"] = prefer_service
    return fetch_hydrus_metadata({
        "hash": hash_hex,
        "api_url": base_url,
        "access_key": access_key,
        "options": options,
    })
def apply_hydrus_tag_mutation(
    payload: Dict[str, Any],
    add: Iterable[Any],
    remove: Iterable[Any]
) -> Dict[str, Any]:
    """Add and/or remove tags on a Hydrus file, then return a fresh summary.

    ``payload`` must contain the connection fields used by
    ``_build_hydrus_context`` plus the target ``hash``; an explicit
    ``service_key``/``tag_service_key`` skips the pre-mutation lookup.

    Returns the post-mutation summary augmented with ``added_tags``,
    ``removed_tags``, and ``tag_service_key``.  Raises ValueError when no
    effective changes are supplied and RuntimeError on API failure or when
    the tag service key cannot be resolved.
    """
    client, base_url, access_key, timeout, prefer_service = _build_hydrus_context(payload)
    hash_hex = _normalize_hash(payload.get("hash"))
    # Normalize each tag exactly once (the original normalized twice per tag);
    # _normalize_tag returns None for empty input, which the filter drops.
    add_list = [tag for tag in (_normalize_tag(t) for t in add) if tag]
    remove_list = [tag for tag in (_normalize_tag(t) for t in remove) if tag]
    if not add_list and not remove_list:
        raise ValueError("No tag changes supplied")
    service_key = payload.get("service_key") or payload.get("tag_service_key")
    if not service_key:
        # No explicit service key: resolve it from the current metadata.
        summary = _refetch_hydrus_summary(base_url, access_key, hash_hex, timeout, prefer_service)
        service_key = summary.get("tag_service_key")
    if not isinstance(service_key, str) or not service_key:
        raise RuntimeError("Unable to determine Hydrus tag service key")
    # Hydrus add_tags action codes: "0" == add, "1" == delete.
    actions: Dict[str, List[str]] = {}
    if add_list:
        actions["0"] = add_list
    if remove_list:
        actions["1"] = remove_list
    if not actions:
        raise ValueError("Tag mutation produced no actionable changes")
    request_payload = {
        "hashes": [hash_hex],
        "service_keys_to_actions_to_tags": {
            service_key: actions,
        },
    }
    try:
        tag_spec = HydrusRequestSpec(
            method="POST",
            endpoint="/add_tags/add_tags",
            data=request_payload,
        )
        client._perform_request(tag_spec)
    except HydrusRequestError as exc:
        # Preserve the original failure as the cause for easier debugging.
        raise RuntimeError(str(exc)) from exc
    # Re-fetch so the caller sees the post-mutation state.
    summary_after = _refetch_hydrus_summary(base_url, access_key, hash_hex, timeout, prefer_service)
    result = dict(summary_after)
    result["added_tags"] = actions.get("0", [])
    result["removed_tags"] = actions.get("1", [])
    result["tag_service_key"] = summary_after.get("tag_service_key")
    return result