This commit is contained in:
nose
2025-12-20 23:57:44 -08:00
parent b75faa49a2
commit 8ca5783970
39 changed files with 4294 additions and 1722 deletions

View File

@@ -217,10 +217,13 @@ class HydrusNetwork(Store):
# Add title to tags if provided and not already present
if title:
title_tag = f"title:{title}"
title_tag = f"title:{title}".strip().lower()
if not any(str(candidate).lower().startswith("title:") for candidate in tag_list):
tag_list = [title_tag] + list(tag_list)
# Hydrus is lowercase-only tags; normalize here for consistency.
tag_list = [str(t).strip().lower() for t in (tag_list or []) if isinstance(t, str) and str(t).strip()]
try:
# Compute file hash
file_hash = sha256_file(file_path)
@@ -445,6 +448,36 @@ class HydrusNetwork(Store):
query_lower = query.lower().strip()
# Support `ext:<value>` anywhere in the query. We filter results by the
# Hydrus metadata extension field.
def _normalize_ext_filter(value: str) -> str:
v = str(value or "").strip().lower().lstrip('.')
v = "".join(ch for ch in v if ch.isalnum())
return v
ext_filter: str | None = None
ext_only: bool = False
try:
m = re.search(r"\bext:([^\s,]+)", query_lower)
if not m:
m = re.search(r"\bextension:([^\s,]+)", query_lower)
if m:
ext_filter = _normalize_ext_filter(m.group(1)) or None
query_lower = re.sub(r"\s*\b(?:ext|extension):[^\s,]+", " ", query_lower)
query_lower = re.sub(r"\s{2,}", " ", query_lower).strip().strip(',')
query = query_lower
if ext_filter and not query_lower:
query = "*"
query_lower = "*"
ext_only = True
except Exception:
ext_filter = None
ext_only = False
# Split into meaningful terms for AND logic.
# Avoid punctuation tokens like '-' that would make matching brittle.
search_terms = [t for t in re.findall(r"[a-z0-9]+", query_lower) if t]
# Special case: url:* and url:<value>
metadata_list: list[dict[str, Any]] | None = None
if ":" in query_lower and not query_lower.startswith(":"):
@@ -508,54 +541,268 @@ class HydrusNetwork(Store):
metadata_list = _iter_url_filtered_metadata(pattern, want_any=False, fetch_limit=int(limit) if limit else 100)
# Parse the query into tags
# Handle both simple tags and complex queries
# "*" means "match all" - use system:everything tag in Hydrus
# If query has explicit namespace, use it as a tag search.
# If query is free-form, search BOTH:
# - title:*term* (title: is the only namespace searched implicitly)
# - *term* (freeform tags; we will filter out other namespace matches client-side)
tags: list[str] = []
freeform_union_search: bool = False
title_predicates: list[str] = []
freeform_predicates: list[str] = []
if query.strip() == "*":
# Use system:everything to match all files in Hydrus
tags = ["system:everything"]
elif ':' in query_lower:
tags = [query_lower]
else:
# If query doesn't have a namespace (no ':'), search all files and filter by title/tags
# If query has explicit namespace, use it as a tag search
if ':' not in query_lower:
# No namespace provided: search all files, then filter by title/tags containing the query
tags = ["system:everything"]
freeform_union_search = True
if search_terms:
# Hydrus supports wildcard matching primarily as a prefix (e.g., tag*).
# Use per-term prefix matching for both title: and freeform tags.
title_predicates = [f"title:{term}*" for term in search_terms]
freeform_predicates = [f"{term}*" for term in search_terms]
else:
# User provided explicit namespace (e.g., "creator:john" or "system:has_audio")
# Use it as a tag search
tags = [query_lower]
if not tags:
debug(f"{prefix} 0 result(s)")
return []
# If we can't extract alnum terms, fall back to the raw query text.
title_predicates = [f"title:{query_lower}*"]
freeform_predicates = [f"{query_lower}*"]
# Search files with the tags (unless url: search already produced metadata)
results = []
# Split by comma or space for AND logic
search_terms = set(query_lower.replace(',', ' ').split()) # For substring matching
def _extract_search_ids(payload: Any) -> tuple[list[int], list[str]]:
if not isinstance(payload, dict):
return [], []
raw_ids = payload.get("file_ids", [])
raw_hashes = payload.get("hashes", [])
ids_out: list[int] = []
hashes_out: list[str] = []
if isinstance(raw_ids, list):
for item in raw_ids:
try:
ids_out.append(int(item))
except (TypeError, ValueError):
continue
if isinstance(raw_hashes, list):
hashes_out = [str(h).strip() for h in raw_hashes if isinstance(h, str) and str(h).strip()]
return ids_out, hashes_out
if metadata_list is None:
search_result = client.search_files(
tags=tags,
return_hashes=True,
return_file_ids=True
)
file_ids: list[int] = []
hashes: list[str] = []
file_ids = search_result.get("file_ids", []) if isinstance(search_result, dict) else []
hashes = search_result.get("hashes", []) if isinstance(search_result, dict) else []
if freeform_union_search:
if not title_predicates and not freeform_predicates:
debug(f"{prefix} 0 result(s)")
return []
payloads: list[Any] = []
try:
payloads.append(
client.search_files(
tags=title_predicates,
return_hashes=True,
return_file_ids=True,
)
)
except Exception:
pass
try:
payloads.append(
client.search_files(
tags=freeform_predicates,
return_hashes=True,
return_file_ids=True,
)
)
except Exception:
pass
id_set: set[int] = set()
hash_set: set[str] = set()
for payload in payloads:
ids_part, hashes_part = _extract_search_ids(payload)
for fid in ids_part:
id_set.add(fid)
for hh in hashes_part:
hash_set.add(hh)
file_ids = list(id_set)
hashes = list(hash_set)
else:
if not tags:
debug(f"{prefix} 0 result(s)")
return []
search_result = client.search_files(
tags=tags,
return_hashes=True,
return_file_ids=True
)
file_ids, hashes = _extract_search_ids(search_result)
# Fast path: ext-only search. Avoid fetching metadata for an unbounded
# system:everything result set; fetch in chunks until we have enough.
if ext_only and ext_filter:
results: list[dict[str, Any]] = []
if not file_ids and not hashes:
debug(f"{prefix} 0 result(s)")
return []
# Prefer file_ids if available.
if file_ids:
chunk_size = 200
for start in range(0, len(file_ids), chunk_size):
if len(results) >= limit:
break
chunk = file_ids[start : start + chunk_size]
try:
payload = client.fetch_file_metadata(
file_ids=chunk,
include_service_keys_to_tags=True,
include_file_url=False,
include_duration=True,
include_size=True,
include_mime=True,
)
except Exception:
continue
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
if not isinstance(metas, list):
continue
for meta in metas:
if len(results) >= limit:
break
if not isinstance(meta, dict):
continue
mime_type = meta.get("mime")
ext = str(meta.get("ext") or "").strip().lstrip('.')
if not ext and mime_type:
for category in mime_maps.values():
for _ext_key, info in category.items():
if mime_type in info.get("mimes", []):
ext = str(info.get("ext", "")).strip().lstrip('.')
break
if ext:
break
if _normalize_ext_filter(ext) != ext_filter:
continue
file_id = meta.get("file_id")
hash_hex = meta.get("hash")
size = meta.get("size", 0)
tags_set = meta.get("tags", {})
all_tags: list[str] = []
title = f"Hydrus File {file_id}"
if isinstance(tags_set, dict):
def _collect(tag_list: Any) -> None:
nonlocal title
if not isinstance(tag_list, list):
return
for tag in tag_list:
tag_text = str(tag) if tag else ""
if not tag_text:
continue
tag_l = tag_text.strip().lower()
if not tag_l:
continue
all_tags.append(tag_l)
if tag_l.startswith("title:") and title == f"Hydrus File {file_id}":
title = tag_l.split(":", 1)[1].strip()
for _service_name, service_tags in tags_set.items():
if not isinstance(service_tags, dict):
continue
storage_tags = service_tags.get("storage_tags", {})
if isinstance(storage_tags, dict):
for tag_list in storage_tags.values():
_collect(tag_list)
display_tags = service_tags.get("display_tags", [])
_collect(display_tags)
file_url = f"{self.URL.rstrip('/')}/get_files/file?hash={hash_hex}"
results.append(
{
"hash": hash_hex,
"url": file_url,
"name": title,
"title": title,
"size": size,
"size_bytes": size,
"store": self.NAME,
"tag": all_tags,
"file_id": file_id,
"mime": mime_type,
"ext": ext,
}
)
debug(f"{prefix} {len(results)} result(s)")
return results[:limit]
# If we only got hashes, fall back to the normal flow below.
if not file_ids and not hashes:
debug(f"{prefix} 0 result(s)")
return []
if file_ids:
metadata = client.fetch_file_metadata(file_ids=file_ids)
metadata = client.fetch_file_metadata(
file_ids=file_ids,
include_service_keys_to_tags=True,
include_file_url=False,
include_duration=True,
include_size=True,
include_mime=True,
)
metadata_list = metadata.get("metadata", [])
elif hashes:
metadata = client.fetch_file_metadata(hashes=hashes)
metadata = client.fetch_file_metadata(
hashes=hashes,
include_service_keys_to_tags=True,
include_file_url=False,
include_duration=True,
include_size=True,
include_mime=True,
)
metadata_list = metadata.get("metadata", [])
else:
metadata_list = []
# If our free-text searches produce nothing (or nothing survived downstream filtering), fallback to scanning.
if (not metadata_list) and (query_lower != "*") and (":" not in query_lower):
try:
search_result = client.search_files(
tags=["system:everything"],
return_hashes=True,
return_file_ids=True,
)
file_ids, hashes = _extract_search_ids(search_result)
if file_ids:
metadata = client.fetch_file_metadata(
file_ids=file_ids,
include_service_keys_to_tags=True,
include_file_url=False,
include_duration=True,
include_size=True,
include_mime=True,
)
metadata_list = metadata.get("metadata", [])
elif hashes:
metadata = client.fetch_file_metadata(
hashes=hashes,
include_service_keys_to_tags=True,
include_file_url=False,
include_duration=True,
include_size=True,
include_mime=True,
)
metadata_list = metadata.get("metadata", [])
except Exception:
pass
if not isinstance(metadata_list, list):
metadata_list = []
@@ -585,10 +832,13 @@ class HydrusNetwork(Store):
tag_text = str(tag) if tag else ""
if not tag_text:
continue
all_tags.append(tag_text)
all_tags_str += " " + tag_text.lower()
if tag_text.lower().startswith("title:") and title == f"Hydrus File {file_id}":
title = tag_text.split(":", 1)[1].strip()
tag_l = tag_text.strip().lower()
if not tag_l:
continue
all_tags.append(tag_l)
all_tags_str += " " + tag_l
if tag_l.startswith("title:") and title == f"Hydrus File {file_id}":
title = tag_l.split(":", 1)[1].strip()
for _service_name, service_tags in tags_set.items():
if not isinstance(service_tags, dict):
@@ -641,20 +891,15 @@ class HydrusNetwork(Store):
"ext": ext,
})
else:
# Free-form search: check if search terms match the title or tags
# Match if ALL search terms are found in title or tags (AND logic)
# AND use whole word matching
# Combine title and tags for searching
searchable_text = (title + " " + all_tags_str).lower()
# Free-form search: check if search terms match title or FREEFORM tags.
# Do NOT implicitly match other namespace tags (except title:).
freeform_tags = [t for t in all_tags if isinstance(t, str) and t and (":" not in t)]
searchable_text = (title + " " + " ".join(freeform_tags)).lower()
match = True
if query_lower != "*":
if query_lower != "*" and search_terms:
for term in search_terms:
# Regex for whole word: \bterm\b
# Escape term to handle special chars
pattern = r'\b' + re.escape(term) + r'\b'
if not re.search(pattern, searchable_text):
if term not in searchable_text:
match = False
break
@@ -675,6 +920,17 @@ class HydrusNetwork(Store):
})
debug(f"{prefix} {len(results)} result(s)")
if ext_filter:
wanted = ext_filter
filtered: list[dict[str, Any]] = []
for item in results:
try:
if _normalize_ext_filter(str(item.get("ext") or "")) == wanted:
filtered.append(item)
except Exception:
continue
results = filtered
return results[:limit]
except Exception as exc:
@@ -903,8 +1159,8 @@ class HydrusNetwork(Store):
# Extract tags from metadata
tags = self._extract_tags_from_hydrus_meta(meta, service_key, service_name)
return tags, "hydrus"
return [str(t).strip().lower() for t in tags if isinstance(t, str) and t.strip()], "hydrus"
except Exception as exc:
debug(f"{self._log_prefix()} get_tags failed: {exc}")
@@ -924,12 +1180,38 @@ class HydrusNetwork(Store):
debug(f"{self._log_prefix()} add_tag: invalid file hash '{file_identifier}'")
return False
service_name = kwargs.get("service_name") or "my tags"
# Ensure tags is a list
tag_list = list(tags) if isinstance(tags, (list, tuple)) else [str(tags)]
if not tag_list:
return False
client.add_tag(file_hash, tag_list, service_name)
return True
incoming_tags = [str(t).strip().lower() for t in (tags or []) if isinstance(t, str) and str(t).strip()]
if not incoming_tags:
return True
try:
existing_tags, _src = self.get_tag(file_hash)
except Exception:
existing_tags = []
from metadata import compute_namespaced_tag_overwrite
tags_to_remove, tags_to_add, _merged = compute_namespaced_tag_overwrite(existing_tags, incoming_tags)
if not tags_to_add and not tags_to_remove:
return True
did_any = False
if tags_to_remove:
try:
client.delete_tag(file_hash, tags_to_remove, service_name)
did_any = True
except Exception as exc:
debug(f"{self._log_prefix()} add_tag: delete_tag failed: {exc}")
if tags_to_add:
try:
client.add_tag(file_hash, tags_to_add, service_name)
did_any = True
except Exception as exc:
debug(f"{self._log_prefix()} add_tag: add_tag failed: {exc}")
return did_any
except Exception as exc:
debug(f"{self._log_prefix()} add_tag failed: {exc}")
return False
@@ -948,7 +1230,8 @@ class HydrusNetwork(Store):
debug(f"{self._log_prefix()} delete_tag: invalid file hash '{file_identifier}'")
return False
service_name = kwargs.get("service_name") or "my tags"
tag_list = list(tags) if isinstance(tags, (list, tuple)) else [str(tags)]
raw_list = list(tags) if isinstance(tags, (list, tuple)) else [str(tags)]
tag_list = [str(t).strip().lower() for t in raw_list if isinstance(t, str) and str(t).strip()]
if not tag_list:
return False
client.delete_tag(file_hash, tag_list, service_name)
@@ -1014,6 +1297,38 @@ class HydrusNetwork(Store):
debug(f"{self._log_prefix()} add_url failed: {exc}")
return False
def add_url_bulk(self, items: List[tuple[str, List[str]]], **kwargs: Any) -> bool:
    """Bulk associate urls with Hydrus files.

    This is a best-effort convenience wrapper used by cmdlets to batch url associations.
    Hydrus' client API is still called per (hash,url) pair, but this consolidates the
    cmdlet-level control flow so url association can be deferred until the end.

    Args:
        items: ``(file_hash, urls)`` pairs. Hashes are trimmed and lower-cased,
            and pairs whose hash is not 64 characters long are skipped.
            ``urls`` is normally a list; a single bare string is treated as
            one url rather than being iterated character by character.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        True if at least one association call succeeded, otherwise False
        (including when the client is unavailable).
    """
    try:
        client = self._client
        if client is None:
            debug(f"{self._log_prefix()} add_url_bulk: client unavailable")
            return False

        any_success = False
        for entry in (items or []):
            # Best-effort: one malformed entry must not abort the whole batch.
            try:
                file_identifier, urls = entry
            except (TypeError, ValueError):
                debug(f"{self._log_prefix()} add_url_bulk: skipping malformed item {entry!r}")
                continue

            h = str(file_identifier or "").strip().lower()
            if len(h) != 64:
                continue

            # Iterating a str would yield characters, each sent as a "url".
            if isinstance(urls, str):
                urls = [urls]

            for u in (urls or []):
                s = str(u or "").strip()
                if not s:
                    continue
                try:
                    client.associate_url(h, s)
                    any_success = True
                except Exception as exc:
                    # Keep going, but leave a trace of what failed.
                    debug(f"{self._log_prefix()} add_url_bulk: associate_url failed for {h}: {exc}")
        return any_success
    except Exception as exc:
        debug(f"{self._log_prefix()} add_url_bulk failed: {exc}")
        return False
def delete_url(self, file_identifier: str, url: List[str], **kwargs: Any) -> bool:
"""Delete one or more url from a Hydrus file.
"""