This commit is contained in:
2025-12-27 06:05:07 -08:00
parent 71b542ae91
commit 8d8a2637d5
9 changed files with 943 additions and 23 deletions

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
from typing import Any, Dict, List, Sequence, Optional
from pathlib import Path
import sys
import re
from SYS.logger import log
@@ -26,6 +27,184 @@ from Store import Store
from SYS.utils import sha256_file
# Validates the name written inside a "(field)" placeholder of an extract
# template: one or more ASCII letters, digits, or underscores, nothing else.
_FIELD_NAME_RE = re.compile(r"^[A-Za-z0-9_]+$")
def _normalize_title_for_extract(text: str) -> str:
"""Normalize common separators in titles for matching.
Helps when sources use unicode dashes or odd whitespace.
"""
s = str(text or "").strip()
if not s:
return s
# Common unicode dash variants -> '-'
s = s.replace("\u2013", "-") # en dash
s = s.replace("\u2014", "-") # em dash
s = s.replace("\u2212", "-") # minus sign
s = s.replace("\u2010", "-") # hyphen
s = s.replace("\u2011", "-") # non-breaking hyphen
s = s.replace("\u2012", "-") # figure dash
s = s.replace("\u2015", "-") # horizontal bar
return s
def _strip_title_prefix(text: str) -> str:
s = str(text or "").strip()
if s.lower().startswith("title:"):
s = s.split(":", 1)[1].strip()
return s
def _literal_to_title_pattern_regex(literal: str) -> str:
"""Convert a literal chunk of a template into a regex fragment.
Keeps punctuation literal, but treats any whitespace run as \\s*.
"""
out: List[str] = []
i = 0
while i < len(literal):
ch = literal[i]
if ch.isspace():
while i < len(literal) and literal[i].isspace():
i += 1
out.append(r"\\s*")
continue
out.append(re.escape(ch))
i += 1
return "".join(out)
def _compile_extract_template(template: str) -> tuple[re.Pattern[str], List[str]]:
    """Compile a simple (field) template into a regex.

    Example template:
        (artist) - (album) - (disk)-(track) (title)

    This is *not* user-facing regex: we only support named fields in
    parentheses. Text between fields is matched literally (with flexible
    whitespace, via ``_literal_to_title_pattern_regex``). Every field but
    the last is matched lazily; the last one is greedy so it absorbs the
    rest of the title. Matching is case-insensitive and anchored at both
    ends, ignoring surrounding whitespace.

    Args:
        template: The template text containing one or more ``(field)``
            placeholders.

    Returns:
        ``(pattern, field_names)`` where ``field_names`` lists the
        placeholders in template order and each is a named group in
        ``pattern``.

    Raises:
        ValueError: If the template is empty, has no placeholder, or a
            field name is invalid, starts with a digit, or is repeated.
    """
    tpl = str(template or "").strip()
    if not tpl:
        raise ValueError("empty extract template")

    matches = list(re.finditer(r"\(([^)]+)\)", tpl))
    if not matches:
        raise ValueError("extract template must contain at least one (field)")

    field_names: List[str] = []
    # Anchor at the start, tolerating leading whitespace. This must be the
    # regex token \s* (one backslash); r"^\\s*" would demand a literal
    # backslash at the start of every title.
    parts: List[str] = [r"^\s*"]
    last_end = 0
    last_index = len(matches) - 1

    for idx, m in enumerate(matches):
        literal = tpl[last_end : m.start()]
        if literal:
            parts.append(_literal_to_title_pattern_regex(literal))

        raw_name = (m.group(1) or "").strip()
        if not raw_name or not _FIELD_NAME_RE.fullmatch(raw_name):
            raise ValueError(f"invalid field name '{raw_name}' (use A-Z, 0-9, underscore)")
        # Field names become regex group names, which must be valid
        # identifiers and unique; fail with a clear message instead of an
        # opaque re.error from re.compile below.
        if raw_name[0].isdigit():
            raise ValueError(f"invalid field name '{raw_name}' (must not start with a digit)")
        if raw_name in field_names:
            raise ValueError(f"duplicate field name '{raw_name}' in extract template")
        field_names.append(raw_name)

        # Lazy for inner fields so the following literal can delimit them;
        # greedy for the final field so it takes everything that remains.
        quantifier = ".+" if idx == last_index else ".+?"
        parts.append(f"(?P<{raw_name}>{quantifier})")
        last_end = m.end()

    tail = tpl[last_end:]
    if tail:
        parts.append(_literal_to_title_pattern_regex(tail))
    parts.append(r"\s*$")

    return re.compile("".join(parts), flags=re.IGNORECASE), field_names
def _extract_tags_from_title(title_text: str, template: str) -> List[str]:
    """Match *title_text* against *template* and build ``field:value`` tags.

    The title is first stripped of any ``title:`` prefix and normalized.
    An empty title or a non-matching title yields an empty list. Fields
    whose captured value is blank after trimming are omitted. The
    template is only compiled once the title is known to be non-empty.
    """
    cleaned = _normalize_title_for_extract(_strip_title_prefix(title_text))
    if not cleaned:
        return []

    regex, names = _compile_extract_template(template)
    hit = regex.match(cleaned)
    if not hit:
        return []

    captured = ((field, (hit.group(field) or "").strip()) for field in names)
    return [f"{field}:{value}" for field, value in captured if value]
def _get_title_candidates_for_extraction(res: Any, existing_tags: Optional[List[str]] = None) -> List[str]:
    """Collect de-duplicated title strings for *res*, highest priority first.

    Sources, in order: the item's ``title`` field, the ``title:`` tag
    (taken from *existing_tags* when it is a list, otherwise from the
    item's own tags), and finally the stem of the item's ``path``.
    Each candidate has its ``title:`` prefix stripped and is normalized.
    """
    found: List[str] = []

    def _remember(raw: Any) -> None:
        # Normalize, then keep only non-empty values not already seen.
        if raw is None:
            return
        cleaned = _normalize_title_for_extract(_strip_title_prefix(str(raw)))
        if cleaned and cleaned not in found:
            found.append(cleaned)

    # 1) The item's title field (may be a display title, not the title: tag).
    try:
        _remember(get_field(res, "title"))
    except Exception:
        pass
    if isinstance(res, dict):
        _remember(res.get("title"))

    # 2) The title: tag, from the supplied tags or the item's own tags.
    if isinstance(existing_tags, list):
        tag_pool = existing_tags
    else:
        tag_pool = _extract_item_tags(res)
    _remember(_extract_title_tag(tag_pool) or "")

    # 3) The filename stem, best effort.
    try:
        raw_path = get_field(res, "path")
        if raw_path:
            stem = Path(str(raw_path)).stem or ""
            _remember(stem.strip())
    except Exception:
        pass

    return found
def _extract_tags_from_title_candidates(candidates: List[str], template: str) -> tuple[List[str], Optional[str]]:
"""Try candidates in order; return (tags, matched_candidate)."""
for c in candidates:
extracted = _extract_tags_from_title(c, template)
if extracted:
return extracted, c
return [], None
def _try_compile_extract_template(template: Optional[str]) -> tuple[Optional[re.Pattern[str]], Optional[str]]:
"""Compile template for debug; return (pattern, error_message)."""
if template is None:
return None, None
try:
pattern, _fields = _compile_extract_template(str(template))
return pattern, None
except Exception as exc:
return None, str(exc)
def _extract_title_tag(tags: List[str]) -> Optional[str]:
"""Return the value of the first title: tag if present."""
for t in tags:
@@ -242,6 +421,8 @@ class Add_Tag(Cmdlet):
CmdletArg("tag", type="string", required=False, description="One or more tag to add. Comma- or space-separated. Can also use {list_name} syntax. If omitted, uses tag from pipeline payload.", variadic=True),
SharedArgs.QUERY,
SharedArgs.STORE,
CmdletArg("-extract", type="string", description="Extract tags from the item's title using a simple template with (field) placeholders. Example: -extract \"(artist) - (album) - (disk)-(track) (title)\" will add artist:, album:, disk:, track:, title: tags."),
CmdletArg("--extract-debug", type="flag", description="Print debug info for -extract matching (matched title source and extracted tags)."),
CmdletArg("-duplicate", type="string", description="Copy existing tag values to new namespaces. Formats: title:album,artist (explicit) or title,album,artist (inferred)"),
CmdletArg("-list", type="string", description="Load predefined tag lists from adjective.json. Comma-separated list names (e.g., -list philosophy,occult)."),
CmdletArg("--all", type="flag", description="Include temporary files in tagging (by default, only tag non-temporary files)."),
@@ -258,6 +439,7 @@ class Add_Tag(Cmdlet):
" Inferred format: -duplicate title,album,artist (first is source, rest are targets)",
"- The source namespace must already exist in the file being tagged.",
"- Target namespaces that already have a value are skipped (not overwritten).",
"- Use -extract to derive namespaced tags from the current title (title field or title: tag) using a simple template.",
],
exec=self.run,
)
@@ -272,6 +454,13 @@ class Add_Tag(Cmdlet):
# Parse arguments
parsed = parse_cmdlet_args(args, self)
extract_template = parsed.get("extract")
if extract_template is not None:
extract_template = str(extract_template)
extract_debug = bool(parsed.get("extract-debug", False))
extract_debug_rx, extract_debug_err = _try_compile_extract_template(extract_template)
query_hash = sh.parse_single_hash_query(parsed.get("query"))
if parsed.get("query") and not query_hash:
log("[add_tag] Error: -query must be of the form hash:<sha256>", file=sys.stderr)
@@ -304,8 +493,10 @@ class Add_Tag(Cmdlet):
if isinstance(raw_tag, str):
raw_tag = [raw_tag]
# Fallback: if no tag provided explicitly, try to pull from first result payload
if not raw_tag and results:
# Fallback: if no tag provided explicitly, try to pull from first result payload.
# IMPORTANT: when -extract is used, users typically want *only* extracted tags,
# not "re-add whatever tags are already in the payload".
if not raw_tag and results and not extract_template:
first = results[0]
payload_tag = None
@@ -341,8 +532,12 @@ class Add_Tag(Cmdlet):
tag_to_add = parse_tag_arguments(raw_tag)
tag_to_add = expand_tag_groups(tag_to_add)
if not tag_to_add:
log("No tag provided to add", file=sys.stderr)
if not tag_to_add and not extract_template:
log("No tag provided to add (and no -extract template provided)", file=sys.stderr)
return 1
if extract_template and extract_debug and extract_debug_err:
log(f"[add_tag] extract template error: {extract_debug_err}", file=sys.stderr)
return 1
# Get other flags
@@ -355,6 +550,9 @@ class Add_Tag(Cmdlet):
store_registry = Store(config)
extract_matched_items = 0
extract_no_match_items = 0
for res in results:
store_name: Optional[str]
raw_hash: Optional[str]
@@ -389,6 +587,24 @@ class Add_Tag(Cmdlet):
existing_lower = {t.lower() for t in existing_tag_list if isinstance(t, str)}
item_tag_to_add = list(tag_to_add)
if extract_template:
candidates = _get_title_candidates_for_extraction(res, existing_tag_list)
extracted, matched = _extract_tags_from_title_candidates(candidates, extract_template)
if extracted:
extract_matched_items += 1
if extract_debug:
log(f"[add_tag] extract matched: {matched!r} -> {extracted}", file=sys.stderr)
for new_tag in extracted:
if new_tag.lower() not in existing_lower:
item_tag_to_add.append(new_tag)
else:
extract_no_match_items += 1
if extract_debug:
rx_preview = extract_debug_rx.pattern if extract_debug_rx else "<uncompiled>"
cand_preview = "; ".join([repr(c) for c in candidates[:3]])
log(f"[add_tag] extract no match for template {extract_template!r}. regex: {rx_preview!r}. candidates: {cand_preview}", file=sys.stderr)
item_tag_to_add = collapse_namespace_tag(item_tag_to_add, "title", prefer="last")
if duplicate_arg:
@@ -492,6 +708,24 @@ class Add_Tag(Cmdlet):
# Per-item tag list (do not mutate shared list)
item_tag_to_add = list(tag_to_add)
if extract_template:
candidates2 = _get_title_candidates_for_extraction(res, existing_tag_list)
extracted2, matched2 = _extract_tags_from_title_candidates(candidates2, extract_template)
if extracted2:
extract_matched_items += 1
if extract_debug:
log(f"[add_tag] extract matched: {matched2!r} -> {extracted2}", file=sys.stderr)
for new_tag in extracted2:
if new_tag.lower() not in existing_lower:
item_tag_to_add.append(new_tag)
else:
extract_no_match_items += 1
if extract_debug:
rx_preview2 = extract_debug_rx.pattern if extract_debug_rx else "<uncompiled>"
cand_preview2 = "; ".join([repr(c) for c in candidates2[:3]])
log(f"[add_tag] extract no match for template {extract_template!r}. regex: {rx_preview2!r}. candidates: {cand_preview2}", file=sys.stderr)
item_tag_to_add = collapse_namespace_tag(item_tag_to_add, "title", prefer="last")
# Handle -duplicate logic (copy existing tag to new namespaces)
@@ -563,6 +797,12 @@ class Add_Tag(Cmdlet):
f"[add_tag] Added {total_added} new tag(s) across {len(results)} item(s); modified {total_modified} item(s)",
file=sys.stderr,
)
if extract_template and extract_matched_items == 0:
log(f"[add_tag] extract: no matches for template '{extract_template}' across {len(results)} item(s)", file=sys.stderr)
elif extract_template and extract_no_match_items > 0 and extract_debug:
log(f"[add_tag] extract: matched {extract_matched_items}, no-match {extract_no_match_items}", file=sys.stderr)
return 0