dfdfdd

2025-12-27 06:05:07 -08:00
parent 71b542ae91
commit 8d8a2637d5
9 changed files with 943 additions and 23 deletions
@@ -3,6 +3,7 @@ from __future__ import annotations
 from typing import Any, Dict, List, Sequence, Optional
 from pathlib import Path
 import sys
+import re

 from SYS.logger import log

@@ -26,6 +27,184 @@ from Store import Store
 from SYS.utils import sha256_file


+_FIELD_NAME_RE = re.compile(r"^[A-Za-z0-9_]+$")
+
+
+def _normalize_title_for_extract(text: str) -> str:
+    """Return *text* stripped, with unicode dash look-alikes turned into '-'.
+
+    Titles from different sources use assorted dash characters; folding them
+    into the ASCII hyphen lets a template written with '-' match all of them.
+    Falsy input (``None``, ``""``) yields an empty string.
+    """
+
+    cleaned = str(text or "").strip()
+    # en dash, em dash, minus sign, hyphen, non-breaking hyphen,
+    # figure dash, horizontal bar
+    dash_variants = (
+        "\u2013",
+        "\u2014",
+        "\u2212",
+        "\u2010",
+        "\u2011",
+        "\u2012",
+        "\u2015",
+    )
+    for dash in dash_variants:
+        cleaned = cleaned.replace(dash, "-")
+    return cleaned
+
+
+def _strip_title_prefix(text: str) -> str:
+    """Return *text* stripped, minus a leading case-insensitive ``title:`` label.
+
+    Only the first colon is treated as the label separator, so colons inside
+    the title itself are kept. Falsy input yields an empty string.
+    """
+    stripped = str(text or "").strip()
+    label, colon, remainder = stripped.partition(":")
+    if colon and label.lower() == "title":
+        return remainder.strip()
+    return stripped
+
+
+def _literal_to_title_pattern_regex(literal: str) -> str:
+    """Convert a literal chunk of a template into a regex fragment.
+
+    Keeps punctuation literal, but treats any whitespace run as \\s*.
+
+    Args:
+        literal: Template text found between (or around) ``(field)``
+            placeholders, e.g. ``" - "``.
+
+    Returns:
+        A regex source fragment in which every non-whitespace character is
+        escaped and each run of whitespace is replaced by one
+        optional-whitespace token. An empty input yields an empty string.
+    """
+
+    out: List[str] = []
+    i = 0
+    while i < len(literal):
+        ch = literal[i]
+        if ch.isspace():
+            # Collapse the whole whitespace run into a single token.
+            while i < len(literal) and literal[i].isspace():
+                i += 1
+            # Raw string with ONE backslash: the regex engine must see the
+            # whitespace class. Two backslashes would mean "a literal
+            # backslash followed by zero or more 's'", which never matches
+            # an ordinary title.
+            out.append(r"\s*")
+            continue
+        out.append(re.escape(ch))
+        i += 1
+    return "".join(out)
+
+
+def _compile_extract_template(template: str) -> tuple[re.Pattern[str], List[str]]:
+    """Compile a simple (field) template into an anchored regex.
+
+    Example template:
+      (artist) - (album) - (disk)-(track) (title)
+
+    This is *not* user-facing regex: we only support named fields in parentheses.
+    Every field except the last is matched lazily; the last one is greedy so it
+    takes whatever remains of the title. Leading and trailing whitespace around
+    the whole title is tolerated, and matching is case-insensitive.
+
+    Args:
+        template: Template text containing one or more ``(field)`` placeholders.
+
+    Returns:
+        ``(pattern, field_names)``: the compiled pattern and the placeholder
+        names in template order. Each name is a named group of ``pattern``.
+
+    Raises:
+        ValueError: If the template is empty, contains no placeholder, or
+            contains an invalid, digit-leading, or repeated field name.
+    """
+
+    tpl = str(template or "").strip()
+    if not tpl:
+        raise ValueError("empty extract template")
+
+    matches = list(re.finditer(r"\(([^)]+)\)", tpl))
+    if not matches:
+        raise ValueError("extract template must contain at least one (field)")
+
+    field_names: List[str] = []
+    # Raw strings with ONE backslash: the regex engine must see the whitespace
+    # class. A doubled backslash would demand a literal backslash in the title.
+    parts: List[str] = [r"^\s*"]
+    last_end = 0
+    last_index = len(matches) - 1
+
+    for idx, m in enumerate(matches):
+        # Literal text between the previous placeholder and this one.
+        literal = tpl[last_end : m.start()]
+        if literal:
+            parts.append(_literal_to_title_pattern_regex(literal))
+
+        raw_name = (m.group(1) or "").strip()
+        if not raw_name or not _FIELD_NAME_RE.fullmatch(raw_name):
+            raise ValueError(f"invalid field name '{raw_name}' (use A-Z, 0-9, underscore)")
+        # Field names become regex group names, which must be identifiers and
+        # unique. Reject bad ones here so callers get the documented ValueError
+        # rather than an re.error from re.compile().
+        if raw_name[0].isdigit():
+            raise ValueError(f"invalid field name '{raw_name}' (must not start with a digit)")
+        if raw_name in field_names:
+            raise ValueError(f"duplicate field name '{raw_name}' in extract template")
+        field_names.append(raw_name)
+
+        # Lazy for inner fields so the following literal can delimit them;
+        # greedy for the final field so it captures the rest of the title.
+        quantifier = ".+" if idx == last_index else ".+?"
+        parts.append(f"(?P<{raw_name}>{quantifier})")
+
+        last_end = m.end()
+
+    tail = tpl[last_end:]
+    if tail:
+        parts.append(_literal_to_title_pattern_regex(tail))
+    parts.append(r"\s*$")
+
+    rx = "".join(parts)
+    return re.compile(rx, flags=re.IGNORECASE), field_names
+
+
+def _extract_tags_from_title(title_text: str, template: str) -> List[str]:
+    """Match *title_text* against *template* and return ``field:value`` tags.
+
+    The title is first stripped of a ``title:`` label and dash-normalized.
+    An empty title or a non-matching title gives ``[]``; fields whose captured
+    value is blank after stripping are left out. Tags keep template order.
+    """
+
+    cleaned_title = _normalize_title_for_extract(_strip_title_prefix(title_text))
+    if not cleaned_title:
+        # Nothing to match; the template is deliberately not compiled here.
+        return []
+
+    compiled, names = _compile_extract_template(template)
+    hit = compiled.match(cleaned_title)
+    if hit is None:
+        return []
+
+    captured = ((name, (hit.group(name) or "").strip()) for name in names)
+    return [f"{name}:{value}" for name, value in captured if value]
+
+
+def _get_title_candidates_for_extraction(res: Any, existing_tags: Optional[List[str]] = None) -> List[str]:
+    """Collect de-duplicated title strings for *res*, highest priority first.
+
+    Sources, in order: the item's ``title`` field, the ``title:`` tag (taken
+    from *existing_tags* when it is a list, otherwise from the item's own
+    tags), and finally the stem of the item's ``path``. Each value is stripped
+    of a ``title:`` label and dash-normalized before being recorded.
+    """
+
+    found: List[str] = []
+
+    def _remember(raw: Any) -> None:
+        # Record the cleaned form of raw unless it is empty or already seen.
+        if raw is None:
+            return
+        cleaned = _normalize_title_for_extract(_strip_title_prefix(str(raw)))
+        if cleaned and cleaned not in found:
+            found.append(cleaned)
+
+    # 1) Title field (may be a display title rather than the title: tag).
+    #    Field lookup is best-effort; a failure just skips this source.
+    try:
+        _remember(get_field(res, "title"))
+    except Exception:
+        pass
+    if isinstance(res, dict):
+        _remember(res.get("title"))
+
+    # 2) title: tag, from the supplied tag list or the item's own tags.
+    if isinstance(existing_tags, list):
+        tag_source = existing_tags
+    else:
+        tag_source = _extract_item_tags(res)
+    _remember(_extract_title_tag(tag_source) or "")
+
+    # 3) Filename stem, also best-effort.
+    try:
+        raw_path = get_field(res, "path")
+        if raw_path:
+            _remember((Path(str(raw_path)).stem or "").strip())
+    except Exception:
+        pass
+
+    return found
+
+
+def _extract_tags_from_title_candidates(candidates: List[str], template: str) -> tuple[List[str], Optional[str]]:
+    """Return the tags from the first candidate that yields any, plus that candidate.
+
+    Candidates are tried in the order given. When none produces tags the
+    result is ``([], None)``.
+    """
+
+    for candidate in candidates:
+        tags = _extract_tags_from_title(candidate, template)
+        if not tags:
+            continue
+        return tags, candidate
+    return [], None
+
+
+def _try_compile_extract_template(template: Optional[str]) -> tuple[Optional[re.Pattern[str]], Optional[str]]:
+    """Compile *template* without raising, for debug output.
+
+    Returns ``(pattern, None)`` on success, ``(None, message)`` when
+    compilation raises, and ``(None, None)`` when no template was given.
+    """
+    if template is None:
+        return None, None
+    try:
+        compiled, _ = _compile_extract_template(str(template))
+    except Exception as exc:
+        return None, str(exc)
+    else:
+        return compiled, None
+
+
 def _extract_title_tag(tags: List[str]) -> Optional[str]:
    """Return the value of the first title: tag if present."""
    for t in tags:
@@ -242,6 +421,8 @@ class Add_Tag(Cmdlet):
                CmdletArg("tag", type="string", required=False, description="One or more tag to add. Comma- or space-separated. Can also use {list_name} syntax. If omitted, uses tag from pipeline payload.", variadic=True),
                SharedArgs.QUERY,
                SharedArgs.STORE,
+                CmdletArg("-extract", type="string", description="Extract tags from the item's title using a simple template with (field) placeholders. Example: -extract \"(artist) - (album) - (disk)-(track) (title)\" will add artist:, album:, disk:, track:, title: tags."),
+                CmdletArg("--extract-debug", type="flag", description="Print debug info for -extract matching (matched title source and extracted tags)."),
                CmdletArg("-duplicate", type="string", description="Copy existing tag values to new namespaces. Formats: title:album,artist (explicit) or title,album,artist (inferred)"),
                CmdletArg("-list", type="string", description="Load predefined tag lists from adjective.json. Comma-separated list names (e.g., -list philosophy,occult)."),
                CmdletArg("--all", type="flag", description="Include temporary files in tagging (by default, only tag non-temporary files)."),
@@ -258,6 +439,7 @@ class Add_Tag(Cmdlet):
                "  Inferred format: -duplicate title,album,artist (first is source, rest are targets)",
                "- The source namespace must already exist in the file being tagged.",
                "- Target namespaces that already have a value are skipped (not overwritten).",
+                "- Use -extract to derive namespaced tags from the current title (title field or title: tag) using a simple template.",
            ],
            exec=self.run,
        )
@@ -272,6 +454,13 @@ class Add_Tag(Cmdlet):
        # Parse arguments
        parsed = parse_cmdlet_args(args, self)

+        extract_template = parsed.get("extract")
+        if extract_template is not None:
+            extract_template = str(extract_template)
+
+        extract_debug = bool(parsed.get("extract-debug", False))
+        extract_debug_rx, extract_debug_err = _try_compile_extract_template(extract_template)
+
        query_hash = sh.parse_single_hash_query(parsed.get("query"))
        if parsed.get("query") and not query_hash:
            log("[add_tag] Error: -query must be of the form hash:<sha256>", file=sys.stderr)
@@ -304,8 +493,10 @@ class Add_Tag(Cmdlet):
        if isinstance(raw_tag, str):
            raw_tag = [raw_tag]

-        # Fallback: if no tag provided explicitly, try to pull from first result payload
-        if not raw_tag and results:
+        # Fallback: if no tag provided explicitly, try to pull from first result payload.
+        # IMPORTANT: when -extract is used, users typically want *only* extracted tags,
+        # not "re-add whatever tags are already in the payload".
+        if not raw_tag and results and not extract_template:
            first = results[0]
            payload_tag = None
            
@@ -341,8 +532,12 @@ class Add_Tag(Cmdlet):
        tag_to_add = parse_tag_arguments(raw_tag)
        tag_to_add = expand_tag_groups(tag_to_add)

-        if not tag_to_add:
-            log("No tag provided to add", file=sys.stderr)
+        if not tag_to_add and not extract_template:
+            log("No tag provided to add (and no -extract template provided)", file=sys.stderr)
+            return 1
+
+        if extract_template and extract_debug and extract_debug_err:
+            log(f"[add_tag] extract template error: {extract_debug_err}", file=sys.stderr)
            return 1

        # Get other flags
@@ -355,6 +550,9 @@ class Add_Tag(Cmdlet):

        store_registry = Store(config)

+        extract_matched_items = 0
+        extract_no_match_items = 0
+
        for res in results:
            store_name: Optional[str]
            raw_hash: Optional[str]
@@ -389,6 +587,24 @@ class Add_Tag(Cmdlet):
                            existing_lower = {t.lower() for t in existing_tag_list if isinstance(t, str)}

                            item_tag_to_add = list(tag_to_add)
+
+                            if extract_template:
+                                candidates = _get_title_candidates_for_extraction(res, existing_tag_list)
+                                extracted, matched = _extract_tags_from_title_candidates(candidates, extract_template)
+                                if extracted:
+                                    extract_matched_items += 1
+                                    if extract_debug:
+                                        log(f"[add_tag] extract matched: {matched!r} -> {extracted}", file=sys.stderr)
+                                    for new_tag in extracted:
+                                        if new_tag.lower() not in existing_lower:
+                                            item_tag_to_add.append(new_tag)
+                                else:
+                                    extract_no_match_items += 1
+                                    if extract_debug:
+                                        rx_preview = extract_debug_rx.pattern if extract_debug_rx else "<uncompiled>"
+                                        cand_preview = "; ".join([repr(c) for c in candidates[:3]])
+                                        log(f"[add_tag] extract no match for template {extract_template!r}. regex: {rx_preview!r}. candidates: {cand_preview}", file=sys.stderr)
+
                            item_tag_to_add = collapse_namespace_tag(item_tag_to_add, "title", prefer="last")

                            if duplicate_arg:
@@ -492,6 +708,24 @@ class Add_Tag(Cmdlet):

            # Per-item tag list (do not mutate shared list)
            item_tag_to_add = list(tag_to_add)
+
+            if extract_template:
+                candidates2 = _get_title_candidates_for_extraction(res, existing_tag_list)
+                extracted2, matched2 = _extract_tags_from_title_candidates(candidates2, extract_template)
+                if extracted2:
+                    extract_matched_items += 1
+                    if extract_debug:
+                        log(f"[add_tag] extract matched: {matched2!r} -> {extracted2}", file=sys.stderr)
+                    for new_tag in extracted2:
+                        if new_tag.lower() not in existing_lower:
+                            item_tag_to_add.append(new_tag)
+                else:
+                    extract_no_match_items += 1
+                    if extract_debug:
+                        rx_preview2 = extract_debug_rx.pattern if extract_debug_rx else "<uncompiled>"
+                        cand_preview2 = "; ".join([repr(c) for c in candidates2[:3]])
+                        log(f"[add_tag] extract no match for template {extract_template!r}. regex: {rx_preview2!r}. candidates: {cand_preview2}", file=sys.stderr)
+
            item_tag_to_add = collapse_namespace_tag(item_tag_to_add, "title", prefer="last")

            # Handle -duplicate logic (copy existing tag to new namespaces)
@@ -563,6 +797,12 @@ class Add_Tag(Cmdlet):
            f"[add_tag] Added {total_added} new tag(s) across {len(results)} item(s); modified {total_modified} item(s)",
            file=sys.stderr,
        )
+
+        if extract_template and extract_matched_items == 0:
+            log(f"[add_tag] extract: no matches for template '{extract_template}' across {len(results)} item(s)", file=sys.stderr)
+        elif extract_template and extract_no_match_items > 0 and extract_debug:
+            log(f"[add_tag] extract: matched {extract_matched_items}, no-match {extract_no_match_items}", file=sys.stderr)
+
        return 0