nose
2025-12-20 23:57:44 -08:00
parent b75faa49a2
commit 8ca5783970
39 changed files with 4294 additions and 1722 deletions

@@ -109,6 +109,7 @@ class Add_File(Cmdlet):
collected_payloads: List[Dict[str, Any]] = []
pending_relationship_pairs: Dict[str, set[tuple[str, str]]] = {}
pending_url_associations: Dict[str, List[tuple[str, List[str]]]] = {}
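# Shapes: pending_relationship_pairs maps backend name -> {(hash_a, hash_b), ...};
# pending_url_associations maps backend name -> [(file_hash, [url, ...]), ...] for the bulk pass at the end.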
successes = 0
failures = 0
@@ -118,6 +119,110 @@ class Add_File(Cmdlet):
want_final_search_store = bool(is_last_stage) and bool(is_storage_backend_location) and bool(location)
auto_search_store_after_add = False
# When ingesting multiple items into a backend store, defer URL association and
# apply it once at the end (bulk) to avoid per-item URL API calls.
defer_url_association = bool(is_storage_backend_location) and bool(location) and len(items_to_process) > 1
# If we are going to persist results (-store / -provider) and the piped input contains
# URL download targets (e.g. playlist rows), preflight URL duplicates once up-front.
# IMPORTANT: Do not treat a *source URL* on an already-local file (e.g. a screenshot)
# as a download target; that would trigger yt-dlp preflights for non-yt-dlp URLs.
skip_url_downloads: set[str] = set()
download_mode_hint: Optional[str] = None
forced_ytdl_format: Optional[str] = None
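# skip_url_downloads, download_mode_hint and forced_ytdl_format are filled by the bulk/format
# preflights below and passed into each per-URL download in this batch.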
if (provider_name or location) and isinstance(items_to_process, list) and items_to_process:
url_candidates: List[str] = []
for it in items_to_process:
try:
po_probe = coerce_to_pipe_object(it, path_arg)
except Exception:
continue
# If the piped item already points at a local file, we are *ingesting* it,
# not downloading it. Skip URL-preflight and yt-dlp probing for those.
try:
po_path = getattr(po_probe, "path", None)
po_path_s = str(po_path or "").strip()
if po_path_s and not po_path_s.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
continue
except Exception:
pass
try:
for u in (self._get_url(it, po_probe) or []):
s = str(u or "").strip()
if not s:
continue
if s.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
url_candidates.append(s)
except Exception:
continue
# Only meaningful when targeting a registered backend store.
if url_candidates and is_storage_backend_location and location:
# De-dupe while preserving order to keep logs stable.
seen: set[str] = set()
unique_urls: List[str] = []
for u in url_candidates:
if u in seen:
continue
seen.add(u)
unique_urls.append(u)
try:
skip_url_downloads = self._preflight_url_duplicates_bulk(unique_urls, config)
except Exception:
skip_url_downloads = set()
# Batch-level format preflight:
# - If the sample URL only has one available format, force it for the batch.
# - If the sample URL appears audio-only (no video codecs), prefer audio mode.
try:
from cmdlet.download_media import is_url_supported_by_ytdlp, list_formats
from tool.ytdlp import YtDlpTool
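# Probe only the first unique URL as a representative sample; hints derived from it apply to the whole batch.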
sample_url = unique_urls[0] if unique_urls else None
if sample_url and is_url_supported_by_ytdlp(str(sample_url)):
cf = None
try:
cookie_path = YtDlpTool(config).resolve_cookiefile()
if cookie_path is not None and cookie_path.is_file():
cf = str(cookie_path)
except Exception:
cf = None
fmts = list_formats(
str(sample_url),
no_playlist=False,
playlist_items=None,
cookiefile=cf,
)
if isinstance(fmts, list) and fmts:
has_video = False
try:
for f in fmts:
if not isinstance(f, dict):
continue
vcodec = str(f.get("vcodec", "none") or "none").strip().lower()
if vcodec and vcodec != "none":
has_video = True
break
except Exception:
has_video = False
download_mode_hint = "video" if has_video else "audio"
if len(fmts) == 1 and isinstance(fmts[0], dict):
fid = str(fmts[0].get("format_id") or "").strip()
if fid:
forced_ytdl_format = fid
except Exception:
# Best-effort probe: keep whatever mode/format hints (if any) were already set.
pass
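# Track URLs already handled in this batch so duplicate piped rows are only processed once.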
processed_url_items: set[str] = set()
for item in items_to_process:
pipe_obj = coerce_to_pipe_object(item, path_arg)
@@ -244,7 +349,148 @@ class Add_File(Cmdlet):
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
code = self._delegate_to_download_data(item, media_path_or_url, location, provider_name, args, config)
# If the user provided a destination (-store / -provider), download here and then
# continue normal add-file logic so the downloaded file is actually ingested.
url_str = str(media_path_or_url)
if (provider_name or location):
# Avoid re-processing the same URL multiple times in a batch.
if url_str in processed_url_items:
successes += 1
continue
processed_url_items.add(url_str)
# If bulk preflight found this URL already stored, skip downloading.
if url_str in skip_url_downloads:
log(f"Skipping download (already stored): {url_str}", file=sys.stderr)
successes += 1
continue
downloaded_pipe_dicts = self._download_streaming_url_as_pipe_objects(
url_str,
config,
mode_hint=download_mode_hint,
ytdl_format_hint=forced_ytdl_format,
)
if not downloaded_pipe_dicts:
failures += 1
continue
# Merge original tags/notes/relationships into each downloaded item and ingest.
for dl_item in downloaded_pipe_dicts:
try:
if isinstance(dl_item, dict):
# Merge tags
base_tags = list(getattr(pipe_obj, "tag", None) or [])
if base_tags:
dl_tags = list(dl_item.get("tag") or [])
dl_item["tag"] = merge_sequences(dl_tags, base_tags, case_sensitive=False)
# Carry notes/relationships forward when present on the original.
base_notes = getattr(pipe_obj, "notes", None)
if base_notes and ("notes" not in dl_item):
dl_item["notes"] = base_notes
base_rels = getattr(pipe_obj, "relationships", None)
if base_rels and ("relationships" not in dl_item):
dl_item["relationships"] = base_rels
except Exception:
pass
dl_pipe_obj = coerce_to_pipe_object(dl_item, None)
try:
dl_media_path = Path(str(getattr(dl_pipe_obj, "path", "") or ""))
except Exception:
dl_media_path = None
if dl_media_path is None or not self._validate_source(dl_media_path):
failures += 1
continue
if provider_name:
if str(provider_name).strip().lower() == "matrix":
room_id = None
if provider_room:
room_id = str(provider_room).strip()
if not room_id:
try:
matrix_conf = config.get("provider", {}).get("matrix", {}) if isinstance(config, dict) else {}
room_id = str(matrix_conf.get("room_id") or "").strip() or None
except Exception:
room_id = None
if not room_id:
pending = [
{
"path": str(dl_media_path),
"pipe_obj": dl_pipe_obj,
"delete_after": bool(delete_after_item),
}
]
return self._matrix_prompt_room_selection(pending, config, list(args))
code = self._handle_matrix_upload(
dl_media_path,
dl_pipe_obj,
config,
delete_after_item,
room_id=room_id,
)
else:
code = self._handle_provider_upload(
dl_media_path,
provider_name,
dl_pipe_obj,
config,
delete_after_item,
)
if code == 0:
successes += 1
else:
failures += 1
continue
if location:
try:
store = Store(config)
backends = store.list_backends()
if location in backends:
code = self._handle_storage_backend(
dl_item,
dl_media_path,
location,
dl_pipe_obj,
config,
delete_after_item,
collect_payloads=collected_payloads,
collect_relationship_pairs=pending_relationship_pairs,
defer_url_association=defer_url_association,
pending_url_associations=pending_url_associations,
suppress_last_stage_overlay=want_final_search_store,
auto_search_store=auto_search_store_after_add,
)
else:
code = self._handle_local_export(
dl_media_path,
location,
dl_pipe_obj,
config,
delete_after_item,
)
except Exception as exc:
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
log(f"Invalid location: {location}", file=sys.stderr)
failures += 1
continue
if code == 0:
successes += 1
else:
failures += 1
continue
# Finished processing all downloaded items for this URL.
continue
# No destination specified: keep legacy behavior (download-media only).
code = self._delegate_to_download_data(item, url_str, location, provider_name, args, config)
if code == 0:
successes += 1
else:
@@ -303,6 +549,8 @@ class Add_File(Cmdlet):
delete_after_item,
collect_payloads=collected_payloads,
collect_relationship_pairs=pending_relationship_pairs,
defer_url_association=defer_url_association,
pending_url_associations=pending_url_associations,
suppress_last_stage_overlay=want_final_search_store,
auto_search_store=auto_search_store_after_add,
)
@@ -329,6 +577,13 @@ class Add_File(Cmdlet):
except Exception:
pass
# Apply deferred url associations (bulk) before showing the final store table.
if pending_url_associations:
try:
Add_File._apply_pending_url_associations(pending_url_associations, config)
except Exception:
pass
# Always end add-file -store (when last stage) by showing the canonical store table.
# This keeps output consistent and ensures @N selection works for multi-item ingests.
if want_final_search_store and collected_payloads:
@@ -383,7 +638,7 @@ class Add_File(Cmdlet):
query = "hash:" + ",".join(hashes)
args = ["-store", str(store), query]
log(f"[add-file] Refresh: search-store -store {store} \"{query}\"", file=sys.stderr)
debug(f"[add-file] Refresh: search-store -store {store} \"{query}\"")
# Run search-store under a temporary stage context so its ctx.emit() calls
# don't interfere with the outer add-file pipeline stage.
@@ -1440,6 +1695,292 @@ class Add_File(Cmdlet):
return 0
@staticmethod
def _preflight_url_duplicates_bulk(urls: Sequence[str], config: Dict[str, Any]) -> set[str]:
"""Return a set of URLs that appear to already exist in any searchable backend.
This is a best-effort check used to avoid re-downloading already-stored media when
a batch of URL items is piped into add-file.
"""
skip: set[str] = set()
try:
storage = Store(config)
backend_names = list(storage.list_searchable_backends() or [])
except Exception:
return skip
for raw in urls:
u = str(raw or "").strip()
if not u:
continue
for backend_name in backend_names:
try:
if str(backend_name).strip().lower() == "temp":
continue
except Exception:
pass
try:
backend = storage[backend_name]
except Exception:
continue
try:
hits = backend.search(f"url:{u}", limit=1) or []
except Exception:
hits = []
if hits:
skip.add(u)
break
return skip
@staticmethod
def _download_streaming_url_as_pipe_objects(
url: str,
config: Dict[str, Any],
*,
mode_hint: Optional[str] = None,
ytdl_format_hint: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""Download a yt-dlp-supported URL and return PipeObject-style dict(s).
This does not rely on pipeline stage context and is used so add-file can ingest
URL selections directly (download -> add to store/provider) in one invocation.
"""
url_str = str(url or "").strip()
if not url_str:
return []
try:
from cmdlet.download_media import (
CMDLET as dl_cmdlet,
_download_with_timeout,
is_url_supported_by_ytdlp,
list_formats,
_format_chapters_note,
_best_subtitle_sidecar,
_read_text_file,
)
from models import DownloadOptions
from tool.ytdlp import YtDlpTool
except Exception:
return []
if not is_url_supported_by_ytdlp(url_str):
return []
try:
from config import resolve_output_dir
out_dir = resolve_output_dir(config)
if out_dir is None:
return []
except Exception:
return []
cookies_path = None
try:
cookie_candidate = YtDlpTool(config).resolve_cookiefile()
if cookie_candidate is not None and cookie_candidate.is_file():
cookies_path = cookie_candidate
except Exception:
cookies_path = None
quiet_download = False
try:
quiet_download = bool((config or {}).get("_quiet_background_output"))
except Exception:
quiet_download = False
# Decide download mode.
# Default to video unless we have a hint or the URL appears to be audio-only.
mode = str(mode_hint or "").strip().lower() if mode_hint else ""
if mode not in {"audio", "video"}:
mode = "video"
# Best-effort: infer the mode from the formats advertised for this URL (single probe).
try:
cf = str(cookies_path) if cookies_path is not None and cookies_path.is_file() else None
fmts_probe = list_formats(url_str, no_playlist=False, playlist_items=None, cookiefile=cf)
if isinstance(fmts_probe, list) and fmts_probe:
has_video = False
for f in fmts_probe:
if not isinstance(f, dict):
continue
vcodec = str(f.get("vcodec", "none") or "none").strip().lower()
if vcodec and vcodec != "none":
has_video = True
break
mode = "video" if has_video else "audio"
except Exception:
mode = "video"
# Pick a safe initial format selector.
# Important: yt-dlp defaults like "251/140" are YouTube-specific and break Bandcamp.
fmt_hint = str(ytdl_format_hint).strip() if ytdl_format_hint else ""
if fmt_hint:
chosen_format: Optional[str] = fmt_hint
else:
chosen_format = None
if mode == "audio":
# Generic audio selector that works across extractors.
chosen_format = "bestaudio/best"
opts = DownloadOptions(
url=url_str,
mode=mode,
output_dir=Path(out_dir),
cookies_path=cookies_path,
ytdl_format=chosen_format,
quiet=quiet_download,
embed_chapters=True,
write_sub=True,
)
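# write_sub=True produces subtitle sidecars that are folded into notes['sub'] below; chapter
# timestamps from the info dict end up in notes['chapters'].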
# Download with a small amount of resilience for format errors.
try:
result_obj = _download_with_timeout(opts, timeout_seconds=300)
except Exception as exc:
msg = str(exc)
# If a format is invalid/unsupported, try:
# - if only one format exists, retry with that id
# - else for audio-only sources, retry with bestaudio/best
try:
format_error = "Requested format is not available" in msg
except Exception:
format_error = False
if format_error:
try:
cf = str(cookies_path) if cookies_path is not None and cookies_path.is_file() else None
fmts = list_formats(url_str, no_playlist=False, playlist_items=None, cookiefile=cf)
if isinstance(fmts, list) and len(fmts) == 1 and isinstance(fmts[0], dict):
fid = str(fmts[0].get("format_id") or "").strip()
if fid:
opts = DownloadOptions(
url=url_str,
mode=mode,
output_dir=Path(out_dir),
cookies_path=cookies_path,
ytdl_format=fid,
quiet=quiet_download,
embed_chapters=True,
write_sub=True,
)
result_obj = _download_with_timeout(opts, timeout_seconds=300)
# Success: fall through to the shared result handling below.
else:
raise
elif mode == "audio" and (not chosen_format or chosen_format != "bestaudio/best"):
opts = DownloadOptions(
url=url_str,
mode=mode,
output_dir=Path(out_dir),
cookies_path=cookies_path,
ytdl_format="bestaudio/best",
quiet=quiet_download,
embed_chapters=True,
write_sub=True,
)
result_obj = _download_with_timeout(opts, timeout_seconds=300)
else:
raise
except Exception as exc2:
log(f"[add-file] Download failed for {url_str}: {exc2}", file=sys.stderr)
return []
else:
log(f"[add-file] Download failed for {url_str}: {exc}", file=sys.stderr)
return []
results: List[Any]
if isinstance(result_obj, list):
results = list(result_obj)
else:
paths = getattr(result_obj, "paths", None)
if isinstance(paths, list) and paths:
# Section downloads: create one result per file.
from models import DownloadMediaResult
results = []
for p in paths:
try:
p_path = Path(p)
except Exception:
continue
if not p_path.exists() or p_path.is_dir():
continue
try:
hv = sha256_file(p_path)
except Exception:
hv = None
try:
results.append(
DownloadMediaResult(
path=p_path,
info=getattr(result_obj, "info", {}) or {},
tag=list(getattr(result_obj, "tag", []) or []),
source_url=getattr(result_obj, "source_url", None) or url_str,
hash_value=hv,
)
)
except Exception:
continue
else:
results = [result_obj]
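# At this point `results` is a flat list of per-file download results, whether yt-dlp returned a
# single file, a list, or a multi-section download.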
out: List[Dict[str, Any]] = []
for downloaded in results:
try:
po = dl_cmdlet._build_pipe_object(downloaded, url_str, opts)
# Attach chapter timestamps note (best-effort).
try:
info = downloaded.info if isinstance(getattr(downloaded, "info", None), dict) else {}
except Exception:
info = {}
try:
chapters_text = _format_chapters_note(info)
except Exception:
chapters_text = None
if chapters_text:
notes = po.get("notes")
if not isinstance(notes, dict):
notes = {}
notes.setdefault("chapters", chapters_text)
po["notes"] = notes
# Capture subtitle sidecar into notes and remove it so add-file won't ingest it later.
try:
media_path = Path(str(po.get("path") or ""))
except Exception:
media_path = None
if media_path is not None and media_path.exists() and media_path.is_file():
try:
sub_path = _best_subtitle_sidecar(media_path)
except Exception:
sub_path = None
if sub_path is not None:
sub_text = _read_text_file(sub_path)
if sub_text:
notes = po.get("notes")
if not isinstance(notes, dict):
notes = {}
notes["sub"] = sub_text
po["notes"] = notes
try:
sub_path.unlink()
except Exception:
pass
# Mark as temp artifact from download-media so add-file can auto-delete after ingest.
po["action"] = "cmdlet:download-media"
po["is_temp"] = True
out.append(po)
except Exception:
continue
return out
@staticmethod
def _download_soulseek_file(
result: Any,
@@ -1640,7 +2181,9 @@ class Add_File(Cmdlet):
ctx.set_current_stage_table(table)
print()
print(table.format_plain())
from rich_display import stdout_console
stdout_console().print(table)
print("\nSelect room(s) with @N (e.g. @1 or @1-3) to upload the selected item(s)")
return 0
@@ -1710,6 +2253,8 @@ class Add_File(Cmdlet):
*,
collect_payloads: Optional[List[Dict[str, Any]]] = None,
collect_relationship_pairs: Optional[Dict[str, set[tuple[str, str]]]] = None,
defer_url_association: bool = False,
pending_url_associations: Optional[Dict[str, List[tuple[str, List[str]]]]] = None,
suppress_last_stage_overlay: bool = False,
auto_search_store: bool = True,
) -> int:
@@ -1822,7 +2367,7 @@ class Add_File(Cmdlet):
media_path,
title=title,
tag=tags,
url=url
url=[] if (defer_url_association and url) else url
)
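# When deferring, URLs are deliberately withheld from add() above and re-attached in one bulk pass
# at the end of the batch (see _apply_pending_url_associations).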
##log(f"✓ File added to '{backend_name}': {file_identifier}", file=sys.stderr)
@@ -1859,10 +2404,16 @@ class Add_File(Cmdlet):
# If we have url(s), ensure they get associated with the destination file.
# This mirrors `add-url` behavior but avoids emitting extra pipeline noise.
if url:
try:
backend.add_url(resolved_hash, list(url))
except Exception:
pass
if defer_url_association and pending_url_associations is not None:
try:
pending_url_associations.setdefault(str(backend_name), []).append((str(resolved_hash), list(url)))
except Exception:
pass
else:
try:
backend.add_url(resolved_hash, list(url))
except Exception:
pass
# If a subtitle note was provided upstream (e.g., download-media writes notes.sub),
# persist it automatically like add-note would.
@@ -1965,6 +2516,68 @@ class Add_File(Cmdlet):
# --- Helpers ---
@staticmethod
def _apply_pending_url_associations(pending: Dict[str, List[tuple[str, List[str]]]], config: Dict[str, Any]) -> None:
"""Apply deferred URL associations in bulk, grouped per backend."""
try:
store = Store(config)
except Exception:
return
for backend_name, pairs in (pending or {}).items():
if not pairs:
continue
try:
backend = store[backend_name]
except Exception:
continue
# Merge URLs per hash and de-duplicate.
merged: Dict[str, List[str]] = {}
for file_hash, urls in pairs:
h = str(file_hash or "").strip().lower()
if len(h) != 64:
continue
url_list: List[str] = []
try:
for u in (urls or []):
s = str(u or "").strip()
if s:
url_list.append(s)
except Exception:
url_list = []
if not url_list:
continue
bucket = merged.setdefault(h, [])
seen = set(bucket)
for u in url_list:
if u in seen:
continue
seen.add(u)
bucket.append(u)
items: List[tuple[str, List[str]]] = [(h, u) for h, u in merged.items() if u]
if not items:
continue
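# Prefer a single bulk call when the backend exposes add_url_bulk; otherwise fall back to one
# add_url call per hash.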
bulk = getattr(backend, "add_url_bulk", None)
if callable(bulk):
try:
bulk(items)
continue
except Exception:
pass
single = getattr(backend, "add_url", None)
if callable(single):
for h, u in items:
try:
single(h, u)
except Exception:
continue
@staticmethod
def _load_sidecar_bundle(
media_path: Path,