nose
2025-12-20 23:57:44 -08:00
parent b75faa49a2
commit 8ca5783970
39 changed files with 4294 additions and 1722 deletions

@@ -109,6 +109,7 @@ class Add_File(Cmdlet):
collected_payloads: List[Dict[str, Any]] = []
pending_relationship_pairs: Dict[str, set[tuple[str, str]]] = {}
pending_url_associations: Dict[str, List[tuple[str, List[str]]]] = {}
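# Shapes: pending_relationship_pairs maps backend name -> {(hash_a, hash_b), ...};
# pending_url_associations maps backend name -> [(file_hash, [url, ...]), ...] for the bulk pass at the end.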
successes = 0
failures = 0
@@ -118,6 +119,110 @@ class Add_File(Cmdlet):
want_final_search_store = bool(is_last_stage) and bool(is_storage_backend_location) and bool(location)
auto_search_store_after_add = False
# When ingesting multiple items into a backend store, defer URL association and
# apply it once at the end (bulk) to avoid per-item URL API calls.
defer_url_association = bool(is_storage_backend_location) and bool(location) and len(items_to_process) > 1
# If we are going to persist results (-store / -provider) and the piped input contains
# URL download targets (e.g. playlist rows), preflight URL duplicates once up-front.
# IMPORTANT: Do not treat a *source URL* on an already-local file (e.g. a screenshot)
# as a download target; that would trigger yt-dlp preflights for non-yt-dlp URLs.
skip_url_downloads: set[str] = set()
download_mode_hint: Optional[str] = None
forced_ytdl_format: Optional[str] = None
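# skip_url_downloads, download_mode_hint and forced_ytdl_format are filled by the bulk/format
# preflights below and passed into each per-URL download in this batch.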
if (provider_name or location) and isinstance(items_to_process, list) and items_to_process:
url_candidates: List[str] = []
for it in items_to_process:
try:
po_probe = coerce_to_pipe_object(it, path_arg)
except Exception:
continue
# If the piped item already points at a local file, we are *ingesting* it,
# not downloading it. Skip URL-preflight and yt-dlp probing for those.
try:
po_path = getattr(po_probe, "path", None)
po_path_s = str(po_path or "").strip()
if po_path_s and not po_path_s.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
continue
except Exception:
pass
try:
for u in (self._get_url(it, po_probe) or []):
s = str(u or "").strip()
if not s:
continue
if s.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
url_candidates.append(s)
except Exception:
continue
# Only meaningful when targeting a registered backend store.
if url_candidates and is_storage_backend_location and location:
# De-dupe while preserving order to keep logs stable.
seen: set[str] = set()
unique_urls: List[str] = []
for u in url_candidates:
if u in seen:
continue
seen.add(u)
unique_urls.append(u)
try:
skip_url_downloads = self._preflight_url_duplicates_bulk(unique_urls, config)
except Exception:
skip_url_downloads = set()
# Batch-level format preflight:
# - If the sample URL only has one available format, force it for the batch.
# - If the sample URL appears audio-only (no video codecs), prefer audio mode.
try:
from cmdlet.download_media import is_url_supported_by_ytdlp, list_formats
from tool.ytdlp import YtDlpTool
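# Probe only the first unique URL as a representative sample; hints derived from it apply to the whole batch.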
sample_url = unique_urls[0] if unique_urls else None
if sample_url and is_url_supported_by_ytdlp(str(sample_url)):
cf = None
try:
cookie_path = YtDlpTool(config).resolve_cookiefile()
if cookie_path is not None and cookie_path.is_file():
cf = str(cookie_path)
except Exception:
cf = None
fmts = list_formats(
str(sample_url),
no_playlist=False,
playlist_items=None,
cookiefile=cf,
)
if isinstance(fmts, list) and fmts:
has_video = False
try:
for f in fmts:
if not isinstance(f, dict):
continue
vcodec = str(f.get("vcodec", "none") or "none").strip().lower()
if vcodec and vcodec != "none":
has_video = True
break
except Exception:
has_video = False
download_mode_hint = "video" if has_video else "audio"
if len(fmts) == 1 and isinstance(fmts[0], dict):
fid = str(fmts[0].get("format_id") or "").strip()
if fid:
forced_ytdl_format = fid
except Exception:
# Best-effort probe: keep whatever mode/format hints (if any) were already set.
pass
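# Track URLs already handled in this batch so duplicate piped rows are only processed once.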
processed_url_items: set[str] = set()
for item in items_to_process:
pipe_obj = coerce_to_pipe_object(item, path_arg)
@@ -244,7 +349,148 @@ class Add_File(Cmdlet):
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
code = self._delegate_to_download_data(item, media_path_or_url, location, provider_name, args, config)
# If the user provided a destination (-store / -provider), download here and then
# continue normal add-file logic so the downloaded file is actually ingested.
url_str = str(media_path_or_url)
if (provider_name or location):
# Avoid re-processing the same URL multiple times in a batch.
if url_str in processed_url_items:
successes += 1
continue
processed_url_items.add(url_str)
# If bulk preflight found this URL already stored, skip downloading.
if url_str in skip_url_downloads:
log(f"Skipping download (already stored): {url_str}", file=sys.stderr)
successes += 1
continue
downloaded_pipe_dicts = self._download_streaming_url_as_pipe_objects(
url_str,
config,
mode_hint=download_mode_hint,
ytdl_format_hint=forced_ytdl_format,
)
if not downloaded_pipe_dicts:
failures += 1
continue
# Merge original tags/notes/relationships into each downloaded item and ingest.
for dl_item in downloaded_pipe_dicts:
try:
if isinstance(dl_item, dict):
# Merge tags
base_tags = list(getattr(pipe_obj, "tag", None) or [])
if base_tags:
dl_tags = list(dl_item.get("tag") or [])
dl_item["tag"] = merge_sequences(dl_tags, base_tags, case_sensitive=False)
# Carry notes/relationships forward when present on the original.
base_notes = getattr(pipe_obj, "notes", None)
if base_notes and ("notes" not in dl_item):
dl_item["notes"] = base_notes
base_rels = getattr(pipe_obj, "relationships", None)
if base_rels and ("relationships" not in dl_item):
dl_item["relationships"] = base_rels
except Exception:
pass
dl_pipe_obj = coerce_to_pipe_object(dl_item, None)
try:
dl_media_path = Path(str(getattr(dl_pipe_obj, "path", "") or ""))
except Exception:
dl_media_path = None
if dl_media_path is None or not self._validate_source(dl_media_path):
failures += 1
continue
if provider_name:
if str(provider_name).strip().lower() == "matrix":
room_id = None
if provider_room:
room_id = str(provider_room).strip()
if not room_id:
try:
matrix_conf = config.get("provider", {}).get("matrix", {}) if isinstance(config, dict) else {}
room_id = str(matrix_conf.get("room_id") or "").strip() or None
except Exception:
room_id = None
if not room_id:
pending = [
{
"path": str(dl_media_path),
"pipe_obj": dl_pipe_obj,
"delete_after": bool(delete_after_item),
}
]
return self._matrix_prompt_room_selection(pending, config, list(args))
code = self._handle_matrix_upload(
dl_media_path,
dl_pipe_obj,
config,
delete_after_item,
room_id=room_id,
)
else:
code = self._handle_provider_upload(
dl_media_path,
provider_name,
dl_pipe_obj,
config,
delete_after_item,
)
if code == 0:
successes += 1
else:
failures += 1
continue
if location:
try:
store = Store(config)
backends = store.list_backends()
if location in backends:
code = self._handle_storage_backend(
dl_item,
dl_media_path,
location,
dl_pipe_obj,
config,
delete_after_item,
collect_payloads=collected_payloads,
collect_relationship_pairs=pending_relationship_pairs,
defer_url_association=defer_url_association,
pending_url_associations=pending_url_associations,
suppress_last_stage_overlay=want_final_search_store,
auto_search_store=auto_search_store_after_add,
)
else:
code = self._handle_local_export(
dl_media_path,
location,
dl_pipe_obj,
config,
delete_after_item,
)
except Exception as exc:
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
log(f"Invalid location: {location}", file=sys.stderr)
failures += 1
continue
if code == 0:
successes += 1
else:
failures += 1
continue
# Finished processing all downloaded items for this URL.
continue
# No destination specified: keep legacy behavior (download-media only).
code = self._delegate_to_download_data(item, url_str, location, provider_name, args, config)
if code == 0:
successes += 1
else:
@@ -303,6 +549,8 @@ class Add_File(Cmdlet):
delete_after_item,
collect_payloads=collected_payloads,
collect_relationship_pairs=pending_relationship_pairs,
defer_url_association=defer_url_association,
pending_url_associations=pending_url_associations,
suppress_last_stage_overlay=want_final_search_store,
auto_search_store=auto_search_store_after_add,
)
@@ -329,6 +577,13 @@ class Add_File(Cmdlet):
except Exception:
pass
# Apply deferred url associations (bulk) before showing the final store table.
if pending_url_associations:
try:
Add_File._apply_pending_url_associations(pending_url_associations, config)
except Exception:
pass
# Always end add-file -store (when last stage) by showing the canonical store table.
# This keeps output consistent and ensures @N selection works for multi-item ingests.
if want_final_search_store and collected_payloads:
@@ -383,7 +638,7 @@ class Add_File(Cmdlet):
query = "hash:" + ",".join(hashes)
args = ["-store", str(store), query]
log(f"[add-file] Refresh: search-store -store {store} \"{query}\"", file=sys.stderr)
debug(f"[add-file] Refresh: search-store -store {store} \"{query}\"")
# Run search-store under a temporary stage context so its ctx.emit() calls
# don't interfere with the outer add-file pipeline stage.
@@ -1440,6 +1695,292 @@ class Add_File(Cmdlet):
return 0
@staticmethod
def _preflight_url_duplicates_bulk(urls: Sequence[str], config: Dict[str, Any]) -> set[str]:
"""Return a set of URLs that appear to already exist in any searchable backend.
This is a best-effort check used to avoid re-downloading already-stored media when
a batch of URL items is piped into add-file.
"""
skip: set[str] = set()
try:
storage = Store(config)
backend_names = list(storage.list_searchable_backends() or [])
except Exception:
return skip
for raw in urls:
u = str(raw or "").strip()
if not u:
continue
for backend_name in backend_names:
try:
if str(backend_name).strip().lower() == "temp":
continue
except Exception:
pass
try:
backend = storage[backend_name]
except Exception:
continue
try:
hits = backend.search(f"url:{u}", limit=1) or []
except Exception:
hits = []
if hits:
skip.add(u)
break
return skip
@staticmethod
def _download_streaming_url_as_pipe_objects(
url: str,
config: Dict[str, Any],
*,
mode_hint: Optional[str] = None,
ytdl_format_hint: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""Download a yt-dlp-supported URL and return PipeObject-style dict(s).
This does not rely on pipeline stage context and is used so add-file can ingest
URL selections directly (download -> add to store/provider) in one invocation.
"""
url_str = str(url or "").strip()
if not url_str:
return []
try:
from cmdlet.download_media import (
CMDLET as dl_cmdlet,
_download_with_timeout,
is_url_supported_by_ytdlp,
list_formats,
_format_chapters_note,
_best_subtitle_sidecar,
_read_text_file,
)
from models import DownloadOptions
from tool.ytdlp import YtDlpTool
except Exception:
return []
if not is_url_supported_by_ytdlp(url_str):
return []
try:
from config import resolve_output_dir
out_dir = resolve_output_dir(config)
if out_dir is None:
return []
except Exception:
return []
cookies_path = None
try:
cookie_candidate = YtDlpTool(config).resolve_cookiefile()
if cookie_candidate is not None and cookie_candidate.is_file():
cookies_path = cookie_candidate
except Exception:
cookies_path = None
quiet_download = False
try:
quiet_download = bool((config or {}).get("_quiet_background_output"))
except Exception:
quiet_download = False
# Decide download mode.
# Default to video unless we have a hint or the URL appears to be audio-only.
mode = str(mode_hint or "").strip().lower() if mode_hint else ""
if mode not in {"audio", "video"}:
mode = "video"
# Best-effort: infer the mode from the formats advertised for this URL (single probe).
try:
cf = str(cookies_path) if cookies_path is not None and cookies_path.is_file() else None
fmts_probe = list_formats(url_str, no_playlist=False, playlist_items=None, cookiefile=cf)
if isinstance(fmts_probe, list) and fmts_probe:
has_video = False
for f in fmts_probe:
if not isinstance(f, dict):
continue
vcodec = str(f.get("vcodec", "none") or "none").strip().lower()
if vcodec and vcodec != "none":
has_video = True
break
mode = "video" if has_video else "audio"
except Exception:
mode = "video"
# Pick a safe initial format selector.
# Important: yt-dlp defaults like "251/140" are YouTube-specific and break Bandcamp.
fmt_hint = str(ytdl_format_hint).strip() if ytdl_format_hint else ""
if fmt_hint:
chosen_format: Optional[str] = fmt_hint
else:
chosen_format = None
if mode == "audio":
# Generic audio selector that works across extractors.
chosen_format = "bestaudio/best"
opts = DownloadOptions(
url=url_str,
mode=mode,
output_dir=Path(out_dir),
cookies_path=cookies_path,
ytdl_format=chosen_format,
quiet=quiet_download,
embed_chapters=True,
write_sub=True,
)
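# write_sub=True produces subtitle sidecars that are folded into notes['sub'] below; chapter
# timestamps from the info dict end up in notes['chapters'].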
# Download with a small amount of resilience for format errors.
try:
result_obj = _download_with_timeout(opts, timeout_seconds=300)
except Exception as exc:
msg = str(exc)
# If a format is invalid/unsupported, try:
# - if only one format exists, retry with that id
# - else for audio-only sources, retry with bestaudio/best
try:
format_error = "Requested format is not available" in msg
except Exception:
format_error = False
if format_error:
try:
cf = str(cookies_path) if cookies_path is not None and cookies_path.is_file() else None
fmts = list_formats(url_str, no_playlist=False, playlist_items=None, cookiefile=cf)
if isinstance(fmts, list) and len(fmts) == 1 and isinstance(fmts[0], dict):
fid = str(fmts[0].get("format_id") or "").strip()
if fid:
opts = DownloadOptions(
url=url_str,
mode=mode,
output_dir=Path(out_dir),
cookies_path=cookies_path,
ytdl_format=fid,
quiet=quiet_download,
embed_chapters=True,
write_sub=True,
)
result_obj = _download_with_timeout(opts, timeout_seconds=300)
# Success: fall through to the shared result handling below.
else:
raise
elif mode == "audio" and (not chosen_format or chosen_format != "bestaudio/best"):
opts = DownloadOptions(
url=url_str,
mode=mode,
output_dir=Path(out_dir),
cookies_path=cookies_path,
ytdl_format="bestaudio/best",
quiet=quiet_download,
embed_chapters=True,
write_sub=True,
)
result_obj = _download_with_timeout(opts, timeout_seconds=300)
else:
raise
except Exception as exc2:
log(f"[add-file] Download failed for {url_str}: {exc2}", file=sys.stderr)
return []
else:
log(f"[add-file] Download failed for {url_str}: {exc}", file=sys.stderr)
return []
results: List[Any]
if isinstance(result_obj, list):
results = list(result_obj)
else:
paths = getattr(result_obj, "paths", None)
if isinstance(paths, list) and paths:
# Section downloads: create one result per file.
from models import DownloadMediaResult
results = []
for p in paths:
try:
p_path = Path(p)
except Exception:
continue
if not p_path.exists() or p_path.is_dir():
continue
try:
hv = sha256_file(p_path)
except Exception:
hv = None
try:
results.append(
DownloadMediaResult(
path=p_path,
info=getattr(result_obj, "info", {}) or {},
tag=list(getattr(result_obj, "tag", []) or []),
source_url=getattr(result_obj, "source_url", None) or url_str,
hash_value=hv,
)
)
except Exception:
continue
else:
results = [result_obj]
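# At this point `results` is a flat list of per-file download results, whether yt-dlp returned a
# single file, a list, or a multi-section download.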
out: List[Dict[str, Any]] = []
for downloaded in results:
try:
po = dl_cmdlet._build_pipe_object(downloaded, url_str, opts)
# Attach chapter timestamps note (best-effort).
try:
info = downloaded.info if isinstance(getattr(downloaded, "info", None), dict) else {}
except Exception:
info = {}
try:
chapters_text = _format_chapters_note(info)
except Exception:
chapters_text = None
if chapters_text:
notes = po.get("notes")
if not isinstance(notes, dict):
notes = {}
notes.setdefault("chapters", chapters_text)
po["notes"] = notes
# Capture subtitle sidecar into notes and remove it so add-file won't ingest it later.
try:
media_path = Path(str(po.get("path") or ""))
except Exception:
media_path = None
if media_path is not None and media_path.exists() and media_path.is_file():
try:
sub_path = _best_subtitle_sidecar(media_path)
except Exception:
sub_path = None
if sub_path is not None:
sub_text = _read_text_file(sub_path)
if sub_text:
notes = po.get("notes")
if not isinstance(notes, dict):
notes = {}
notes["sub"] = sub_text
po["notes"] = notes
try:
sub_path.unlink()
except Exception:
pass
# Mark as temp artifact from download-media so add-file can auto-delete after ingest.
po["action"] = "cmdlet:download-media"
po["is_temp"] = True
out.append(po)
except Exception:
continue
return out
@staticmethod
def _download_soulseek_file(
result: Any,
@@ -1640,7 +2181,9 @@ class Add_File(Cmdlet):
ctx.set_current_stage_table(table)
print()
print(table.format_plain())
from rich_display import stdout_console
stdout_console().print(table)
print("\nSelect room(s) with @N (e.g. @1 or @1-3) to upload the selected item(s)")
return 0
@@ -1710,6 +2253,8 @@ class Add_File(Cmdlet):
*,
collect_payloads: Optional[List[Dict[str, Any]]] = None,
collect_relationship_pairs: Optional[Dict[str, set[tuple[str, str]]]] = None,
defer_url_association: bool = False,
pending_url_associations: Optional[Dict[str, List[tuple[str, List[str]]]]] = None,
suppress_last_stage_overlay: bool = False,
auto_search_store: bool = True,
) -> int:
@@ -1822,7 +2367,7 @@ class Add_File(Cmdlet):
media_path,
title=title,
tag=tags,
url=url
url=[] if (defer_url_association and url) else url
)
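# When deferring, URLs are deliberately withheld from add() above and re-attached in one bulk pass
# at the end of the batch (see _apply_pending_url_associations).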
##log(f"✓ File added to '{backend_name}': {file_identifier}", file=sys.stderr)
@@ -1859,10 +2404,16 @@ class Add_File(Cmdlet):
# If we have url(s), ensure they get associated with the destination file.
# This mirrors `add-url` behavior but avoids emitting extra pipeline noise.
if url:
try:
backend.add_url(resolved_hash, list(url))
except Exception:
pass
if defer_url_association and pending_url_associations is not None:
try:
pending_url_associations.setdefault(str(backend_name), []).append((str(resolved_hash), list(url)))
except Exception:
pass
else:
try:
backend.add_url(resolved_hash, list(url))
except Exception:
pass
# If a subtitle note was provided upstream (e.g., download-media writes notes.sub),
# persist it automatically like add-note would.
@@ -1965,6 +2516,68 @@ class Add_File(Cmdlet):
# --- Helpers ---
@staticmethod
def _apply_pending_url_associations(pending: Dict[str, List[tuple[str, List[str]]]], config: Dict[str, Any]) -> None:
"""Apply deferred URL associations in bulk, grouped per backend."""
try:
store = Store(config)
except Exception:
return
for backend_name, pairs in (pending or {}).items():
if not pairs:
continue
try:
backend = store[backend_name]
except Exception:
continue
# Merge URLs per hash and de-duplicate.
merged: Dict[str, List[str]] = {}
for file_hash, urls in pairs:
h = str(file_hash or "").strip().lower()
if len(h) != 64:
continue
url_list: List[str] = []
try:
for u in (urls or []):
s = str(u or "").strip()
if s:
url_list.append(s)
except Exception:
url_list = []
if not url_list:
continue
bucket = merged.setdefault(h, [])
seen = set(bucket)
for u in url_list:
if u in seen:
continue
seen.add(u)
bucket.append(u)
items: List[tuple[str, List[str]]] = [(h, u) for h, u in merged.items() if u]
if not items:
continue
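# Prefer a single bulk call when the backend exposes add_url_bulk; otherwise fall back to one
# add_url call per hash.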
bulk = getattr(backend, "add_url_bulk", None)
if callable(bulk):
try:
bulk(items)
continue
except Exception:
pass
single = getattr(backend, "add_url", None)
if callable(single):
for h, u in items:
try:
single(h, u)
except Exception:
continue
@staticmethod
def _load_sidecar_bundle(
media_path: Path,