Files
Medios-Macina/cmdlet/get_url.py

642 lines
24 KiB
Python
Raw Normal View History

2025-11-25 20:09:33 -08:00
from __future__ import annotations
2026-01-16 01:47:00 -08:00
from queue import SimpleQueue
from threading import Thread
2025-12-14 00:53:52 -08:00
from dataclasses import dataclass
2025-12-29 17:05:03 -08:00
from typing import Any, Dict, List, Sequence, Optional, Set, Tuple
2025-12-01 01:10:16 -08:00
import sys
2025-12-29 17:05:03 -08:00
import re
from fnmatch import fnmatch
2026-01-17 03:37:11 -08:00
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
2026-01-19 06:24:09 -08:00
from ._shared import (
Cmdlet,
SharedArgs,
parse_cmdlet_args,
get_field,
normalize_hash,
2025-12-16 23:23:43 -08:00
)
2026-01-22 04:22:06 -08:00
from . import _shared as sh
2025-12-11 19:04:02 -08:00
from SYS.logger import log
2026-01-18 10:50:42 -08:00
from SYS.result_table import Table
2025-12-11 19:04:02 -08:00
from Store import Store
from SYS import pipeline as ctx
2025-11-25 20:09:33 -08:00
2025-12-14 00:53:52 -08:00
@dataclass
class UrlItem:
    """One URL associated with a stored file.

    ``hash`` + ``store`` identify the file for downstream cmdlets; the
    remaining fields are optional display metadata for result tables.
    """
    url: str
    hash: str
    store: str
    title: str = ""
    size: int | None = None  # size in bytes when the backend reports one
    ext: str = ""  # file extension without the leading dot
2025-12-11 12:47:30 -08:00
class Get_Url(Cmdlet):
    """Get url associated with files via hash+store, or search urls by pattern."""

    # Per-store time budget (seconds) for a backend search thread before it
    # is abandoned; see _execute_search_with_timeout.
    STORE_SEARCH_TIMEOUT_SECONDS = 6.0

    def __init__(self) -> None:
        """Declare the cmdlet's name/usage/args/help, then register it."""
        super().__init__(
            name="get-url",
            summary="List url associated with a file, or search urls by pattern",
            usage='@1 | get-url OR get-url -url "https://www.youtube.com/watch?v=xx"',
            arg=[SharedArgs.QUERY,
                 SharedArgs.STORE,
                 SharedArgs.URL],
            detail=[
                "- Get url for file: @1 | get-url (requires hash+store from result)",
                '- Search url across stores: get-url -url "www.google.com" (strips protocol & www prefix)',
                '- Wildcard matching: get-url -url "youtube.com*" (matches all youtube.com urls)',
                "- Pattern matching: domain matching ignores protocol (https://, http://, ftp://)",
            ],
            exec=self.run,
        )
        self.register()
2025-12-29 17:05:03 -08:00
@staticmethod
def _normalize_url_for_search(url: str) -> str:
"""Strip protocol and www prefix from URL for searching.
Examples:
https://www.youtube.com/watch?v=xx -> youtube.com/watch?v=xx
http://www.google.com -> google.com
ftp://files.example.com -> files.example.com
"""
url = str(url or "").strip()
2026-01-17 03:37:11 -08:00
# Strip fragment (e.g., #t=10) before matching
url = url.split("#", 1)[0]
# Strip common time/tracking query params for matching
try:
parsed = urlparse(url)
except Exception:
parsed = None
if parsed is not None and parsed.query:
time_keys = {"t", "start", "time_continue", "timestamp", "time", "begin"}
tracking_prefixes = ("utm_",)
try:
pairs = parse_qsl(parsed.query, keep_blank_values=True)
filtered = []
for key, val in pairs:
key_norm = str(key or "").lower()
if key_norm in time_keys:
continue
if key_norm.startswith(tracking_prefixes):
continue
filtered.append((key, val))
if filtered:
url = urlunparse(parsed._replace(query=urlencode(filtered, doseq=True)))
else:
url = urlunparse(parsed._replace(query=""))
except Exception:
pass
2025-12-29 17:05:03 -08:00
# Remove protocol (http://, https://, ftp://, etc.)
url = re.sub(r"^[a-z][a-z0-9+.-]*://", "", url, flags=re.IGNORECASE)
# Remove www. prefix (case-insensitive)
url = re.sub(r"^www\.", "", url, flags=re.IGNORECASE)
return url.lower()
2026-01-17 02:36:06 -08:00
@staticmethod
def _looks_like_url_pattern(value: str) -> bool:
v = str(value or "").strip().lower()
if not v:
return False
if "://" in v:
return True
if v.startswith(("magnet:", "torrent:", "ytdl:", "tidal:", "ftp:", "sftp:", "file:")):
return True
return "." in v and "/" in v
2025-12-29 17:05:03 -08:00
@staticmethod
def _match_url_pattern(url: str, pattern: str) -> bool:
"""Match URL against pattern with wildcard support.
Strips protocol/www from both URL and pattern before matching.
Supports * and ? wildcards.
"""
2026-01-17 02:36:06 -08:00
raw_pattern = str(pattern or "").strip()
2025-12-29 17:05:03 -08:00
normalized_url = Get_Url._normalize_url_for_search(url)
2026-01-17 02:36:06 -08:00
normalized_pattern = Get_Url._normalize_url_for_search(raw_pattern)
2025-12-29 17:05:03 -08:00
2026-01-17 02:36:06 -08:00
looks_like_url = Get_Url._looks_like_url_pattern(raw_pattern)
has_wildcards = "*" in normalized_pattern or (
not looks_like_url and "?" in normalized_pattern
)
2026-01-16 01:47:00 -08:00
if has_wildcards:
return fnmatch(normalized_url, normalized_pattern)
normalized_url_no_slash = normalized_url.rstrip("/")
normalized_pattern_no_slash = normalized_pattern.rstrip("/")
if normalized_pattern_no_slash and normalized_pattern_no_slash == normalized_url_no_slash:
return True
return normalized_pattern in normalized_url
def _execute_search_with_timeout(
self,
backend: Any,
query: str,
limit: int,
store_name: str,
**kwargs: Any,
) -> Optional[List[Any]]:
queue: SimpleQueue[tuple[str, Any]] = SimpleQueue()
def _worker() -> None:
try:
queue.put(("ok", backend.search(query, limit=limit, **kwargs)))
except Exception as exc:
queue.put(("err", exc))
worker = Thread(target=_worker, daemon=True)
worker.start()
worker.join(timeout=self.STORE_SEARCH_TIMEOUT_SECONDS)
if worker.is_alive():
debug(
f"Store '{store_name}' search timed out after {self.STORE_SEARCH_TIMEOUT_SECONDS}s",
file=sys.stderr,
)
return None
if queue.empty():
return []
status, payload = queue.get()
if status == "err":
debug(
f"Store '{store_name}' search failed: {payload}",
file=sys.stderr,
)
return []
return payload or []
2025-12-29 17:05:03 -08:00
2025-12-30 05:48:01 -08:00
@staticmethod
def _extract_first_url(value: Any) -> Optional[str]:
if isinstance(value, str):
v = value.strip()
return v or None
if isinstance(value, (list, tuple)):
for item in value:
if isinstance(item, str) and item.strip():
return item.strip()
return None
2026-01-16 01:47:00 -08:00
@staticmethod
def _extract_urls_from_hit(hit: Any) -> List[str]:
"""Extract candidate URLs directly from a search hit, if present."""
raw = None
try:
raw = get_field(hit, "known_urls")
if not raw:
raw = get_field(hit, "urls")
if not raw:
raw = get_field(hit, "url")
if not raw:
raw = get_field(hit, "source_url") or get_field(hit, "source_urls")
except Exception:
raw = None
if isinstance(raw, str):
val = raw.strip()
return [val] if val else []
if isinstance(raw, (list, tuple)):
out: list[str] = []
for item in raw:
if not isinstance(item, str):
continue
v = item.strip()
if v:
out.append(v)
return out
return []
2025-12-30 05:48:01 -08:00
@staticmethod
def _extract_title_from_result(result: Any) -> Optional[str]:
# Prefer explicit title field.
# Fall back to ResultTable-style columns list.
cols = None
if isinstance(result, dict):
cols = result.get("columns")
else:
cols = getattr(result, "columns", None)
if isinstance(cols, list):
for pair in cols:
try:
if isinstance(pair, (list, tuple)) and len(pair) == 2:
k, v = pair
if str(k or "").strip().lower() in {"title", "name"}:
if isinstance(v, str) and v.strip():
return v.strip()
except Exception:
continue
return None
@staticmethod
2026-01-24 01:38:12 -08:00
def _extract_size_from_hit(hit: Any) -> int | None:
for key in ("size", "file_size", "filesize", "size_bytes"):
try:
val = get_field(hit, key)
except Exception:
val = None
if val is None:
continue
if isinstance(val, (int, float)):
return int(val)
try:
return int(val)
except Exception:
continue
return None
2025-12-30 05:48:01 -08:00
2025-12-30 23:19:02 -08:00
@staticmethod
2026-01-24 01:38:12 -08:00
def _extract_ext_from_hit(hit: Any) -> str:
for key in ("ext", "extension"):
try:
ext_val = get_field(hit, key)
except Exception:
ext_val = None
if isinstance(ext_val, str) and ext_val.strip():
return ext_val.strip().lstrip(".")
return ""
2025-12-30 23:19:02 -08:00
def _search_urls_across_stores(self,
pattern: str,
config: Dict[str,
Any]) -> Tuple[List[UrlItem],
List[str]]:
2025-12-29 17:05:03 -08:00
"""Search for URLs matching pattern across all stores.
Returns:
Tuple of (matching_items, found_stores)
"""
items: List[UrlItem] = []
found_stores: Set[str] = set()
2026-01-16 01:47:00 -08:00
MAX_RESULTS = 256
2025-12-29 17:05:03 -08:00
try:
storage = Store(config)
store_names = storage.list_backends() if hasattr(storage,
"list_backends") else []
2025-12-29 17:05:03 -08:00
if not store_names:
log("Error: No stores configured", file=sys.stderr)
return items, list(found_stores)
for store_name in store_names:
2026-01-16 01:47:00 -08:00
if len(items) >= MAX_RESULTS:
break
2025-12-29 17:05:03 -08:00
try:
backend = storage[store_name]
2025-12-30 05:48:01 -08:00
# Search only URL-bearing records using the backend's URL search capability.
# This avoids the expensive/incorrect "search('*')" scan.
2025-12-29 17:05:03 -08:00
try:
2025-12-30 05:48:01 -08:00
raw_pattern = str(pattern or "").strip()
2026-01-17 02:36:06 -08:00
looks_like_url = self._looks_like_url_pattern(raw_pattern)
has_wildcards = "*" in raw_pattern or (
not looks_like_url and "?" in raw_pattern
)
2025-12-30 05:48:01 -08:00
# If this is a Hydrus backend and the pattern is a single URL,
2026-01-16 01:47:00 -08:00
# normalize it through the official API. Skip for bare domains.
2025-12-30 05:48:01 -08:00
normalized_url = None
2026-01-17 02:36:06 -08:00
normalized_search_pattern = None
if not has_wildcards and looks_like_url:
normalized_search_pattern = self._normalize_url_for_search(
raw_pattern
)
if (
normalized_search_pattern
and normalized_search_pattern != raw_pattern
):
debug(
"get-url normalized raw pattern: %s -> %s",
raw_pattern,
normalized_search_pattern,
)
if hasattr(backend, "get_url_info"):
try:
info = backend.get_url_info(raw_pattern) # type: ignore[attr-defined]
if isinstance(info, dict):
norm = (
2026-01-18 03:23:01 -08:00
info.get("normalized_url")
2026-01-17 02:36:06 -08:00
or info.get("normalized_url")
)
if isinstance(norm, str) and norm.strip():
normalized_url = self._normalize_url_for_search(
norm.strip()
)
except Exception:
pass
if (
normalized_url
and normalized_url != normalized_search_pattern
and normalized_url != raw_pattern
):
debug(
"get-url normalized backend result: %s -> %s",
raw_pattern,
normalized_url,
)
target_pattern = (
normalized_url
or normalized_search_pattern
or raw_pattern
2026-01-16 01:47:00 -08:00
)
if has_wildcards or not target_pattern:
search_query = "url:*"
else:
wrapped_pattern = f"*{target_pattern}*"
search_query = f"url:{wrapped_pattern}"
search_limit = max(1, min(MAX_RESULTS, 1000))
search_results = self._execute_search_with_timeout(
backend,
search_query,
search_limit,
store_name,
pattern_hint=target_pattern,
2026-01-24 01:38:12 -08:00
minimal=True,
2026-01-24 09:11:05 -08:00
url_only=True,
2026-01-16 01:47:00 -08:00
)
if search_results is None:
continue
search_results = search_results or []
2025-12-30 05:48:01 -08:00
for hit in (search_results or []):
2026-01-16 01:47:00 -08:00
if len(items) >= MAX_RESULTS:
break
2025-12-30 05:48:01 -08:00
file_hash = None
if isinstance(hit, dict):
file_hash = hit.get("hash") or hit.get("file_hash")
if not file_hash:
continue
file_hash = str(file_hash)
2026-01-24 01:38:12 -08:00
title = self._extract_title_from_result(hit) or ""
size = self._extract_size_from_hit(hit)
ext = self._extract_ext_from_hit(hit)
2025-12-30 23:19:02 -08:00
2026-01-16 01:47:00 -08:00
urls = self._extract_urls_from_hit(hit)
if not urls:
try:
urls = backend.get_url(file_hash)
except Exception:
urls = []
2025-12-30 05:48:01 -08:00
2026-01-24 01:38:12 -08:00
hit_added = False
2025-12-30 05:48:01 -08:00
for url in (urls or []):
2026-01-16 01:47:00 -08:00
if len(items) >= MAX_RESULTS:
break
2025-12-30 05:48:01 -08:00
if not self._match_url_pattern(str(url), raw_pattern):
2025-12-29 17:05:03 -08:00
continue
2026-01-16 01:47:00 -08:00
2026-01-12 04:05:52 -08:00
from SYS.metadata import normalize_urls
valid = normalize_urls([str(url)])
if not valid:
continue
2025-12-30 05:48:01 -08:00
items.append(
UrlItem(
url=str(url),
hash=str(file_hash),
store=str(store_name),
title=str(title or ""),
2025-12-30 23:19:02 -08:00
size=size,
ext=str(ext or ""),
2025-12-30 05:48:01 -08:00
)
)
2026-01-24 01:38:12 -08:00
hit_added = True
if hit_added:
found_stores.add(str(store_name))
2026-01-16 01:47:00 -08:00
if len(items) >= MAX_RESULTS:
break
2025-12-30 05:48:01 -08:00
except Exception as exc:
debug(
f"Error searching store '{store_name}': {exc}",
file=sys.stderr
)
continue
2025-12-29 17:05:03 -08:00
except KeyError:
continue
except Exception as exc:
debug(
f"Error searching store '{store_name}': {exc}",
file=sys.stderr
)
2025-12-29 17:05:03 -08:00
continue
return items, list(found_stores)
except Exception as exc:
log(f"Error searching stores: {exc}", file=sys.stderr)
return items, []
2025-12-11 12:47:30 -08:00
    def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        """Get url for file via hash+store, or search urls by pattern.

        Two modes:
          * search mode — a ``-url`` pattern (or a positional query that
            looks like a URL/domain) is matched across every configured
            store via _search_urls_across_stores;
          * lookup mode — a piped result (or ``-query "hash:<sha256>"``)
            plus a store name lists the URLs recorded for that one file.

        Returns 0 on success, 1 on bad arguments / no matches / backend error.
        """
        parsed = parse_cmdlet_args(args, self)

        # Check if user provided a URL pattern to search for.
        search_pattern = parsed.get("url")

        # Support positional URL search or a "url:" query prefix.
        if not search_pattern:
            query = parsed.get("query")
            if query:
                if str(query).lower().startswith("url:"):
                    search_pattern = query[4:].strip()
                elif self._looks_like_url_pattern(query) or (
                    "." in str(query) and len(str(query)) < 64
                ):
                    # If it looks like a domain or URL, and isn't a long hash
                    # (sha256 hex is 64 chars), treat a positional query as a
                    # search pattern.
                    search_pattern = query

        if search_pattern:
            # URL search mode: find all files with matching URLs across stores.
            items, stores_searched = self._search_urls_across_stores(search_pattern, config)
            if not items:
                log(f"No urls matching pattern: {search_pattern}", file=sys.stderr)
                return 1

            # NOTE: The CLI can auto-render tables from emitted items. When emitting
            # dataclass objects, the generic-object renderer will include `hash` as a
            # visible column. To keep HASH available for chaining but hidden from the
            # table, emit dicts (dict rendering hides `hash`) and provide an explicit
            # `columns` list to force display order and size formatting.
            display_items: List[Dict[str, Any]] = []
            table = (
                Table(
                    "url",
                    max_columns=5
                )._perseverance(True).set_table("url").set_value_case("preserve")
            )
            table.set_source_command("get-url", ["-url", search_pattern])
            for item in items:
                payload: Dict[str, Any] = {
                    # Keep fields for downstream cmdlets.
                    "hash": item.hash,
                    "store": item.store,
                    "url": item.url,
                    "title": item.title,
                    "size": item.size,
                    "ext": item.ext,
                    # Force the visible table columns + ordering.
                    "columns": [
                        ("Title", item.title),
                        ("Url", item.url),
                        ("Size", item.size),
                        ("Ext", item.ext),
                        ("Store", item.store),
                    ],
                }
                display_items.append(payload)
                table.add_result(payload)
            ctx.set_last_result_table(table if display_items else None, display_items, subject=result)
            # Emit after table state is finalized to prevent side effects in
            # TUI rendering.
            for d in display_items:
                ctx.emit(d)
            log(
                f"Found {len(items)} matching url(s) in {len(stores_searched)} store(s)"
            )
            return 0

        # Original mode: get URLs for a specific file by hash+store.
        query_hash = sh.parse_single_hash_query(parsed.get("query"))
        if parsed.get("query") and not query_hash:
            log("Error: -query must be of the form hash:<sha256>")
            return 1

        # Extract hash and store from the piped result or explicit args.
        file_hash = query_hash or get_field(result, "hash")
        store_name = parsed.get("store") or get_field(result, "store")

        if not file_hash:
            log(
                'Error: No file hash provided (pipe an item or use -query "hash:<sha256>")'
            )
            return 1

        if not store_name:
            log("Error: No store name provided")
            return 1

        # Get the backend and retrieve the file's URLs.
        try:
            storage = Store(config)
            backend = storage[store_name]
            urls = backend.get_url(file_hash)
            # Filter URLs to avoid data leakage from dirty DBs.
            from SYS.metadata import normalize_urls
            urls = normalize_urls(urls)

            from SYS.result_table import ItemDetailView, extract_item_metadata
            # Prepare metadata for the detail view.
            metadata = extract_item_metadata(result)
            # Enrich the metadata with tags if missing.
            if not metadata.get("Tags"):
                try:
                    item_tags = get_field(result, "tag") or get_field(result, "tags") or []
                    row_tags = []
                    if isinstance(item_tags, list):
                        row_tags.extend([str(t) for t in item_tags])
                    elif isinstance(item_tags, str):
                        row_tags.append(item_tags)
                    # Also collect tags from the backend, best-effort.
                    if file_hash and store_name:
                        try:
                            # Re-use the existing backend variable.
                            if backend and hasattr(backend, "get_tag"):
                                b_tags, _ = backend.get_tag(file_hash)
                                if b_tags:
                                    row_tags.extend([str(t) for t in b_tags])
                        except Exception:
                            pass
                    if row_tags:
                        # De-duplicate and sort for stable display.
                        row_tags = sorted(list(set(row_tags)))
                        metadata["Tags"] = ", ".join(row_tags)
                except Exception:
                    pass
            if file_hash:
                metadata["Hash"] = file_hash
            if store_name:
                metadata["Store"] = store_name

            table = (
                ItemDetailView(
                    "Urls",
                    item_metadata=metadata,
                    max_columns=1
                )._perseverance(True).set_table("url").set_value_case("preserve")
            )
            table.set_source_command("get-url", [])
            items: List[UrlItem] = []
            for u in list(urls or []):
                u = str(u or "").strip()
                if not u:
                    continue
                row = table.add_row()
                row.add_column("Url", u)
                item = UrlItem(url=u, hash=file_hash, store=str(store_name))
                items.append(item)
            # Use overlay mode to avoid "merging" with the previous status/table
            # state. This is idiomatic for detail views and prevents the search
            # table from being contaminated by partial re-renders.
            ctx.set_last_result_table_overlay(table, items, subject=result)
            # Emit items at the end for pipeline continuity.
            for item in items:
                ctx.emit(item)
            if not items:
                # Still log it, but the panel will show the item context.
                log("No url found", file=sys.stderr)
            return 0

        except KeyError:
            log(f"Error: Storage backend '{store_name}' not configured")
            return 1
        except Exception as exc:
            log(f"Error retrieving url: {exc}", file=sys.stderr)
            return 1
2025-12-01 01:10:16 -08:00
2025-12-11 12:47:30 -08:00
2025-12-29 17:05:03 -08:00
# Import debug function from logger if available.
# NOTE(review): call sites above pass ``file=sys.stderr`` to debug(); the
# fallback swallows it via **kwargs — confirm SYS.logger.debug accepts it too.
try:
    from SYS.logger import debug
except ImportError:
    def debug(*args, **kwargs):
        pass  # Fallback no-op


# Module-level singleton; Get_Url.__init__ registers the cmdlet on construction.
CMDLET = Get_Url()