# Medios-Macina/cmdlet/get_url.py

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, List, Sequence, Optional, Set, Tuple

import sys
import re
from fnmatch import fnmatch
from urllib.parse import urlparse

import pipeline as ctx
from . import _shared as sh

Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash = (
    sh.Cmdlet,
    sh.SharedArgs,
    sh.parse_cmdlet_args,
    sh.get_field,
    sh.normalize_hash,
)

from SYS.logger import log
from Store import Store


@dataclass
class UrlItem:
    url: str
    hash: str
    store: str


class Get_Url(Cmdlet):
    """Get URLs associated with a file via hash+store, or search URLs by pattern."""

    def __init__(self) -> None:
        super().__init__(
            name="get-url",
            summary="List URLs associated with a file, or search URLs by pattern",
            usage='@1 | get-url OR get-url -url "https://www.youtube.com/watch?v=xx"',
            arg=[SharedArgs.QUERY, SharedArgs.STORE, SharedArgs.URL],
            detail=[
                "- Get URLs for a file: @1 | get-url (requires hash+store from the piped result)",
                '- Search URLs across stores: get-url -url "www.google.com" (strips protocol & www prefix)',
                '- Wildcard matching: get-url -url "youtube.com*" (matches all youtube.com URLs)',
                "- Pattern matching: domain matching ignores the protocol (https://, http://, ftp://)",
            ],
            exec=self.run,
        )
        self.register()

    @staticmethod
    def _normalize_url_for_search(url: str) -> str:
        """Strip the protocol and www prefix from a URL for searching.

        Examples:
            https://www.youtube.com/watch?v=xx -> youtube.com/watch?v=xx
            http://www.google.com -> google.com
            ftp://files.example.com -> files.example.com
        """
        url = str(url or "").strip()
        # Remove the protocol (http://, https://, ftp://, etc.)
        url = re.sub(r"^[a-z][a-z0-9+.-]*://", "", url, flags=re.IGNORECASE)
        # Remove the www. prefix (case-insensitive)
        url = re.sub(r"^www\.", "", url, flags=re.IGNORECASE)
        return url.lower()

    @staticmethod
    def _match_url_pattern(url: str, pattern: str) -> bool:
        """Match a URL against a pattern with wildcard support.

        Strips protocol/www from both the URL and the pattern before matching.
        Supports * and ? wildcards.
        """
        normalized_url = Get_Url._normalize_url_for_search(url)
        normalized_pattern = Get_Url._normalize_url_for_search(pattern)
        # Use fnmatch for wildcard matching (* and ?)
        return fnmatch(normalized_url, normalized_pattern)
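
    # Illustrative examples (assumed inputs; not exercised by the cmdlet itself):
    #   _match_url_pattern("https://www.youtube.com/watch?v=xx", "youtube.com*")  -> True
    #   _match_url_pattern("https://vimeo.com/12345", "youtube.com*")             -> False
    # Both sides are normalized first, so the protocol and "www." prefix never affect the match.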

    def _search_urls_across_stores(
        self, pattern: str, config: Dict[str, Any]
    ) -> Tuple[List[UrlItem], List[str]]:
        """Search for URLs matching a pattern across all stores.

        Returns:
            Tuple of (matching_items, found_stores)
        """
        items: List[UrlItem] = []
        found_stores: Set[str] = set()
        try:
            storage = Store(config)
            store_names = storage.list_backends() if hasattr(storage, "list_backends") else []
            if not store_names:
                log("Error: No stores configured", file=sys.stderr)
                return items, list(found_stores)
            for store_name in store_names:
                try:
                    backend = storage[store_name]
                    # Each backend should ideally expose get_all_files() or similar;
                    # for now, use a broad search query to enumerate candidate files.
                    try:
                        # Try to get files via search (backend-specific)
                        search_results = backend.search("*", limit=1000)
                        if search_results:
                            for result in search_results:
                                file_hash = result.get("hash") or result.get("file_hash")
                                if not file_hash:
                                    continue
                                try:
                                    urls = backend.get_url(file_hash)
                                    if urls:
                                        for url in urls:
                                            if self._match_url_pattern(str(url), pattern):
                                                items.append(
                                                    UrlItem(
                                                        url=str(url),
                                                        hash=file_hash,
                                                        store=store_name,
                                                    )
                                                )
                                                found_stores.add(store_name)
                                except Exception:
                                    pass
                    except Exception:
                        # Backend might not support search; skip it
                        pass
                except KeyError:
                    continue
                except Exception as exc:
                    debug(f"Error searching store '{store_name}': {exc}", file=sys.stderr)
                    continue
            return items, list(found_stores)
        except Exception as exc:
            log(f"Error searching stores: {exc}", file=sys.stderr)
            return items, []

    def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        """Get URLs for a file via hash+store, or search URLs by pattern."""
        parsed = parse_cmdlet_args(args, self)

        # Check if the user provided a URL pattern to search for
        search_pattern = parsed.get("url")
        if search_pattern:
            # URL search mode: find all files with matching URLs across stores
            items, stores_searched = self._search_urls_across_stores(search_pattern, config)
            if not items:
                log(f"No urls matching pattern: {search_pattern}", file=sys.stderr)
                return 1

            # Create the result table
            from result_table import ResultTable
            table = (
                ResultTable("URL Search Results", max_columns=3)
                .set_preserve_order(True)
                .set_table("urls")
                .set_value_case("preserve")
            )
            table.set_source_command("get-url", ["-url", search_pattern])

            # Group by store for display
            by_store: Dict[str, List[UrlItem]] = {}
            for item in items:
                if item.store not in by_store:
                    by_store[item.store] = []
                by_store[item.store].append(item)

            # Add rows grouped by store; only the first row of each group shows the store name
            for store_name in sorted(by_store.keys()):
                store_items = by_store[store_name]
                for idx, item in enumerate(store_items):
                    row = table.add_row()
                    if idx == 0:
                        row.add_column("Store", store_name)
                    else:
                        row.add_column("Store", "")
                    row.add_column("Url", item.url)
                    row.add_column("Hash", item.hash[:16])  # Show the first 16 chars
                    ctx.emit(item)

            ctx.set_last_result_table(table if items else None, items, subject=result)
            log(f"Found {len(items)} matching url(s) in {len(stores_searched)} store(s)")
            return 0

        # Original mode: get URLs for a specific file by hash+store
        query_hash = sh.parse_single_hash_query(parsed.get("query"))
        if parsed.get("query") and not query_hash:
            log("Error: -query must be of the form hash:<sha256>")
            return 1

        # Extract hash and store from result or args
        file_hash = query_hash or get_field(result, "hash")
        store_name = parsed.get("store") or get_field(result, "store")

        if not file_hash:
            log('Error: No file hash provided (pipe an item or use -query "hash:<sha256>")')
            return 1

        if not store_name:
            log("Error: No store name provided")
            return 1

        # Normalize hash
        file_hash = normalize_hash(file_hash)
        if not file_hash:
            log("Error: Invalid hash format")
            return 1

        # Get backend and retrieve url
        try:
            storage = Store(config)
            backend = storage[store_name]
            urls = backend.get_url(file_hash)

            from result_table import ResultTable
            title = str(get_field(result, "title") or "").strip()
            table_title = "Title"
            if title:
                table_title = f"Title: {title}"

            table = (
                ResultTable(table_title, max_columns=1)
                .set_preserve_order(True)
                .set_table("url")
                .set_value_case("preserve")
            )
            table.set_source_command("get-url", [])

            items: List[UrlItem] = []
            for u in list(urls or []):
                u = str(u or "").strip()
                if not u:
                    continue
                row = table.add_row()
                row.add_column("Url", u)
                item = UrlItem(url=u, hash=file_hash, store=str(store_name))
                items.append(item)
                ctx.emit(item)

            # Make this a real result table so @.. / @,, can navigate it
            ctx.set_last_result_table(table if items else None, items, subject=result)
            if not items:
                log("No url found", file=sys.stderr)
            return 0

        except KeyError:
            log(f"Error: Storage backend '{store_name}' not configured")
            return 1
        except Exception as exc:
            log(f"Error retrieving url: {exc}", file=sys.stderr)
            return 1


# Import the debug function from the logger if available; fall back to a no-op.
try:
    from SYS.logger import debug
except ImportError:
    def debug(*args, **kwargs):
        pass  # Fallback no-op


CMDLET = Get_Url()
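
# Example pipeline usage (illustrative; the exact shell syntax may differ in Medios-Macina):
#   @1 | get-url                    -> list URLs stored for the piped file (hash + store)
#   get-url -url "youtube.com*"     -> search for matching URLs across all configured stores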