df
Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled
@@ -1,8 +1,11 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import Any, Dict, List, Sequence
+from typing import Any, Dict, List, Sequence, Optional, Set, Tuple
 import sys
+import re
+from fnmatch import fnmatch
+from urllib.parse import urlparse
 
 import pipeline as ctx
 from . import _shared as sh
@@ -26,51 +29,199 @@ class UrlItem:
 
 
 class Get_Url(Cmdlet):
-    """Get url associated with files via hash+store."""
+    """Get url associated with files via hash+store, or search urls by pattern."""
 
     def __init__(self) -> None:
         super().__init__(
             name="get-url",
-            summary="List url associated with a file",
-            usage="@1 | get-url",
-            arg=[
-                SharedArgs.QUERY,
-                SharedArgs.STORE,
-            ],
+            summary="List url associated with a file, or search urls by pattern",
+            usage='@1 | get-url OR get-url -url "https://www.youtube.com/watch?v=xx"',
+            arg=[SharedArgs.QUERY, SharedArgs.STORE, SharedArgs.URL],
+            detail=[
+                "- Lists all url associated with file identified by hash+store",
+                "- Get url for file: @1 | get-url (requires hash+store from result)",
+                '- Search url across stores: get-url -url "www.google.com" (strips protocol & www prefix)',
+                '- Wildcard matching: get-url -url "youtube.com*" (matches all youtube.com urls)',
+                "- Pattern matching: domain matching ignores protocol (https://, http://, ftp://)",
+            ],
             exec=self.run,
         )
         self.register()
 
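The two invocation modes registered above, side by side (the -url value is illustrative; @1 refers to the first item of the previous pipeline result, per the usage string):

    # File mode: pipe an item that carries hash+store
    @1 | get-url

    # Search mode: wildcard match across every configured store
    get-url -url "youtube.com*"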
+    @staticmethod
+    def _normalize_url_for_search(url: str) -> str:
+        """Strip protocol and www prefix from URL for searching.
+
+        Examples:
+            https://www.youtube.com/watch?v=xx -> youtube.com/watch?v=xx
+            http://www.google.com -> google.com
+            ftp://files.example.com -> files.example.com
+        """
+        url = str(url or "").strip()
+
+        # Remove protocol (http://, https://, ftp://, etc.)
+        url = re.sub(r"^[a-z][a-z0-9+.-]*://", "", url, flags=re.IGNORECASE)
+
+        # Remove www. prefix (case-insensitive)
+        url = re.sub(r"^www\.", "", url, flags=re.IGNORECASE)
+
+        return url.lower()
+
+    @staticmethod
+    def _match_url_pattern(url: str, pattern: str) -> bool:
+        """Match URL against pattern with wildcard support.
+
+        Strips protocol/www from both URL and pattern before matching.
+        Supports * and ? wildcards.
+        """
+        normalized_url = Get_Url._normalize_url_for_search(url)
+        normalized_pattern = Get_Url._normalize_url_for_search(pattern)
+
+        # Use fnmatch for wildcard matching (* and ?)
+        return fnmatch(normalized_url, normalized_pattern)
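A minimal standalone sketch of the matching semantics above (the two helpers are re-stated so the snippet runs on its own; the behaviour follows from re.sub and fnmatch, nothing project-specific):

    import re
    from fnmatch import fnmatch

    def normalize(url: str) -> str:
        # Same steps as _normalize_url_for_search: drop scheme, drop www., lowercase.
        url = str(url or "").strip()
        url = re.sub(r"^[a-z][a-z0-9+.-]*://", "", url, flags=re.IGNORECASE)
        url = re.sub(r"^www\.", "", url, flags=re.IGNORECASE)
        return url.lower()

    # Protocol, www prefix, and case are ignored on both sides.
    assert fnmatch(normalize("https://www.youtube.com/watch?v=xx"),
                   normalize("YOUTUBE.com/watch?v=xx"))

    # Without a wildcard the match is exact, so a bare domain does not
    # match urls that carry a path.
    assert not fnmatch(normalize("https://google.com/search"), normalize("google.com"))
    assert fnmatch(normalize("https://google.com/search"), normalize("google.com*"))

One consequence of reusing fnmatch: "?" in a pattern is itself a single-character wildcard, so a pattern containing a query string ("watch?v=xx") also matches near misses where some other character sits in the "?" position.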
+    def _search_urls_across_stores(
+        self, pattern: str, config: Dict[str, Any]
+    ) -> Tuple[List[UrlItem], List[str]]:
+        """Search for URLs matching pattern across all stores.
+
+        Returns:
+            Tuple of (matching_items, found_stores)
+        """
+        items: List[UrlItem] = []
+        found_stores: Set[str] = set()
+
+        try:
+            storage = Store(config)
+            store_names = storage.list_backends() if hasattr(storage, "list_backends") else []
+
+            if not store_names:
+                log("Error: No stores configured", file=sys.stderr)
+                return items, list(found_stores)
+
+            for store_name in store_names:
+                try:
+                    backend = storage[store_name]
+
+                    # Backends expose no list-all-files API, so enumerate
+                    # candidates with a broad search query and filter their
+                    # urls here. Known limitation: only the first 1000
+                    # results per store are considered.
+                    try:
+                        search_results = backend.search("*", limit=1000)
+                        if search_results:
+                            for result in search_results:
+                                file_hash = result.get("hash") or result.get("file_hash")
+                                if not file_hash:
+                                    continue
+
+                                try:
+                                    urls = backend.get_url(file_hash)
+                                    if urls:
+                                        for url in urls:
+                                            if self._match_url_pattern(str(url), pattern):
+                                                items.append(
+                                                    UrlItem(
+                                                        url=str(url),
+                                                        hash=file_hash,
+                                                        store=store_name,
+                                                    )
+                                                )
+                                                found_stores.add(store_name)
+                                except Exception:
+                                    # File without url info; ignore.
+                                    pass
+                    except Exception:
+                        # Backend might not support search; skip it.
+                        pass
+
+                except KeyError:
+                    continue
+                except Exception as exc:
+                    debug(f"Error searching store '{store_name}': {exc}", file=sys.stderr)
+                    continue
+
+            return items, list(found_stores)
+
+        except Exception as exc:
+            log(f"Error searching stores: {exc}", file=sys.stderr)
+            return items, []
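The search path above assumes only two backend methods. A minimal in-memory fake makes the contract visible (names and return shapes are inferred from the calls in this diff, not from a published interface):

    from typing import Any, Dict, List

    class FakeBackend:
        """Just enough surface for _search_urls_across_stores."""

        def __init__(self, files: Dict[str, List[str]]) -> None:
            self._files = files  # hash -> list of urls

        def search(self, query: str, limit: int = 1000) -> List[Dict[str, Any]]:
            # A broad "*" query is expected to enumerate candidate files.
            return [{"hash": h} for h in list(self._files)[:limit]]

        def get_url(self, file_hash: str) -> List[str]:
            return self._files.get(file_hash, [])

    backend = FakeBackend({"abc123": ["https://www.youtube.com/watch?v=xx"]})
    hits = [u for r in backend.search("*") for u in backend.get_url(r["hash"])]
    print(hits)  # ['https://www.youtube.com/watch?v=xx']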
     def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
-        """Get url for file via hash+store backend."""
+        """Get url for file via hash+store, or search urls by pattern."""
         parsed = parse_cmdlet_args(args, self)
 
+        # Check if user provided a URL pattern to search for
+        search_pattern = parsed.get("url")
+
+        if search_pattern:
+            # URL search mode: find all files with matching URLs across stores
+            items, stores_searched = self._search_urls_across_stores(search_pattern, config)
+
+            if not items:
+                log(f"No urls matching pattern: {search_pattern}", file=sys.stderr)
+                return 1
+
+            # Create result table
+            from result_table import ResultTable
+
+            table = (
+                ResultTable("URL Search Results", max_columns=3)
+                .set_preserve_order(True)
+                .set_table("urls")
+                .set_value_case("preserve")
+            )
+            table.set_source_command("get-url", ["-url", search_pattern])
+
+            # Group by store for display
+            by_store: Dict[str, List[UrlItem]] = {}
+            for item in items:
+                if item.store not in by_store:
+                    by_store[item.store] = []
+                by_store[item.store].append(item)
+
+            # Add rows grouped by store; only the first row of each group
+            # carries the store name.
+            for store_name in sorted(by_store.keys()):
+                store_items = by_store[store_name]
+                for idx, item in enumerate(store_items):
+                    row = table.add_row()
+                    if idx == 0:
+                        row.add_column("Store", store_name)
+                    else:
+                        row.add_column("Store", "")
+                    row.add_column("Url", item.url)
+                    row.add_column("Hash", item.hash[:16])  # Show first 16 chars
+                    ctx.emit(item)
+
+            ctx.set_last_result_table(table, items, subject=result)
+            log(f"Found {len(items)} matching url(s) in {len(stores_searched)} store(s)")
+            return 0
+
+        # Original mode: Get URLs for a specific file by hash+store
         query_hash = sh.parse_single_hash_query(parsed.get("query"))
         if parsed.get("query") and not query_hash:
             log("Error: -query must be of the form hash:<sha256>")
             return 1
 
         # Extract hash and store from result or args
         file_hash = query_hash or get_field(result, "hash")
         store_name = parsed.get("store") or get_field(result, "store")
 
         if not file_hash:
-            log("Error: No file hash provided (pipe an item or use -query \"hash:<sha256>\")")
+            log('Error: No file hash provided (pipe an item or use -query "hash:<sha256>")')
             return 1
 
         if not store_name:
             log("Error: No store name provided")
             return 1
 
         # Normalize hash
         file_hash = normalize_hash(file_hash)
         if not file_hash:
             log("Error: Invalid hash format")
             return 1
 
         # Get backend and retrieve url
         try:
             storage = Store(config)
@@ -111,7 +262,7 @@ class Get_Url(Cmdlet):
             log("No url found", file=sys.stderr)
 
             return 0
 
         except KeyError:
             log(f"Error: Storage backend '{store_name}' not configured")
             return 1
@@ -120,6 +271,13 @@ class Get_Url(Cmdlet):
             return 1
 
 
+# Import debug function from logger if available
+try:
+    from SYS.logger import debug
+except ImportError:
+
+    def debug(*args, **kwargs):
+        pass  # Fallback no-op
 
 
 CMDLET = Get_Url()
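The search branch in run() blanks repeated store names so grouped rows scan cleanly. The same idiom in isolation (plain print instead of ResultTable, purely illustrative data):

    rows = [("local", "youtube.com/watch?v=xx"),
            ("local", "youtube.com/watch?v=yy"),
            ("remote", "google.com")]

    by_store: dict[str, list[str]] = {}
    for store, url in rows:
        by_store.setdefault(store, []).append(url)

    for store in sorted(by_store):
        for idx, url in enumerate(by_store[store]):
            # Only the first row of each group carries the store label.
            print(f"{store if idx == 0 else '':8} {url}")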