d

2026-01-17 03:37:11 -08:00
parent c6fd6b4224
commit 5e76c44155
2 changed files with 294 additions and 22 deletions
@@ -3060,20 +3060,17 @@ def check_url_exists_in_storage(
    in_pipeline = bool(stage_ctx is not None or ("|" in str(current_cmd_text or "")))
    if in_pipeline:
        try:
-            cached_cmd = pipeline_context.load_value("preflight.url_duplicates.command", default="")
+            already_checked = bool(
-            cached_decision = pipeline_context.load_value("preflight.url_duplicates.continue", default=None)
+                pipeline_context.load_value(
                    "preflight.url_duplicates.checked", default=False
                )
            )
        except Exception:
-            cached_cmd = ""
+            already_checked = False
            cached_decision = None
-        if cached_decision is not None and str(cached_cmd or "") == str(current_cmd_text or ""):
+        if already_checked:
-            if bool(cached_decision):
+            debug("Bulk URL preflight: already checked in pipeline; skipping duplicate check")
            return True
            try:
                pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
            except Exception:
                pass
            return False
    def _load_preflight_cache() -> Dict[str, Any]:
        try:
@@ -3088,6 +3085,40 @@ def check_url_exists_in_storage(
        except Exception:
            pass
    def _mark_preflight_checked() -> None:
        """Record that the URL duplicate preflight has run for this pipeline.

        Does nothing outside a pipeline. Otherwise the flag is written both to
        the pipeline context value ``preflight.url_duplicates.checked`` and to
        the preflight cache (top-level ``url_duplicates_checked`` plus
        ``url_duplicates["checked"]``).
        """
        if in_pipeline:
            # Best effort: a failing context store must not abort the preflight.
            try:
                pipeline_context.store_value("preflight.url_duplicates.checked", True)
            except Exception:
                pass

            cache = _load_preflight_cache()
            cache["url_duplicates_checked"] = True

            # Reuse the existing sub-cache when it is a dict; otherwise start fresh.
            existing = cache.get("url_duplicates")
            dup_entry = existing if isinstance(existing, dict) else {}
            dup_entry["checked"] = True

            cache["url_duplicates"] = dup_entry
            _store_preflight_cache(cache)
    if in_pipeline:
        try:
            cached_cmd = pipeline_context.load_value("preflight.url_duplicates.command", default="")
            cached_decision = pipeline_context.load_value("preflight.url_duplicates.continue", default=None)
        except Exception:
            cached_cmd = ""
            cached_decision = None
        if cached_decision is not None and str(cached_cmd or "") == str(current_cmd_text or ""):
            _mark_preflight_checked()
            if bool(cached_decision):
                return True
            try:
                pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
            except Exception:
                pass
            return False
    unique_urls: List[str] = []
    for u in urls or []:
        s = str(u or "").strip()
@@ -3107,6 +3138,46 @@ def check_url_exists_in_storage(
        except Exception:
            return False
    def _normalize_url_for_search(value: str) -> str:
        url = str(value or "").strip()
        # Strip fragment (e.g., #t=10) before matching
        url = url.split("#", 1)[0]
        # Strip common time/tracking query params for matching
        try:
            parsed = urlparse(url)
        except Exception:
            parsed = None
        if parsed is not None and parsed.query:
            time_keys = {"t", "start", "time_continue", "timestamp", "time", "begin"}
            tracking_prefixes = ("utm_",)
            try:
                pairs = parse_qsl(parsed.query, keep_blank_values=True)
                filtered = []
                for key, val in pairs:
                    key_norm = str(key or "").lower()
                    if key_norm in time_keys:
                        continue
                    if key_norm.startswith(tracking_prefixes):
                        continue
                    filtered.append((key, val))
                if filtered:
                    url = urlunparse(parsed._replace(query=urlencode(filtered, doseq=True)))
                else:
                    url = urlunparse(parsed._replace(query=""))
            except Exception:
                pass
        # Remove protocol (http://, https://, ftp://, etc.)
        url = re.sub(r"^[a-z][a-z0-9+.-]*://", "", url, flags=re.IGNORECASE)
        # Remove www. prefix (case-insensitive)
        url = re.sub(r"^www\.", "", url, flags=re.IGNORECASE)
        return url.lower()
    def _expand_url_variants(value: str) -> List[str]:
        if not _httpish(value):
            return []
@@ -3121,6 +3192,51 @@ def check_url_exists_in_storage(
        out: List[str] = []
        def _add_variant(candidate: str) -> None:
            """Offer *candidate* and its derived spellings to ``_maybe_add``.

            The spellings offered are: the candidate itself, its lowercase
            form, and, when the hostname starts with ``www.``, the same URL
            with that prefix removed (port and user info preserved) plus the
            lowercase form of that URL.
            """
            _maybe_add(candidate)

            try:
                lowered = str(candidate or "").lower()
            except Exception:
                lowered = ""
            if lowered and lowered != candidate:
                _maybe_add(lowered)

            try:
                parts = urlparse(candidate)
            except Exception:
                return

            hostname = (parts.hostname or "").strip().lower()
            if not hostname.startswith("www."):
                return
            bare_host = hostname[4:]
            if not bare_host:
                return

            # Rebuild the netloc around the www-less host.
            netloc = bare_host
            try:
                port = parts.port
            except Exception:
                # Reading the port can fail; in that case omit it.
                port = None
            if port:
                netloc = f"{netloc}:{port}"

            try:
                user = parts.username
                secret = parts.password
            except Exception:
                user = secret = None
            credentials = user or ""
            if secret:
                credentials = f"{credentials}:{secret}"
            if credentials:
                netloc = f"{credentials}@{netloc}"

            without_www = urlunparse(parts._replace(netloc=netloc))
            _maybe_add(without_www)

            try:
                without_www_lower = without_www.lower()
            except Exception:
                without_www_lower = ""
            if without_www_lower and without_www_lower != without_www:
                _maybe_add(without_www_lower)
        def _maybe_add(candidate: str) -> None:
            if not candidate or candidate == value:
                return
@@ -3128,7 +3244,7 @@ def check_url_exists_in_storage(
                out.append(candidate)
        if parsed.fragment:
-            _maybe_add(urlunparse(parsed._replace(fragment="")))
+            _add_variant(urlunparse(parsed._replace(fragment="")))
        time_keys = {"t", "start", "time_continue", "timestamp", "time", "begin"}
        tracking_prefixes = ("utm_",)
@@ -3153,7 +3269,7 @@ def check_url_exists_in_storage(
            if removed:
                new_query = urlencode(filtered_pairs, doseq=True) if filtered_pairs else ""
-                _maybe_add(urlunparse(parsed._replace(query=new_query, fragment="")))
+                _add_variant(urlunparse(parsed._replace(query=new_query, fragment="")))
        return out
@@ -3176,13 +3292,35 @@ def check_url_exists_in_storage(
                continue
            if n2 not in filtered:
                filtered.append(n2)
        lowered: List[str] = []
        for n2 in filtered:
            try:
                lower = n2.lower()
            except Exception:
                lower = ""
            if lower and lower != n2 and lower not in filtered and lower not in lowered:
                lowered.append(lower)
        normalized: List[str] = []
        for n2 in filtered:
            norm = _normalize_url_for_search(n2)
            if norm and norm not in normalized and norm not in filtered:
                normalized.append(norm)
        expanded: List[str] = []
        for n2 in filtered:
            for extra in _expand_url_variants(n2):
-                if extra not in expanded and extra not in filtered:
+                if extra not in expanded and extra not in filtered and extra not in lowered:
                    expanded.append(extra)
                norm_extra = _normalize_url_for_search(extra)
                if (
                    norm_extra
                    and norm_extra not in normalized
                    and norm_extra not in filtered
                    and norm_extra not in expanded
                    and norm_extra not in lowered
                ):
                    normalized.append(norm_extra)
-        combined = filtered + expanded
+        combined = filtered + expanded + lowered + normalized
        url_needles[u] = combined if combined else [u]
    if in_pipeline:
@@ -3196,15 +3334,28 @@ def check_url_exists_in_storage(
        if cached_set:
            all_cached = True
            for original_url, needles in url_needles.items():
-                if original_url in cached_set:
+                original_cached = str(original_url or "") in cached_set
                needles_cached = True
                if original_cached:
                    for needle in (needles or []):
                        needle_text = str(needle or "")
                        if not needle_text:
                            continue
-                if any(n in cached_set for n in (needles or [])):
+                        if needle_text not in cached_set:
                            needles_cached = False
                            break
                else:
                    needles_cached = False
                if original_cached and needles_cached:
                    continue
                all_cached = False
                break
            if all_cached:
                debug("Bulk URL preflight: cached for pipeline; skipping duplicate check")
                _mark_preflight_checked()
                return True
    def _search_backend_url_hits(
@@ -3215,14 +3366,103 @@ def check_url_exists_in_storage(
    ) -> Optional[Dict[str, Any]]:
        backend_hits: List[Dict[str, Any]] = []
        for needle in (needles or [])[:3]:
            needle_text = str(needle or "").strip()
            if not needle_text:
                continue
            search_needle = _normalize_url_for_search(needle_text) or needle_text
            query = f"url:*{search_needle}*"
            try:
-                backend_hits = backend.search(f"url:{needle}", limit=1) or []
+                backend_hits = backend.search(query, limit=1) or []
                if backend_hits:
                    break
            except Exception:
                continue
        if not backend_hits:
            def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
                """Tell whether the normalized pattern occurs in the normalized candidate.

                Both arguments go through ``_normalize_url_for_search``. The
                result is False when either normalized value is empty.
                """
                needle = _normalize_url_for_search(pattern_text)
                haystack = _normalize_url_for_search(candidate_url)
                if needle and haystack:
                    # Substring containment also covers the exact-match case.
                    return needle in haystack
                return False
            fallback_hits: List[Dict[str, Any]] = []
            try:
                fallback_hits = backend.search("url:*", limit=200) or []
            except Exception:
                fallback_hits = []
            for hit in fallback_hits:
                url_values: List[str] = []
                try:
                    raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
                    if isinstance(raw_urls, str) and raw_urls.strip():
                        url_values.append(raw_urls.strip())
                    elif isinstance(raw_urls, (list, tuple, set)):
                        for item in raw_urls:
                            if isinstance(item, str) and item.strip():
                                url_values.append(item.strip())
                except Exception:
                    url_values = []
                if not url_values:
                    try:
                        file_hash = hit.get("hash") if isinstance(hit, dict) else None
                        if file_hash:
                            fetched = backend.get_url(str(file_hash))
                            if isinstance(fetched, str) and fetched.strip():
                                url_values.append(fetched.strip())
                            elif isinstance(fetched, (list, tuple, set)):
                                for item in fetched:
                                    if isinstance(item, str) and item.strip():
                                        url_values.append(item.strip())
                    except Exception:
                        pass
                if not url_values:
                    continue
                matched = False
                for url_value in url_values:
                    for needle in (needles or []):
                        if _match_normalized_url(str(needle or ""), str(url_value or "")):
                            matched = True
                            break
                    if matched:
                        break
                if not matched:
                    continue
                title = "(exists)"
                try:
                    title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
                except Exception:
                    title = "(exists)"
                file_hash = ""
                try:
                    file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
                except Exception:
                    file_hash = ""
                return {
                    "title": str(title),
                    "store": str(hit.get("store") or backend_name),
                    "hash": str(file_hash or ""),
                    "ext": "",
                    "size": None,
                    "url": original_url,
                    "columns": [
                        ("Title", str(title)),
                        ("Store", str(hit.get("store") or backend_name)),
                        ("Hash", str(file_hash or "")),
                        ("URL", original_url),
                    ],
                }
            return None
        hit = backend_hits[0]
@@ -3326,7 +3566,7 @@ def check_url_exists_in_storage(
                found_hash: Optional[str] = None
                found = False
-                for needle in (needles or [])[:3]:
+                for needle in (needles or [])[:6]:
                    if not _httpish(needle):
                        continue
                    try:
@@ -3415,6 +3655,7 @@ def check_url_exists_in_storage(
            url_dup_cache["urls"] = sorted(cached_set)
            preflight_cache["url_duplicates"] = url_dup_cache
            _store_preflight_cache(preflight_cache)
            _mark_preflight_checked()
        return True
    table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10)
@@ -3477,6 +3718,8 @@ def check_url_exists_in_storage(
                    pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
                 except Exception:
                    pass
            _mark_preflight_checked()
            return False
        _mark_preflight_checked()
    return True
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Sequence, Optional, Set, Tuple
 import sys
 import re
 from fnmatch import fnmatch
-from urllib.parse import urlparse
+from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
 from . import _shared as sh
 Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash = (
@@ -67,6 +67,35 @@ class Get_Url(Cmdlet):
        """
        url = str(url or "").strip()
        # Strip fragment (e.g., #t=10) before matching
        url = url.split("#", 1)[0]
        # Strip common time/tracking query params for matching
        try:
            parsed = urlparse(url)
        except Exception:
            parsed = None
        if parsed is not None and parsed.query:
            time_keys = {"t", "start", "time_continue", "timestamp", "time", "begin"}
            tracking_prefixes = ("utm_",)
            try:
                pairs = parse_qsl(parsed.query, keep_blank_values=True)
                filtered = []
                for key, val in pairs:
                    key_norm = str(key or "").lower()
                    if key_norm in time_keys:
                        continue
                    if key_norm.startswith(tracking_prefixes):
                        continue
                    filtered.append((key, val))
                if filtered:
                    url = urlunparse(parsed._replace(query=urlencode(filtered, doseq=True)))
                else:
                    url = urlunparse(parsed._replace(query=""))
            except Exception:
                pass
        # Remove protocol (http://, https://, ftp://, etc.)
        url = re.sub(r"^[a-z][a-z0-9+.-]*://", "", url, flags=re.IGNORECASE)