commit a03eb0d1be (parent 52a79b0086)
Author: nose
Date:   2025-12-14 00:53:52 -08:00

24 changed files with 2785 additions and 1868 deletions

View File

@@ -30,6 +30,8 @@ def _resolve_file_hash(db_hash: Optional[str], file_path: Path) -> Optional[str]
return _normalize_hash(file_path.stem)
class Folder(Store):
""""""
# Track which locations have already been migrated to avoid repeated migrations
@@ -359,6 +361,17 @@ class Folder(Store):
else:
shutil.copy2(str(file_path), str(save_file))
debug(f"Local copy: {save_file}", file=sys.stderr)
# Best-effort: capture duration for media
duration_value: float | None = None
try:
from SYS.utils import ffprobe
probe = ffprobe(str(save_file))
duration = probe.get("duration")
if isinstance(duration, (int, float)) and duration > 0:
duration_value = float(duration)
except Exception:
duration_value = None
# Save to database
with API_folder_store(Path(self._location)) as db:
@@ -368,7 +381,8 @@ class Folder(Store):
db.save_metadata(save_file, {
'hash': file_hash,
'ext': ext_clean,
'size': file_path.stat().st_size
'size': file_path.stat().st_size,
'duration': duration_value,
})
# Add tags if provided
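The duration capture above relies on a `SYS.utils.ffprobe` helper returning a dict with a numeric `duration`. As a rough sketch of the contract being assumed (the project's actual helper may differ and return more fields), an equivalent could be built on the ffprobe CLI's JSON output:

# Illustrative sketch only -- not part of this commit. Assumes the ffprobe CLI is on PATH;
# the real SYS.utils.ffprobe may behave differently.
import json
import subprocess

def ffprobe_sketch(path: str) -> dict:
    """Return a small metadata dict with a float 'duration' when ffprobe can determine one."""
    proc = subprocess.run(
        ["ffprobe", "-v", "error", "-print_format", "json", "-show_format", path],
        capture_output=True, text=True, check=True,
    )
    fmt = json.loads(proc.stdout).get("format", {}) or {}
    out: dict = {}
    try:
        # ffprobe reports duration as a string, e.g. "12.345"
        out["duration"] = float(fmt["duration"])
    except (KeyError, TypeError, ValueError):
        pass
    return out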
@@ -405,6 +419,21 @@ class Folder(Store):
results = []
search_dir = Path(self._location).expanduser()
def _url_like_pattern(value: str) -> str:
# Interpret user patterns as substring matches (with optional glob wildcards).
v = (value or "").strip().lower()
if not v or v == "*":
return "%"
v = v.replace("%", "\\%").replace("_", "\\_")
v = v.replace("*", "%").replace("?", "_")
if "%" not in v and "_" not in v:
return f"%{v}%"
if not v.startswith("%"):
v = "%" + v
if not v.endswith("%"):
v = v + "%"
return v
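For reference, the helper above maps user input to SQL LIKE patterns roughly as follows (a behavior sketch, assuming the backing query pairs the escaped `%`/`_` with a matching ESCAPE clause):

# Expected translations (illustrative, not part of the commit):
#   _url_like_pattern("")        -> "%"          (empty / "*" matches any URL)
#   _url_like_pattern("*")       -> "%"
#   _url_like_pattern("youtube") -> "%youtube%"  (plain text becomes a substring match)
#   _url_like_pattern("*.mp4")   -> "%.mp4%"     (glob wildcards become LIKE wildcards)
#   _url_like_pattern("a_b")     -> "%a\_b%"     (literal '_' escaped so LIKE treats it literally)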
tokens = [t.strip() for t in query.split(',') if t.strip()]
if not match_all and len(tokens) == 1 and _normalize_hash(query):
@@ -453,6 +482,8 @@ class Folder(Store):
try:
with DatabaseAPI(search_dir) as api:
if tokens and len(tokens) > 1:
url_fetch_limit = (limit or 45) * 50
def _like_pattern(term: str) -> str:
return term.replace('*', '%').replace('?', '_')
@@ -473,6 +504,11 @@ class Folder(Store):
h = api.get_file_hash_by_hash(normalized_hash)
return {h} if h else set()
if namespace == 'url':
if not pattern or pattern == '*':
return api.get_file_hashes_with_any_url(limit=url_fetch_limit)
return api.get_file_hashes_by_url_like(_url_like_pattern(pattern), limit=url_fetch_limit)
if namespace == 'store':
if pattern not in {'local', 'file', 'filesystem'}:
return set()
@@ -562,6 +598,29 @@ class Folder(Store):
if limit is not None and len(results) >= limit:
return results
return results
if namespace == "url":
if not pattern or pattern == "*":
rows = api.get_files_with_any_url(limit)
else:
rows = api.get_files_by_url_like(_url_like_pattern(pattern), limit)
for file_hash, file_path_str, size_bytes, ext in rows:
if not file_path_str:
continue
file_path = Path(file_path_str)
if not file_path.exists():
continue
if size_bytes is None:
try:
size_bytes = file_path.stat().st_size
except OSError:
size_bytes = None
tags = api.get_tags_for_file(file_hash)
entry = _create_entry(file_path, tags, size_bytes, file_hash)
results.append(entry)
if limit is not None and len(results) >= limit:
return results
return results
query_pattern = f"{namespace}:%"
rows = api.get_files_by_namespace_pattern(query_pattern, limit)
@@ -592,126 +651,59 @@ class Folder(Store):
if limit is not None and len(results) >= limit:
return results
elif not match_all:
# Strict tag-based search only (no filename/path searching).
terms = [t.strip() for t in query_lower.replace(',', ' ').split() if t.strip()]
if not terms:
terms = [query_lower]
debug(f"Performing filename/tag search for terms: {terms}")
fetch_limit = (limit or 45) * 50
conditions = ["LOWER(f.file_path) LIKE ?" for _ in terms]
params = [f"%{t}%" for t in terms]
rows = api.get_files_by_multiple_path_conditions(conditions, params, fetch_limit)
debug(f"Found {len(rows)} filename matches in DB (before whole-word filter)")
word_regex = None
if len(terms) == 1:
term = terms[0]
has_wildcard = '*' in term or '?' in term
if has_wildcard:
try:
from fnmatch import translate
word_regex = re.compile(translate(term), re.IGNORECASE)
except Exception:
word_regex = None
else:
try:
pattern = r'(?<![a-zA-Z0-9])' + re.escape(term) + r'(?![a-zA-Z0-9])'
word_regex = re.compile(pattern, re.IGNORECASE)
except Exception:
word_regex = None
seen_files = set()
for file_id, file_path_str, size_bytes, file_hash in rows:
if not file_path_str or file_path_str in seen_files:
continue
if word_regex:
p = Path(file_path_str)
if not word_regex.search(p.name):
# AND semantics across terms: each term must match at least one tag.
hits: dict[str, dict[str, Any]] = {}
for term in terms:
tag_pattern = f"%{term}%"
term_rows = api.get_files_by_namespace_pattern(tag_pattern, fetch_limit)
for file_hash, file_path_str, size_bytes, ext in term_rows:
if not file_path_str:
continue
seen_files.add(file_path_str)
file_path = Path(file_path_str)
if file_path.exists():
if size_bytes is None:
size_bytes = file_path.stat().st_size
tags = api.get_tags_for_file(file_hash)
entry = _create_entry(file_path, tags, size_bytes, file_hash)
results.append(entry)
if limit is not None and len(results) >= limit:
return results
entry = hits.get(file_hash)
if entry:
entry["count"] += 1
if size_bytes is not None:
entry["size"] = size_bytes
else:
hits[file_hash] = {
"path": file_path_str,
"size": size_bytes,
"hash": file_hash,
"count": 1,
}
if terms:
title_hits: dict[str, dict[str, Any]] = {}
for term in terms:
title_pattern = f"title:%{term}%"
title_rows = api.get_files_by_title_tag_pattern(title_pattern, fetch_limit)
for file_hash, file_path_str, size_bytes, ext in title_rows:
if not file_path_str:
continue
entry = title_hits.get(file_hash)
if entry:
entry["count"] += 1
if size_bytes is not None:
entry["size"] = size_bytes
else:
title_hits[file_hash] = {
"path": file_path_str,
"size": size_bytes,
"hash": file_hash,
"count": 1,
}
if title_hits:
required = len(terms)
for file_hash, info in title_hits.items():
if info.get("count") != required:
continue
file_path_str = info.get("path")
if not file_path_str or file_path_str in seen_files:
continue
file_path = Path(file_path_str)
if not file_path.exists():
continue
seen_files.add(file_path_str)
size_bytes = info.get("size")
if size_bytes is None:
try:
size_bytes = file_path.stat().st_size
except OSError:
size_bytes = None
tags = api.get_tags_for_file(file_hash)
entry = _create_entry(file_path, tags, size_bytes, info.get("hash"))
results.append(entry)
if limit is not None and len(results) >= limit:
return results
query_pattern = f"%{query_lower}%"
tag_rows = api.get_files_by_simple_tag_pattern(query_pattern, limit)
for file_hash, file_path_str, size_bytes, ext in tag_rows:
required = len(terms)
seen_files: set[str] = set()
for file_hash, info in hits.items():
if info.get("count") != required:
continue
file_path_str = info.get("path")
if not file_path_str or file_path_str in seen_files:
continue
seen_files.add(file_path_str)
file_path = Path(file_path_str)
if file_path.exists():
if size_bytes is None:
if not file_path.exists():
continue
seen_files.add(file_path_str)
size_bytes = info.get("size")
if size_bytes is None:
try:
size_bytes = file_path.stat().st_size
tags = api.get_tags_for_file(file_hash)
entry = _create_entry(file_path, tags, size_bytes, file_hash)
results.append(entry)
if limit is not None and len(results) >= limit:
return results
except OSError:
size_bytes = None
tags = api.get_tags_for_file(file_hash)
entry_obj = _create_entry(file_path, tags, size_bytes, info.get("hash"))
results.append(entry_obj)
if limit is not None and len(results) >= limit:
break
else:
rows = api.get_all_files(limit)
@@ -726,10 +718,8 @@ class Folder(Store):
entry = _create_entry(file_path, tags, size_bytes, file_hash)
results.append(entry)
if results:
debug(f"Returning {len(results)} results from DB")
else:
debug("No results found in DB")
backend_label = str(getattr(self, "_name", "") or getattr(self, "NAME", "") or "folder")
debug(f"[folder:{backend_label}] {len(results)} result(s)")
return results
except Exception as e:
@@ -938,9 +928,11 @@ class Folder(Store):
file_hash = file_identifier
if self._location:
try:
from metadata import normalize_urls
with API_folder_store(Path(self._location)) as db:
meta = db.get_metadata(file_hash) or {}
return list(meta.get("url") or [])
urls = normalize_urls(meta.get("url"))
return urls
except Exception as exc:
debug(f"Local DB get_metadata failed: {exc}")
return []
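The calls above treat `metadata.normalize_urls` as accepting `None`, a single string, or a list and returning a clean list of URL strings. A minimal sketch of that assumed behavior (the real helper in `metadata` may normalize more aggressively):

# Illustrative sketch of the behavior relied on above -- not the actual metadata.normalize_urls.
from typing import Any

def normalize_urls_sketch(value: Any) -> list[str]:
    """Coerce None / str / iterable of str into a de-duplicated list of stripped URLs."""
    if value is None:
        return []
    if isinstance(value, str):
        value = [value]
    out: list[str] = []
    seen: set[str] = set()
    for item in value:
        if not isinstance(item, str):
            continue
        s = item.strip()
        if s and s not in seen:
            seen.add(s)
            out.append(s)
    return out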
@@ -955,11 +947,13 @@ class Folder(Store):
file_hash = file_identifier
if self._location:
try:
from metadata import normalize_urls
with API_folder_store(Path(self._location)) as db:
meta = db.get_metadata(file_hash) or {}
existing_urls = list(meta.get("url") or [])
existing_urls = normalize_urls(meta.get("url"))
incoming_urls = normalize_urls(url)
changed = False
for u in list(url or []):
for u in list(incoming_urls or []):
if not u:
continue
if u not in existing_urls:
@@ -982,10 +976,11 @@ class Folder(Store):
file_hash = file_identifier
if self._location:
try:
from metadata import normalize_urls
with API_folder_store(Path(self._location)) as db:
meta = db.get_metadata(file_hash) or {}
existing_urls = list(meta.get("url") or [])
remove_set = {u for u in (url or []) if u}
existing_urls = normalize_urls(meta.get("url"))
remove_set = {u for u in normalize_urls(url) if u}
if not remove_set:
return False
new_urls = [u for u in existing_urls if u not in remove_set]

View File

@@ -264,6 +264,170 @@ class HydrusNetwork(Store):
debug(f"Searching Hydrus for: {query}")
def _extract_urls(meta_obj: Any) -> list[str]:
if not isinstance(meta_obj, dict):
return []
raw = meta_obj.get("url")
if raw is None:
raw = meta_obj.get("urls")
if isinstance(raw, str):
val = raw.strip()
return [val] if val else []
if isinstance(raw, list):
out: list[str] = []
for item in raw:
if not isinstance(item, str):
continue
s = item.strip()
if s:
out.append(s)
return out
return []
def _iter_url_filtered_metadata(url_value: str | None, want_any: bool, fetch_limit: int) -> list[dict[str, Any]]:
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
# First try a fast system predicate if Hydrus supports it.
candidate_file_ids: list[int] = []
try:
if want_any:
predicate = "system:has url"
url_search = client.search_files(
tags=[predicate],
return_hashes=False,
return_file_ids=True,
return_file_count=False,
)
ids = url_search.get("file_ids", []) if isinstance(url_search, dict) else []
if isinstance(ids, list):
candidate_file_ids = [int(x) for x in ids if isinstance(x, (int, float, str)) and str(x).strip().isdigit()]
except Exception:
candidate_file_ids = []
if not candidate_file_ids:
# Fallback: scan from system:everything and filter by URL substring.
everything = client.search_files(
tags=["system:everything"],
return_hashes=False,
return_file_ids=True,
return_file_count=False,
)
ids = everything.get("file_ids", []) if isinstance(everything, dict) else []
if isinstance(ids, list):
candidate_file_ids = [int(x) for x in ids if isinstance(x, (int, float))]
if not candidate_file_ids:
return []
needle = (url_value or "").strip().lower()
chunk_size = 200
out: list[dict[str, Any]] = []
for start in range(0, len(candidate_file_ids), chunk_size):
if len(out) >= fetch_limit:
break
chunk = candidate_file_ids[start : start + chunk_size]
try:
payload = client.fetch_file_metadata(
file_ids=chunk,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
except Exception:
continue
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
if not isinstance(metas, list):
continue
for meta in metas:
if not isinstance(meta, dict):
continue
urls = _extract_urls(meta)
if not urls:
continue
if want_any:
out.append(meta)
if len(out) >= fetch_limit:
break
continue
if not needle:
continue
if any(needle in u.lower() for u in urls):
out.append(meta)
if len(out) >= fetch_limit:
break
return out
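The two modes of the scanner above correspond to the query forms handled below; as a sketch of how it ends up being called (local names here are just for illustration):

# url:*       -> any file that has at least one known URL
metas_any = _iter_url_filtered_metadata(None, want_any=True, fetch_limit=100)
# url:youtube -> files whose stored URLs contain the substring "youtube"
metas_sub = _iter_url_filtered_metadata("youtube", want_any=False, fetch_limit=100)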
query_lower = query.lower().strip()
# Special case: url:* and url:<value>
metadata_list: list[dict[str, Any]] | None = None
if ":" in query_lower and not query_lower.startswith(":"):
namespace, pattern = query_lower.split(":", 1)
namespace = namespace.strip().lower()
pattern = pattern.strip()
if namespace == "url":
if not pattern or pattern == "*":
metadata_list = _iter_url_filtered_metadata(None, want_any=True, fetch_limit=int(limit) if limit else 100)
else:
# Fast-path: exact URL via /add_url/get_url_files when a full URL is provided.
try:
if pattern.startswith("http://") or pattern.startswith("https://"):
from API.HydrusNetwork import HydrusRequestSpec
spec = HydrusRequestSpec(method="GET", endpoint="/add_url/get_url_files", query={"url": pattern})
response = client._perform_request(spec) # type: ignore[attr-defined]
hashes: list[str] = []
file_ids: list[int] = []
if isinstance(response, dict):
raw_hashes = response.get("hashes") or response.get("file_hashes")
if isinstance(raw_hashes, list):
hashes = [str(h).strip() for h in raw_hashes if isinstance(h, str) and str(h).strip()]
raw_ids = response.get("file_ids")
if isinstance(raw_ids, list):
for item in raw_ids:
try:
file_ids.append(int(item))
except (TypeError, ValueError):
continue
if file_ids:
payload = client.fetch_file_metadata(
file_ids=file_ids,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
if isinstance(metas, list):
metadata_list = [m for m in metas if isinstance(m, dict)]
elif hashes:
payload = client.fetch_file_metadata(
hashes=hashes,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
if isinstance(metas, list):
metadata_list = [m for m in metas if isinstance(m, dict)]
except Exception:
metadata_list = None
# Fallback: substring scan
if metadata_list is None:
metadata_list = _iter_url_filtered_metadata(pattern, want_any=False, fetch_limit=int(limit) if limit else 100)
# Parse the query into tags
# Handle both simple tags and complex queries
# "*" means "match all" - use system:everything tag in Hydrus
@@ -271,7 +435,6 @@ class HydrusNetwork(Store):
# Use system:everything to match all files in Hydrus
tags = ["system:everything"]
else:
query_lower = query.lower().strip()
# If query doesn't have a namespace (no ':'), search all files and filter by title/tags
# If query has explicit namespace, use it as a tag search
if ':' not in query_lower:
@@ -286,30 +449,36 @@ class HydrusNetwork(Store):
debug(f"Found 0 result(s)")
return []
# Search files with the tags
search_result = client.search_files(
tags=tags,
return_hashes=True,
return_file_ids=True
)
# Extract file IDs from search result
file_ids = search_result.get("file_ids", [])
hashes = search_result.get("hashes", [])
if not file_ids and not hashes:
debug(f"Found 0 result(s)")
return []
# Fetch metadata for the found files
# Search files with the tags (unless url: search already produced metadata)
results = []
query_lower = query.lower().strip()
# Split by comma or space for AND logic
search_terms = set(query_lower.replace(',', ' ').split()) # For substring matching
if file_ids:
metadata = client.fetch_file_metadata(file_ids=file_ids)
metadata_list = metadata.get("metadata", [])
if metadata_list is None:
search_result = client.search_files(
tags=tags,
return_hashes=True,
return_file_ids=True
)
file_ids = search_result.get("file_ids", []) if isinstance(search_result, dict) else []
hashes = search_result.get("hashes", []) if isinstance(search_result, dict) else []
if not file_ids and not hashes:
debug(f"Found 0 result(s)")
return []
if file_ids:
metadata = client.fetch_file_metadata(file_ids=file_ids)
metadata_list = metadata.get("metadata", [])
elif hashes:
metadata = client.fetch_file_metadata(hashes=hashes)
metadata_list = metadata.get("metadata", [])
else:
metadata_list = []
if not isinstance(metadata_list, list):
metadata_list = []
for meta in metadata_list:
if len(results) >= limit:

View File

@@ -119,6 +119,37 @@ class Store:
self._backend_errors: Dict[str, str] = {}
self._load_backends()
def _maybe_register_temp_alias(self, store_type: str, backend_name: str, kwargs: Dict[str, Any], backend: BaseStore) -> None:
"""If a folder backend points at config['temp'], also expose it as the 'temp' backend.
This keeps config compatibility (e.g. existing 'default') while presenting the temp
directory under a clearer name.
"""
try:
if _normalize_store_type(store_type) != "folder":
return
temp_value = self._config.get("temp")
if not temp_value:
return
path_value = kwargs.get("PATH") or kwargs.get("path")
if not path_value:
return
temp_path = Path(str(temp_value)).expanduser().resolve()
backend_path = Path(str(path_value)).expanduser().resolve()
if backend_path != temp_path:
return
# If the user already has a dedicated temp backend, do nothing.
if "temp" in self._backends:
return
# Keep original name working, but add an alias.
if backend_name != "temp":
self._backends["temp"] = backend
except Exception:
return
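A sketch of the config shape this alias logic expects (keys and nesting here are hypothetical, matching only the lookups visible above): a folder backend whose PATH resolves to the top-level `temp` entry also becomes reachable as the `temp` backend.

# Hypothetical config illustrating when the alias is registered -- not from the repository.
config = {
    "temp": "~/downloads/tmp",
    "store": {
        "default": {                      # folder backend named "default"
            "PATH": "~/downloads/tmp",    # resolves to the same directory as config["temp"]
        },
    },
}
# After _load_backends(), both names point at the same backend instance:
#   store["default"] is store["temp"]  ->  True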
def _load_backends(self) -> None:
store_cfg = self._config.get("store")
if not isinstance(store_cfg, dict):
@@ -161,6 +192,9 @@ class Store:
backend_name = str(kwargs.get("NAME") or instance_name)
self._backends[backend_name] = backend
# If this is the configured temp directory, also alias it as 'temp'.
self._maybe_register_temp_alias(store_type, backend_name, kwargs, backend)
except Exception as exc:
err_text = str(exc)
self._backend_errors[str(instance_name)] = err_text
@@ -177,11 +211,24 @@ class Store:
return sorted(self._backends.keys())
def list_searchable_backends(self) -> list[str]:
searchable: list[str] = []
# De-duplicate backends by instance (aliases can point at the same object).
def _rank(name: str) -> int:
n = str(name or "").strip().lower()
if n == "temp":
return 0
if n == "default":
return 2
return 1
chosen: Dict[int, str] = {}
for name, backend in self._backends.items():
if type(backend).search is not BaseStore.search:
searchable.append(name)
return sorted(searchable)
if type(backend).search is BaseStore.search:
continue
key = id(backend)
prev = chosen.get(key)
if prev is None or _rank(name) < _rank(prev):
chosen[key] = name
return sorted(chosen.values())
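The ranking means that when several names alias the same backend object, 'temp' is preferred and 'default' is the last resort; an illustrative walkthrough (names here are hypothetical):

# With self._backends == {"default": b, "temp": b, "archive": c} (b and c both searchable),
# the de-duplication keeps one name per backend instance:
#   id(b): "temp" wins over "default"  (_rank 0 < 2)
#   id(c): "archive"                   (_rank 1)
# list_searchable_backends() -> ["archive", "temp"]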
def __getitem__(self, backend_name: str) -> BaseStore:
if backend_name not in self._backends: