Files
Medios-Macina/helper/query_parser.py

160 lines
4.9 KiB
Python
Raw Normal View History

2025-11-25 20:09:33 -08:00
"""Dynamic query parser for filtering and field extraction.
Supports query syntax such as:

- isbn:0557677203
- author:"Albert Pike"
- title:"Morals and Dogma"
- year:2010
- Multiple fields: isbn:0557677203 author:"Albert Pike"
- Fields mixed with free text: "Morals" isbn:0557677203

Query strings in this form can be parsed by any search provider to
extract specific fields for filtering and searching.
"""
from typing import Dict, List, Tuple, Optional, Any
import re
def parse_query(query: str) -> Dict[str, Any]:
    """Split a query string into structured ``field:value`` pairs and free text.

    A field token is a run of word characters, a colon, and then either a
    double-quoted value (quotes are dropped) or a run of non-whitespace
    characters. Everything that is not part of a field token is treated as
    free text.

    Args:
        query: Query string like 'isbn:0557677203 author:"Albert Pike" Morals'

    Returns:
        Dictionary with:
        - 'fields': Dict[field_name, field_value]; names are lower-cased and
          a repeated field keeps the value of its last occurrence
        - 'text': str with the leftover free-text pieces joined by one space
        - 'raw': the query exactly as it was passed in (not stripped)
    """
    parsed: Dict[str, Any] = {
        'fields': {},
        'text': '',
        'raw': query,
    }

    trimmed = query.strip() if query else ''
    if not trimmed:
        # Empty or whitespace-only input: nothing to extract.
        return parsed

    # word, colon, then a "quoted value" or a single unquoted token.
    token_re = r'(\w+):(?:"([^"]*)"|(\S+))'

    fields = parsed['fields']
    gaps = []  # slices of text lying between/around the field tokens
    cursor = 0
    for token in re.finditer(token_re, trimmed):
        gaps.append(trimmed[cursor:token.start()])
        name, quoted, bare = token.groups()
        # An empty quoted value ("") is still a value, so test against None.
        fields[name.lower()] = bare if quoted is None else quoted
        cursor = token.end()
    gaps.append(trimmed[cursor:])

    # Drop gaps that are empty once surrounding whitespace is removed.
    parsed['text'] = ' '.join(
        piece for piece in (gap.strip() for gap in gaps) if piece
    )
    return parsed
def get_field(parsed_query: Dict[str, Any], field_name: str, default: Optional[str] = None) -> Optional[str]:
    """Look up one structured field in a parsed query.

    Args:
        parsed_query: Result from parse_query()
        field_name: Name of the field; it is lower-cased before the lookup
        default: Value handed back when the field is absent

    Returns:
        The stored field value, or ``default`` when there is no such field
        (or no 'fields' entry at all)
    """
    known_fields = parsed_query.get('fields', {})
    lookup_key = field_name.lower()
    return known_fields.get(lookup_key, default)
def has_field(parsed_query: Dict[str, Any], field_name: str) -> bool:
    """Report whether a parsed query contains the given structured field.

    Args:
        parsed_query: Result from parse_query()
        field_name: Name of the field; it is lower-cased before the check

    Returns:
        True when the field is present, False otherwise (including when the
        parsed query has no 'fields' entry)
    """
    lookup_key = field_name.lower()
    return lookup_key in parsed_query.get('fields', {})
def get_free_text(parsed_query: Dict[str, Any]) -> str:
    """Return the part of a parsed query that is not a structured field.

    Args:
        parsed_query: Result from parse_query()

    Returns:
        The value stored under 'text', or an empty string when that key
        is missing
    """
    free_text = parsed_query.get('text', '')
    return free_text
def build_query_for_provider(
    parsed_query: Dict[str, Any],
    provider: str,
    extraction_map: Optional[Dict[str, str]] = None
) -> Tuple[str, Dict[str, str]]:
    """Produce the search text and a provider-keyed filter dict.

    Every field listed in ``extraction_map`` that is present in the parsed
    query is copied into the result under the provider-specific key. The
    free text of the query is returned unchanged as the search string.

    Args:
        parsed_query: Result from parse_query()
        provider: Provider name ('libgen', 'openlibrary', 'soulseek').
            Accepted for the caller's convenience; this function does not
            currently branch on it.
        extraction_map: Optional mapping of field names to provider-specific names
            e.g. {'isbn': 'isbn', 'author': 'author', 'title': 'title'}.
            ``None`` (or an empty mapping) extracts nothing.

    Returns:
        Tuple of (search_query: str, extracted_fields: Dict[field, value])
    """
    field_to_key = extraction_map or {}
    search_text = get_free_text(parsed_query)

    # Keep only the mapped fields that the query actually supplied.
    provider_fields = {
        provider_key: get_field(parsed_query, field_name)
        for field_name, provider_key in field_to_key.items()
        if has_field(parsed_query, field_name)
    }

    # No provider-specific query syntax is applied here; providers that
    # need it can build on the returned pieces.
    return search_text, provider_fields
if __name__ == '__main__':
    # Manual smoke run: parse a handful of sample queries and show the result.
    sample_queries = (
        'isbn:0557677203',
        'isbn:0557677203 author:"Albert Pike"',
        'Morals and Dogma isbn:0557677203',
        'title:"Morals and Dogma" author:"Albert Pike" year:2010',
        'search term without fields',
        'author:"John Smith" title:"A Book"',
    )
    for sample in sample_queries:
        print(f"\nQuery: {sample}")
        outcome = parse_query(sample)
        print(f" Fields: {outcome['fields']}")
        print(f" Text: {outcome['text']}")