160 lines
4.9 KiB
Python
160 lines
4.9 KiB
Python
"""Dynamic query parser for filtering and field extraction.
|
|
|
|
Supports query syntax like:
|
|
- isbn:0557677203
|
|
- author:"Albert Pike"
|
|
- title:"Morals and Dogma"
|
|
- year:2010
|
|
- isbn:0557677203 author:"Albert Pike"
|
|
- Mixed with free text: "Morals" isbn:0557677203
|
|
|
|
This allows flexible query strings that can be parsed by any search provider
|
|
to extract specific fields for filtering and searching.
|
|
"""
|
|
|
|
from typing import Dict, List, Tuple, Optional, Any
|
|
import re
|
|
|
|
|
|
def parse_query(query: str) -> Dict[str, Any]:
    """Split a query string into structured ``field:value`` pairs and free text.

    A field token is a run of word characters, a colon, and then either a
    double-quoted value (which may contain spaces) or an unquoted run of
    non-whitespace characters. Everything outside such tokens is free text.

    Args:
        query: Query string like 'isbn:0557677203 author:"Albert Pike" Morals'

    Returns:
        Dictionary with:
        - 'fields': Dict[field_name, field_value]; names are lower-cased and
          a later occurrence of the same field replaces an earlier one
        - 'text': str with the free-text pieces joined by single spaces
        - 'raw': the query exactly as it was passed in
    """
    fields: Dict[str, str] = {}
    parsed: Dict[str, Any] = {
        'fields': fields,
        'text': '',
        'raw': query,
    }

    # Empty, None, and whitespace-only queries carry no fields and no text.
    stripped = query.strip() if query else ''
    if not stripped:
        return parsed

    # group 1 = field name, group 2 = quoted value, group 3 = unquoted value
    token_re = r'(\w+):(?:"([^"]*)"|(\S+))'

    free_chunks = []
    cursor = 0
    for token in re.finditer(token_re, stripped):
        name, quoted, bare = token.groups()
        # Whatever lies between the previous token and this one is free text.
        free_chunks.append(stripped[cursor:token.start()])
        # An empty quoted value ("") is still a value, hence the None test.
        fields[name.lower()] = bare if quoted is None else quoted
        cursor = token.end()

    # Trailing free text after the final token (or the whole query when
    # no token matched at all).
    free_chunks.append(stripped[cursor:])

    trimmed = (chunk.strip() for chunk in free_chunks)
    parsed['text'] = ' '.join(piece for piece in trimmed if piece)
    return parsed
|
|
|
|
|
|
def get_field(parsed_query: Dict[str, Any], field_name: str, default: Optional[str] = None) -> Optional[str]:
    """Look up one structured field in a parsed query.

    Args:
        parsed_query: Result from parse_query()
        field_name: Field to fetch; it is lower-cased before the lookup
        default: Value handed back when the field is absent

    Returns:
        The stored field value, or ``default`` when the field (or the whole
        'fields' entry) is missing
    """
    known_fields = parsed_query.get('fields', {})
    key = field_name.lower()
    return known_fields.get(key, default)
|
|
|
|
|
|
def has_field(parsed_query: Dict[str, Any], field_name: str) -> bool:
    """Report whether a parsed query carries a given structured field.

    Args:
        parsed_query: Result from parse_query()
        field_name: Field to test for; it is lower-cased before the check

    Returns:
        True when the field is present under 'fields', otherwise False
        (including when 'fields' itself is missing)
    """
    key = field_name.lower()
    known_fields = parsed_query.get('fields', {})
    return key in known_fields
|
|
|
|
|
|
def get_free_text(parsed_query: Dict[str, Any]) -> str:
    """Return the unstructured part of a parsed query.

    Args:
        parsed_query: Result from parse_query()

    Returns:
        The value stored under 'text', or an empty string when that key
        is absent
    """
    free_text = parsed_query.get('text', '')
    return free_text
|
|
|
|
|
|
def build_query_for_provider(
    parsed_query: Dict[str, Any],
    provider: str,
    extraction_map: Optional[Dict[str, str]] = None
) -> Tuple[str, Dict[str, str]]:
    """Produce the search text and provider-keyed filters for a parsed query.

    Fields named in ``extraction_map`` that are present in the parsed query
    are copied into the result under their provider-specific key; fields
    not listed in the map are left out.

    Args:
        parsed_query: Result from parse_query()
        provider: Provider name ('libgen', 'openlibrary', 'soulseek').
            This function does not currently branch on it.
        extraction_map: Optional mapping of field names to provider-specific names
            e.g. {'isbn': 'isbn', 'author': 'author', 'title': 'title'}.
            None or an empty mapping extracts nothing.

    Returns:
        Tuple of (search_query: str, extracted_fields: Dict[field, value])
    """
    search_text = get_free_text(parsed_query)
    mapping = extraction_map if extraction_map else {}

    # Copy only the mapped fields that the query actually contains.
    provider_fields = {
        target_key: get_field(parsed_query, source_field)
        for source_field, target_key in mapping.items()
        if has_field(parsed_query, source_field)
    }

    # Provider-specific query syntax is left to the providers themselves;
    # the free text is returned unchanged as the search query.
    return search_text, provider_fields
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke run: parse a handful of sample queries and print what
    # parse_query() extracted from each one.
    test_queries = [
        'isbn:0557677203',
        'isbn:0557677203 author:"Albert Pike"',
        'Morals and Dogma isbn:0557677203',
        'title:"Morals and Dogma" author:"Albert Pike" year:2010',
        'search term without fields',
        'author:"John Smith" title:"A Book"',
    ]

    for query in test_queries:
        print(f"\nQuery: {query}")
        parsed = parse_query(query)
        # One call, newline-separated: same output as two consecutive prints.
        print(
            f" Fields: {parsed['fields']}",
            f" Text: {parsed['text']}",
            sep='\n',
        )
|