Files
delphi-database/app/api/search_highlight.py
2025-08-14 19:16:28 -05:00

166 lines
5.6 KiB
Python

"""
Server-side highlight utilities for search results.
These functions generate HTML snippets with <strong> around matched tokens,
preserving the original casing of the source text. All non-HTML segments are
HTML-escaped server-side to prevent injection. Only the <strong> tags added by
this module are emitted as HTML; any pre-existing HTML in source text is
escaped.
"""
from typing import List, Tuple, Any
import re
def build_query_tokens(query: str) -> List[str]:
    """Split a search query into de-duplicated alphanumeric tokens.

    Runs of commas, underscores, semicolons and colons are treated like
    whitespace. Each whitespace-separated part then has its leading and
    trailing non-alphanumeric characters trimmed; characters inside a part
    (for example the hyphen in ``smith-jones``) are kept. Duplicates are
    dropped case-insensitively, keeping the first spelling seen and the
    original order.

    Args:
        query: Raw query text. Falsy values yield an empty list; anything
            else is converted with ``str()``.

    Returns:
        The cleaned tokens, in their original casing and order.
    """
    if not query:
        return []
    raw_parts = re.sub(r"[,_;:]+", " ", str(query).strip()).split()
    cleaned: List[str] = []
    seen = set()
    for part in raw_parts:
        # ``\W`` is Unicode-aware for ``str`` patterns, so letters such as the
        # accented ones in "José" or "Émile" survive at a token's edge instead
        # of being trimmed like punctuation. ``_`` is listed explicitly
        # because ``\w`` would otherwise accept it.
        token = re.sub(r"^[\W_]+|[\W_]+$", "", part)
        lowered = token.lower()
        if token and lowered not in seen:
            cleaned.append(token)
            seen.add(lowered)
    return cleaned
def _merge_ranges(ranges: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Merge overlapping or touching ``(start, end)`` ranges.

    Ranges are processed in ascending ``(start, end)`` order. A range whose
    start is less than or equal to the end of the run being built is folded
    into that run, so touching ranges such as ``(0, 2)`` and ``(2, 4)``
    become ``(0, 4)``.

    Args:
        ranges: Ranges in any order. The list is not modified.

    Returns:
        A new list of merged ranges sorted by start; ``[]`` for empty input.
    """
    if not ranges:
        return []
    # sorted() instead of list.sort(): the caller's list keeps its order.
    ordered = sorted(ranges, key=lambda x: (x[0], x[1]))
    merged: List[Tuple[int, int]] = []
    cur_s, cur_e = ordered[0]
    for s, e in ordered[1:]:
        if s <= cur_e:
            # Overlaps or touches the current run: extend it if needed.
            cur_e = max(cur_e, e)
        else:
            merged.append((cur_s, cur_e))
            cur_s, cur_e = s, e
    merged.append((cur_s, cur_e))
    return merged
def highlight_text(value: str, tokens: List[str]) -> str:
    """Return `value` as HTML with matches of `tokens` wrapped in <strong>.

    Matching is case-insensitive and the original casing of `value` is kept.
    Every piece of `value`, highlighted or not, is HTML-escaped; the only
    markup emitted is the <strong> wrappers added here. Overlapping or
    touching matches are merged into a single <strong> run.

    Args:
        value: Source text. ``None`` yields ``""``; other values are
            converted with ``str()``.
        tokens: Needles to highlight. Falsy entries are ignored and
            duplicates are collapsed case-insensitively.

    Returns:
        The escaped text with <strong> around each matched span. When there
        are no tokens or no matches, the escaped text alone.
    """
    if value is None:
        return ""

    def _escape_html(text: str) -> str:
        # Minimal HTML escaping. The ampersand is replaced first so the
        # entities produced by the later replacements are not escaped again.
        text = str(text)
        text = text.replace("&", "&amp;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        text = text.replace('"', "&quot;")
        text = text.replace("'", "&#39;")
        return text

    def _fold(text: str) -> str:
        # Lowercase `text` without changing its length. str.lower() can
        # expand one character into several (U+0130 becomes "i" + U+0307),
        # which would shift every later match index relative to the source.
        # In that case lowercase per character and keep expanding characters
        # unchanged, so indices map 1:1 onto the original text.
        lowered = text.lower()
        if len(lowered) == len(text):
            return lowered
        return "".join(
            low if len(low) == 1 else ch
            for ch, low in ((c, c.lower()) for c in text)
        )

    source = str(value)
    if not source or not tokens:
        return _escape_html(source)

    haystack = _fold(source)

    # Deduplicate needles case-insensitively, keeping first-seen order, to
    # avoid redundant scans (parity with client).
    folded = (_fold(str(t or "")) for t in tokens)
    unique_needles = list(dict.fromkeys(n for n in folded if n))

    ranges: List[Tuple[int, int]] = []
    for needle in unique_needles:
        idx = haystack.find(needle)
        while idx != -1:
            ranges.append((idx, idx + len(needle)))
            # Step by one, not len(needle), so overlapping occurrences are
            # also recorded; they are merged below.
            idx = haystack.find(needle, idx + 1)
    if not ranges:
        return _escape_html(source)

    parts: List[str] = []
    pos = 0
    for s, e in _merge_ranges(ranges):
        if pos < s:
            parts.append(_escape_html(source[pos:s]))
        parts.append("<strong>" + _escape_html(source[s:e]) + "</strong>")
        pos = e
    if pos < len(source):
        parts.append(_escape_html(source[pos:]))
    return "".join(parts)
def create_customer_highlight(customer: Any, query: str) -> str:
    """Build a labelled, highlighted snippet for a customer search hit.

    The name (``first`` + ``last``), ``email`` and ``city`` attributes of
    `customer` are checked in that order. The first one that contains the
    whole lowercased `query` as a substring is returned with the query
    tokens highlighted.

    Args:
        customer: Object read via ``getattr`` for ``first``, ``last``,
            ``email`` and ``city``; missing attributes are tolerated.
        query: Raw search text.

    Returns:
        ``"Name: ..."``, ``"Email: ..."`` or ``"City: ..."`` with highlight
        markup, or ``""`` when `query` is empty or no field contains it.
    """
    if not query:
        return ""
    tokens = build_query_tokens(query)
    # Both name parts fall back to "" so a None last name is not rendered as
    # the literal text "None" (which would also match the query "none").
    first = getattr(customer, 'first', '') or ''
    last = getattr(customer, 'last', '') or ''
    full_name = f"{first} {last}".strip()
    email = getattr(customer, 'email', None)
    city = getattr(customer, 'city', None)
    ql = query.lower()
    for label, field in (("Name", full_name), ("Email", email), ("City", city)):
        if not field:
            continue
        text = str(field)
        if ql in text.lower():
            return f"{label}: {highlight_text(text, tokens)}"
    return ""
def create_file_highlight(file_obj: Any, query: str) -> str:
    """Build a labelled, highlighted snippet for a file search hit.

    The ``regarding`` attribute is checked first, then ``file_type``. The
    first one that contains the whole lowercased `query` as a substring is
    returned with the query tokens highlighted.

    Args:
        file_obj: Object read via ``getattr`` for ``regarding`` and
            ``file_type``; missing attributes are tolerated.
        query: Raw search text.

    Returns:
        ``"Matter: ..."`` or ``"Type: ..."`` with highlight markup, or ``""``
        when `query` is empty or neither field contains it.
    """
    if not query:
        return ""
    needles = build_query_tokens(query)
    labelled_fields = (
        ("Matter", getattr(file_obj, 'regarding', None)),
        ("Type", getattr(file_obj, 'file_type', None)),
    )
    lowered_query = query.lower()
    for label, raw in labelled_fields:
        if not raw:
            continue
        text = str(raw)
        if lowered_query in text.lower():
            return f"{label}: {highlight_text(text, needles)}"
    return ""
def create_ledger_highlight(ledger: Any, query: str) -> str:
    """Build a highlighted preview of a ledger note for a search hit.

    The ``note`` attribute must contain the whole lowercased `query` as a
    substring. The preview is the first 160 characters of the note with the
    query tokens highlighted, followed by ``...`` when the note is longer.

    Args:
        ledger: Object read via ``getattr`` for ``note``; a missing
            attribute is tolerated.
        query: Raw search text.

    Returns:
        ``"Note: ..."`` with highlight markup, or ``""`` when `query` is
        empty, the note is empty, or the note does not contain the query.
    """
    if not query:
        return ""
    needles = build_query_tokens(query)
    note = getattr(ledger, 'note', None)
    if not note:
        return ""
    body = str(note)
    if query.lower() not in body.lower():
        return ""
    limit = 160
    # Only the truncated preview is highlighted; the marker is appended
    # outside the highlighted text.
    ellipsis = "..." if len(body) > limit else ""
    return f"Note: {highlight_text(body[:limit], needles)}{ellipsis}"
def create_qdro_highlight(qdro: Any, query: str) -> str:
    """Build a labelled, highlighted snippet for a QDRO search hit.

    The ``form_name``, ``pet`` and ``case_number`` attributes are checked in
    that order. The first one that contains the whole lowercased `query` as
    a substring is returned with the query tokens highlighted.

    Args:
        qdro: Object read via ``getattr`` for ``form_name``, ``pet`` and
            ``case_number``; missing attributes are tolerated.
        query: Raw search text.

    Returns:
        ``"Form: ..."``, ``"Petitioner: ..."`` or ``"Case: ..."`` with
        highlight markup, or ``""`` when `query` is empty or no field
        contains it.
    """
    if not query:
        return ""
    needles = build_query_tokens(query)
    labelled_fields = (
        ("Form", getattr(qdro, 'form_name', None)),
        ("Petitioner", getattr(qdro, 'pet', None)),
        ("Case", getattr(qdro, 'case_number', None)),
    )
    lowered_query = query.lower()
    for label, raw in labelled_fields:
        if not raw:
            continue
        text = str(raw)
        if lowered_query in text.lower():
            return f"{label}: {highlight_text(text, needles)}"
    return ""