# delphi-database/app/api/search_highlight.py

"""
Server-side highlight utilities for search results.

These functions generate HTML snippets with <strong> around matched tokens,
preserving the original casing of the source text. All non-HTML segments are
HTML-escaped server-side to prevent injection. Only the <strong> tags added by
this module are emitted as HTML; any pre-existing HTML in source text is
escaped.
"""
from typing import List, Tuple, Any
import re


def build_query_tokens(query: str) -> List[str]:
    """Tokenize a search query for highlighting.

    Commas, underscores, semicolons and colons are treated as separators in
    addition to whitespace. Each resulting piece has leading and trailing
    characters outside ``[A-Za-z0-9]`` removed; pieces that end up empty are
    dropped. Duplicates are removed case-insensitively, keeping the first
    spelling encountered and the original order.

    Returns an empty list for a falsy query.
    """
    if not query:
        return []
    normalized = re.sub(r"[,_;:]+", " ", str(query).strip())
    # Keyed by the lowercased token; dict insertion order keeps query order and
    # setdefault keeps the first spelling seen for each key.
    first_spelling = {}
    for piece in normalized.split():
        trimmed = re.sub(r"^[^A-Za-z0-9]+|[^A-Za-z0-9]+$", "", piece)
        if trimmed:
            first_spelling.setdefault(trimmed.lower(), trimmed)
    return list(first_spelling.values())


def _merge_ranges(ranges: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    if not ranges:
        return []
    ranges.sort(key=lambda x: (x[0], x[1]))
    merged: List[Tuple[int, int]] = []
    cur_s, cur_e = ranges[0]
    for s, e in ranges[1:]:
        if s <= cur_e:
            cur_e = max(cur_e, e)
        else:
            merged.append((cur_s, cur_e))
            cur_s, cur_e = s, e
    merged.append((cur_s, cur_e))
    return merged


def highlight_text(value: str, tokens: List[str]) -> str:
    """Return `value` with case-insensitive matches of `tokens` wrapped in <strong>, preserving original casing.

    Non-highlighted segments and the highlighted text content are HTML-escaped.
    Only the surrounding <strong> wrappers are emitted as markup.

    Every occurrence of every token is found (including overlapping ones) and
    overlapping or touching matches are merged into a single <strong> span.
    Tokens are compared lowercased and deduplicated; empty tokens are ignored.

    Returns "" when `value` is None, and the escaped text unchanged when there
    are no tokens or no matches.
    """
    if value is None:
        return ""

    def _escape_html(text: str) -> str:
        # Minimal, safe HTML escaping. Ampersand is replaced first so the
        # entities produced by the later replacements are not escaped again.
        if text is None:
            return ""
        text = str(text)
        text = text.replace("&", "&amp;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        text = text.replace('"', "&quot;")
        text = text.replace("'", "&#39;")
        return text

    source = str(value)
    if not source or not tokens:
        return _escape_html(source)

    haystack = source.lower()
    # Lowercasing can lengthen a string (e.g. U+0130 lowercases to two code
    # points), which would shift every match offset relative to `source`.
    # When that happens, rebuild the haystack character by character and keep,
    # for each haystack position, the index of the source character it came
    # from, so matches can be translated back to source offsets.
    origin: List[int] = []
    if len(haystack) != len(source):
        pieces = [ch.lower() for ch in source]
        haystack = "".join(pieces)
        for src_idx, piece in enumerate(pieces):
            origin.extend([src_idx] * len(piece))

    ranges: List[Tuple[int, int]] = []
    # Deduplicate tokens case-insensitively to avoid redundant scans (parity with client)
    seen_needles = set()
    for t in tokens:
        needle = str(t or "").lower()
        if not needle or needle in seen_needles:
            continue
        seen_needles.add(needle)
        idx = haystack.find(needle)
        while idx != -1:
            end = idx + len(needle)
            if origin:
                # Cover every source character that contributed to the match.
                ranges.append((origin[idx], origin[end - 1] + 1))
            else:
                ranges.append((idx, end))
            # Advance by one (not by len(needle)) so overlapping hits are found.
            idx = haystack.find(needle, idx + 1)
    if not ranges:
        return _escape_html(source)

    parts: List[str] = []
    pos = 0
    for s, e in _merge_ranges(ranges):
        if pos < s:
            parts.append(_escape_html(source[pos:s]))
        parts.append("<strong>" + _escape_html(source[s:e]) + "</strong>")
        pos = e
    if pos < len(source):
        parts.append(_escape_html(source[pos:]))
    return "".join(parts)


def create_customer_highlight(customer: Any, query: str) -> str:
    """Build a labelled, highlighted snippet for a customer search hit.

    The lowercased `query` is looked for as a substring of, in order, the
    customer's full name (``first`` + ``last``), ``email`` and ``city``. The
    first field that contains it is returned as ``"Name: ..."``,
    ``"Email: ..."`` or ``"City: ..."`` with the query tokens wrapped in
    <strong> by `highlight_text`.

    Returns "" when `query` is falsy or no field contains it.
    """
    if not query:
        return ""
    tokens = build_query_tokens(query)
    # A missing or None name part is treated as empty so the literal text
    # "None" never ends up in the displayed (or matched) name.
    first = getattr(customer, 'first', '') or ''
    last = getattr(customer, 'last', '') or ''
    full_name = f"{first} {last}".strip()
    email = getattr(customer, 'email', None)
    city = getattr(customer, 'city', None)
    ql = query.lower()

    if full_name and ql in full_name.lower():
        return f"Name: {highlight_text(full_name, tokens)}"
    if email and ql in str(email).lower():
        return f"Email: {highlight_text(str(email), tokens)}"
    if city and ql in str(city).lower():
        return f"City: {highlight_text(str(city), tokens)}"
    return ""


def create_file_highlight(file_obj: Any, query: str) -> str:
    """Build a labelled, highlighted snippet for a file search hit.

    Checks ``regarding`` (label ``Matter``) and then ``file_type`` (label
    ``Type``); the first non-empty field whose lowercased text contains the
    lowercased `query` is returned as ``"<label>: <highlighted text>"``.

    Returns "" when `query` is falsy or neither field contains it.
    """
    if not query:
        return ""
    tokens = build_query_tokens(query)
    # Attributes are read up front; the tuple order is the match priority.
    candidates = (
        ("Matter", getattr(file_obj, 'regarding', None)),
        ("Type", getattr(file_obj, 'file_type', None)),
    )
    needle = query.lower()
    for label, raw in candidates:
        if not raw:
            continue
        text = str(raw)
        if needle in text.lower():
            return f"{label}: {highlight_text(text, tokens)}"
    return ""


def create_ledger_highlight(ledger: Any, query: str) -> str:
    """Build a highlighted ``"Note: ..."`` snippet for a ledger search hit.

    The ledger's ``note`` must contain the lowercased `query` as a substring
    (case-insensitively). Only the first 160 characters of the note are
    highlighted and returned; ``...`` is appended when the note is longer.

    Returns "" when `query` is falsy, the note is empty, or it does not
    contain the query.
    """
    if not query:
        return ""
    tokens = build_query_tokens(query)
    note = getattr(ledger, 'note', None)
    if not note or query.lower() not in str(note).lower():
        return ""
    text = str(note)
    limit = 160
    if len(text) > limit:
        suffix = "..."
    else:
        suffix = ""
    # Truncate before highlighting so the cut never lands inside a <strong> tag.
    highlighted = highlight_text(text[:limit], tokens)
    return f"Note: {highlighted}{suffix}"


def create_qdro_highlight(qdro: Any, query: str) -> str:
    """Build a labelled, highlighted snippet for a QDRO search hit.

    Checks, in order, ``form_name`` (label ``Form``), ``pet`` (label
    ``Petitioner``) and ``case_number`` (label ``Case``); the first non-empty
    field whose lowercased text contains the lowercased `query` is returned as
    ``"<label>: <highlighted text>"``.

    Returns "" when `query` is falsy or no field contains it.
    """
    if not query:
        return ""
    tokens = build_query_tokens(query)
    # Attributes are read up front; the tuple order is the match priority.
    candidates = (
        ("Form", getattr(qdro, 'form_name', None)),
        ("Petitioner", getattr(qdro, 'pet', None)),
        ("Case", getattr(qdro, 'case_number', None)),
    )
    needle = query.lower()
    for label, raw in candidates:
        if not raw:
            continue
        text = str(raw)
        if needle in text.lower():
            return f"{label}: {highlight_text(text, tokens)}"
    return ""