fix(search): tag-aware snippet truncation prevents cutting inside <mark> pairs (GIT-5)

The old truncation counted <mark></mark> HTML tags (~13 chars per keyword)
as visible characters, causing over-aggressive truncation. When a cut
landed inside a tag pair, render_snippet would render highlighted text
as muted gray instead of bold yellow.

New truncate_snippet() walks through markup counting only visible
characters, respects tag boundaries, and always closes an open <mark>
before appending ellipsis. Includes 6 unit tests.
This commit is contained in:
teernisse
2026-03-12 09:15:34 -04:00
parent 44431667e8
commit 36b361a50a

View File

@@ -337,6 +337,75 @@ fn parse_json_array(json: &str) -> Vec<String> {
.collect()
}
/// Truncate a snippet to `max_visible` visible characters, respecting `<mark>` tag boundaries.
///
/// `<mark>` and `</mark>` are treated as zero-width: only the text between and around
/// them counts toward the limit. A snippet whose visible text already fits within
/// `max_visible` is returned unchanged, even if the raw string (tags included) is longer.
///
/// When truncation is needed, the output keeps the first `max_visible - 3` visible
/// characters, closes a `<mark>` that is still open at the cut point, and appends `...`,
/// so the visible width of the result (ellipsis included) never exceeds `max_visible`.
/// The cut is never placed inside a tag.
///
/// If `max_visible` is too small to hold the ellipsis plus at least one character
/// (i.e. `max_visible < 4`), the snippet is returned as-is.
fn truncate_snippet(snippet: &str, max_visible: usize) -> String {
    const OPEN_TAG: &str = "<mark>";
    const CLOSE_TAG: &str = "</mark>";
    const ELLIPSIS: &str = "...";

    // Not enough room for "..." plus any text: leave the snippet alone.
    if max_visible <= ELLIPSIS.len() {
        return snippet.to_string();
    }
    // Number of visible characters kept in front of the ellipsis when truncating.
    let budget = max_visible - ELLIPSIS.len();

    let mut visible = 0usize;
    let mut in_mark = false;
    let mut pos = 0usize;
    // Byte offset just past the `budget`-th visible character, and whether a
    // `<mark>` was open there. Only used if the snippet turns out to be too long.
    let mut cut: Option<(usize, bool)> = None;

    while let Some(ch) = snippet[pos..].chars().next() {
        let rest = &snippet[pos..];
        if rest.starts_with(OPEN_TAG) {
            in_mark = true;
            pos += OPEN_TAG.len();
        } else if rest.starts_with(CLOSE_TAG) {
            in_mark = false;
            pos += CLOSE_TAG.len();
        } else {
            pos += ch.len_utf8();
            visible += 1;
            if visible == budget {
                cut = Some((pos, in_mark));
            }
            if visible > max_visible {
                // Already known to overflow; no need to scan the rest.
                break;
            }
        }
    }

    match cut {
        // Truncate only when the visible text really exceeds the limit. Comparing
        // against `max_visible` (not `budget`) keeps snippets that fit intact.
        Some((end, mark_open)) if visible > max_visible => {
            let mut result = String::with_capacity(end + CLOSE_TAG.len() + ELLIPSIS.len());
            result.push_str(&snippet[..end]);
            if mark_open {
                // Keep the tag pair balanced for the renderer.
                result.push_str(CLOSE_TAG);
            }
            result.push_str(ELLIPSIS);
            result
        }
        _ => snippet.to_string(),
    }
}
/// Render FTS snippet with `<mark>` tags as terminal highlight style.
fn render_snippet(snippet: &str) -> String {
let mut result = String::new();
@@ -429,17 +498,13 @@ pub fn print_search_results(response: &SearchResponse, explain: bool) {
}
println!("{indent}{}", meta_parts.join(&sep));
// Phase 5: limit snippet to ~2 terminal lines
// Phase 5: limit snippet to ~2 terminal lines.
// Truncate based on visible text length (excluding <mark></mark> tags)
// to avoid cutting inside a highlight tag pair.
let max_snippet_width =
render::terminal_width().saturating_sub(render::visible_width(&indent));
let max_snippet_chars = max_snippet_width.saturating_mul(2);
let snippet = if result.snippet.chars().count() > max_snippet_chars && max_snippet_chars > 3
{
let truncated: String = result.snippet.chars().take(max_snippet_chars - 3).collect();
format!("{truncated}...")
} else {
result.snippet.clone()
};
let snippet = truncate_snippet(&result.snippet, max_snippet_chars);
let rendered = render_snippet(&snippet);
println!("{indent}{rendered}");
@@ -527,3 +592,64 @@ pub fn print_search_results_json(
Err(e) => eprintln!("Error serializing to JSON: {e}"),
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A snippet far below the limit comes back verbatim.
    #[test]
    fn truncate_snippet_short_text_unchanged() {
        let input = "hello world";
        assert_eq!(truncate_snippet(input, 100), "hello world");
    }

    /// Tag-free text over the limit is cut and ends with an ellipsis.
    #[test]
    fn truncate_snippet_plain_text_truncated() {
        let input = "this is a long string that exceeds the limit";
        let result = truncate_snippet(input, 20);
        assert!(result.ends_with("..."), "got: {result}");
        // No tags here, so the character count is the visible width; it must stay within 20.
        let width = result.chars().count();
        assert!(width <= 20, "got: {result}");
    }

    /// Truncating after a highlight must leave opening and closing tags paired.
    #[test]
    fn truncate_snippet_preserves_mark_tags() {
        let input = "some text <mark>keyword</mark> and more text here that is long";
        let result = truncate_snippet(input, 30);
        // Every <mark> in the output needs a matching </mark>.
        let occurrences = |tag: &str| result.matches(tag).count();
        assert_eq!(
            occurrences("<mark>"),
            occurrences("</mark>"),
            "unbalanced tags in: {result}"
        );
    }

    /// When the limit is reached in the text preceding a highlight, the tag is dropped entirely.
    #[test]
    fn truncate_snippet_cuts_before_mark_tag() {
        let input = "a]very long prefix that exceeds the limit <mark>word</mark>";
        let result = truncate_snippet(input, 15);
        assert!(result.ends_with("..."), "got: {result}");
        // The cut lands before the highlight, so no opening tag may survive.
        let has_open_tag = result.contains("<mark>");
        assert!(!has_open_tag, "should not include tag: {result}");
    }

    /// Tags contribute nothing to the measured width.
    #[test]
    fn truncate_snippet_does_not_count_tags_as_visible() {
        // Raw length is 39 chars including tags; the visible text is only 26 chars.
        let input = "prefix <mark>keyword</mark> suffix text";
        // A limit of 35 is below the raw length but above the visible length,
        // so the snippet must pass through untouched.
        let result = truncate_snippet(input, 35);
        assert_eq!(result, input, "should not truncate when visible text fits");
    }

    /// Limits too small to hold the ellipsis plus text leave the snippet alone.
    #[test]
    fn truncate_snippet_small_limit_returns_as_is() {
        let input = "text <mark>x</mark>";
        // Guard clause: a limit of 3 returns the input unchanged.
        assert_eq!(truncate_snippet(input, 3), input);
    }
}