fix(search): tag-aware snippet truncation prevents cutting inside <mark> pairs (GIT-5)
The old truncation counted <mark></mark> HTML tags (~13 chars per keyword) as visible characters, causing over-aggressive truncation. When a cut landed inside a tag pair, render_snippet would render highlighted text as muted gray instead of bold yellow. New truncate_snippet() walks through markup counting only visible characters, respects tag boundaries, and always closes an open <mark> before appending ellipsis. Includes 6 unit tests.
This commit is contained in:
@@ -337,6 +337,75 @@ fn parse_json_array(json: &str) -> Vec<String> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Truncate a snippet to at most `max_visible` visible characters, respecting `<mark>` tag
/// boundaries.
///
/// Only visible text counts toward the limit: the `<mark>` / `</mark>` tags of a properly
/// closed pair are free. A snippet whose visible text already fits within `max_visible` is
/// returned unchanged (no ellipsis is added). Otherwise the visible text is cut to
/// `max_visible - 3` characters and `...` is appended. When the cut falls inside a
/// highlighted span, the shortened span is still closed with `</mark>` before the ellipsis,
/// so the output never contains a half-open pair (which would break `render_snippet`
/// highlighting).
///
/// A `<mark>` that has no matching `</mark>` is not a highlight: it and everything after it
/// are treated as plain visible text and are subject to the same limit.
///
/// Limits below 4 leave no room for any text plus the ellipsis, so the snippet is returned
/// as-is in that case.
fn truncate_snippet(snippet: &str, max_visible: usize) -> String {
    const OPEN: &str = "<mark>";
    const CLOSE: &str = "</mark>";
    const ELLIPSIS: &str = "...";

    if max_visible < 4 {
        return snippet.to_string();
    }

    // Split the snippet into `(is_highlighted, visible_text)` segments.
    let mut segments: Vec<(bool, &str)> = Vec::new();
    let mut rest = snippet;
    while !rest.is_empty() {
        // Byte offsets of the next complete `<mark>…</mark>` pair, if there is one.
        let pair = rest.find(OPEN).and_then(|open_at| {
            let inner_start = open_at + OPEN.len();
            rest[inner_start..]
                .find(CLOSE)
                .map(|inner_len| (open_at, inner_start, inner_start + inner_len))
        });
        match pair {
            Some((open_at, inner_start, inner_end)) => {
                if open_at > 0 {
                    segments.push((false, &rest[..open_at]));
                }
                segments.push((true, &rest[inner_start..inner_end]));
                rest = &rest[inner_end + CLOSE.len()..];
            }
            None => {
                // No (further) complete pair: the remainder is plain text, including any
                // stray unclosed `<mark>`.
                segments.push((false, rest));
                break;
            }
        }
    }

    // Decide up front whether anything has to be cut at all, so that text which fits is
    // never shortened and never gets a spurious ellipsis.
    let total_visible: usize = segments.iter().map(|(_, text)| text.chars().count()).sum();
    if total_visible <= max_visible {
        return snippet.to_string();
    }

    let push_segment = |out: &mut String, highlighted: bool, text: &str| {
        if highlighted {
            out.push_str(OPEN);
            out.push_str(text);
            out.push_str(CLOSE);
        } else {
            out.push_str(text);
        }
    };

    // Visible characters we may still emit before the ellipsis.
    let mut budget = max_visible - ELLIPSIS.len();
    let mut out = String::new();
    for (highlighted, text) in segments {
        let len = text.chars().count();
        if len <= budget {
            push_segment(&mut out, highlighted, text);
            budget -= len;
            continue;
        }
        // This segment does not fit: keep the part that does. With nothing left in the
        // budget we emit nothing, which also avoids creating an empty `<mark></mark>`.
        if budget > 0 {
            // Slice on a char boundary so multi-byte text is never split mid-character.
            let cut = text
                .char_indices()
                .nth(budget)
                .map_or(text.len(), |(idx, _)| idx);
            push_segment(&mut out, highlighted, &text[..cut]);
        }
        break;
    }
    out.push_str(ELLIPSIS);
    out
}
|
||||
|
||||
/// Render FTS snippet with `<mark>` tags as terminal highlight style.
|
||||
fn render_snippet(snippet: &str) -> String {
|
||||
let mut result = String::new();
|
||||
@@ -429,17 +498,13 @@ pub fn print_search_results(response: &SearchResponse, explain: bool) {
|
||||
}
|
||||
println!("{indent}{}", meta_parts.join(&sep));
|
||||
|
||||
// Phase 5: limit snippet to ~2 terminal lines
|
||||
// Phase 5: limit snippet to ~2 terminal lines.
|
||||
// Truncate based on visible text length (excluding <mark></mark> tags)
|
||||
// to avoid cutting inside a highlight tag pair.
|
||||
let max_snippet_width =
|
||||
render::terminal_width().saturating_sub(render::visible_width(&indent));
|
||||
let max_snippet_chars = max_snippet_width.saturating_mul(2);
|
||||
let snippet = if result.snippet.chars().count() > max_snippet_chars && max_snippet_chars > 3
|
||||
{
|
||||
let truncated: String = result.snippet.chars().take(max_snippet_chars - 3).collect();
|
||||
format!("{truncated}...")
|
||||
} else {
|
||||
result.snippet.clone()
|
||||
};
|
||||
let snippet = truncate_snippet(&result.snippet, max_snippet_chars);
|
||||
let rendered = render_snippet(&snippet);
|
||||
println!("{indent}{rendered}");
|
||||
|
||||
@@ -527,3 +592,64 @@ pub fn print_search_results_json(
|
||||
Err(e) => eprintln!("Error serializing to JSON: {e}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Unit tests for `truncate_snippet`.

    use super::*;

    /// Number of characters left once every `<mark>` / `</mark>` tag is stripped.
    fn visible_len(s: &str) -> usize {
        s.replace("<mark>", "")
            .replace("</mark>", "")
            .chars()
            .count()
    }

    #[test]
    fn truncate_snippet_short_text_unchanged() {
        let s = "hello world";
        assert_eq!(truncate_snippet(s, 100), "hello world");
    }

    #[test]
    fn truncate_snippet_plain_text_truncated() {
        let s = "this is a long string that exceeds the limit";
        let result = truncate_snippet(s, 20);
        assert!(result.ends_with("..."), "got: {result}");
        // Visible chars (ellipsis included) should be <= 20
        assert!(result.chars().count() <= 20, "got: {result}");
    }

    #[test]
    fn truncate_snippet_preserves_mark_tags() {
        let s = "some text <mark>keyword</mark> and more text here that is long";
        let result = truncate_snippet(s, 30);
        // Should not cut inside a <mark> pair
        let open_count = result.matches("<mark>").count();
        let close_count = result.matches("</mark>").count();
        assert_eq!(
            open_count, close_count,
            "unbalanced tags in: {result}"
        );
        // The input is longer than the limit, so it must be cut and still honour the limit
        // once the tags are ignored.
        assert!(result.ends_with("..."), "got: {result}");
        assert!(visible_len(&result) <= 30, "got: {result}");
    }

    #[test]
    fn truncate_snippet_cuts_before_mark_tag() {
        let s = "a]very long prefix that exceeds the limit <mark>word</mark>";
        let result = truncate_snippet(s, 15);
        assert!(result.ends_with("..."), "got: {result}");
        // The <mark> tag should not appear since we truncated before reaching it
        assert!(
            !result.contains("<mark>"),
            "should not include tag: {result}"
        );
        assert!(visible_len(&result) <= 15, "got: {result}");
    }

    #[test]
    fn truncate_snippet_does_not_count_tags_as_visible() {
        // With tags, raw length is 39 chars. Without tags, visible is 26.
        let s = "prefix <mark>keyword</mark> suffix text";
        assert_eq!(s.chars().count(), 39);
        assert_eq!(visible_len(s), 26);
        // With max_visible = 35 the raw text (39 chars) is over the limit but the visible
        // text (26 chars) fits — should NOT truncate
        let result = truncate_snippet(s, 35);
        assert_eq!(result, s, "should not truncate when visible text fits");
    }

    #[test]
    fn truncate_snippet_small_limit_returns_as_is() {
        let s = "text <mark>x</mark>";
        // Very small limit should return as-is (guard clause)
        assert_eq!(truncate_snippet(s, 3), s);
    }
}
|
||||
|
||||
Reference in New Issue
Block a user