use serde::Serialize; use super::indexer::method_rank; use super::spec::SpecIndex; // --------------------------------------------------------------------------- // Public types // --------------------------------------------------------------------------- #[derive(Debug, Clone, Serialize)] pub struct SearchResult { pub result_type: SearchResultType, pub name: String, pub method: Option, pub summary: Option, pub rank: usize, pub score: u32, pub matches: Vec, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] #[serde(rename_all = "snake_case")] pub enum SearchResultType { Endpoint, Schema, } impl SearchResultType { pub(crate) fn ordinal(self) -> u8 { match self { Self::Endpoint => 0, Self::Schema => 1, } } } #[derive(Debug, Clone, Serialize)] pub struct Match { pub field: String, pub snippet: String, } #[derive(Debug, Clone)] pub struct SearchOptions { pub search_paths: bool, pub search_descriptions: bool, pub search_schemas: bool, pub case_sensitive: bool, pub exact: bool, pub limit: usize, } impl Default for SearchOptions { fn default() -> Self { Self { search_paths: true, search_descriptions: true, search_schemas: true, case_sensitive: false, exact: false, limit: 20, } } } // --------------------------------------------------------------------------- // Field weights // --------------------------------------------------------------------------- const WEIGHT_PATH: f64 = 10.0; const WEIGHT_SUMMARY: f64 = 5.0; const WEIGHT_DESCRIPTION: f64 = 2.0; const WEIGHT_SCHEMA_NAME: f64 = 8.0; // --------------------------------------------------------------------------- // Search engine // --------------------------------------------------------------------------- pub struct SearchEngine<'a> { index: &'a SpecIndex, } impl<'a> SearchEngine<'a> { pub fn new(index: &'a SpecIndex) -> Self { Self { index } } pub fn search(&self, query: &str, opts: &SearchOptions) -> Vec { let query = query.trim(); if query.is_empty() { return Vec::new(); } let terms = tokenize(query, opts.exact); let total_terms = terms.len(); // Pre-lowercase terms once (not once per endpoint x field). let lowered_terms: Vec = if opts.case_sensitive { terms.clone() } else { terms.iter().map(|t| t.to_lowercase()).collect() }; let mut results: Vec = Vec::new(); // Search endpoints if opts.search_paths || opts.search_descriptions { for ep in &self.index.endpoints { let mut raw_score: f64 = 0.0; let mut matched_terms: usize = 0; let mut matches: Vec = Vec::new(); // Pre-lowercase each field once per endpoint (not once per term). let path_lc = if !opts.case_sensitive { Some(ep.path.to_lowercase()) } else { None }; let summary_lc = if !opts.case_sensitive { ep.summary.as_deref().map(str::to_lowercase) } else { None }; let desc_lc = if !opts.case_sensitive { ep.description.as_deref().map(str::to_lowercase) } else { None }; for (i, term) in terms.iter().enumerate() { let lc_term = &lowered_terms[i]; let mut term_matched = false; if opts.search_paths { let haystack = if opts.case_sensitive { &ep.path } else { path_lc.as_ref().unwrap() }; if haystack.contains(lc_term.as_str()) { raw_score += WEIGHT_PATH; matches.push(Match { field: "path".into(), snippet: safe_snippet(&ep.path, term, opts.case_sensitive), }); term_matched = true; } } if (opts.search_descriptions || opts.search_paths) && let Some(ref summary) = ep.summary { let haystack = if opts.case_sensitive { summary.as_str() } else { summary_lc.as_deref().unwrap_or("") }; if haystack.contains(lc_term.as_str()) { raw_score += WEIGHT_SUMMARY; matches.push(Match { field: "summary".into(), snippet: safe_snippet(summary, term, opts.case_sensitive), }); term_matched = true; } } if opts.search_descriptions && let Some(ref desc) = ep.description { let haystack = if opts.case_sensitive { desc.as_str() } else { desc_lc.as_deref().unwrap_or("") }; if haystack.contains(lc_term.as_str()) { raw_score += WEIGHT_DESCRIPTION; matches.push(Match { field: "description".into(), snippet: safe_snippet(desc, term, opts.case_sensitive), }); term_matched = true; } } if term_matched { matched_terms += 1; } } if raw_score > 0.0 { let coverage_boost = 1.0 + (matched_terms as f64 / total_terms.max(1) as f64); let final_score = raw_score * coverage_boost; let quantized = (final_score * 100.0).round() as u32; results.push(SearchResult { result_type: SearchResultType::Endpoint, name: ep.path.clone(), method: Some(ep.method.clone()), summary: ep.summary.clone(), rank: 0, // assigned after sort score: quantized, matches, }); } } } // Search schemas if opts.search_schemas { for schema in &self.index.schemas { let mut raw_score: f64 = 0.0; let mut matched_terms: usize = 0; let mut matches: Vec = Vec::new(); let name_lc = if !opts.case_sensitive { Some(schema.name.to_lowercase()) } else { None }; for (i, term) in terms.iter().enumerate() { let lc_term = &lowered_terms[i]; let haystack = if opts.case_sensitive { &schema.name } else { name_lc.as_ref().unwrap() }; if haystack.contains(lc_term.as_str()) { raw_score += WEIGHT_SCHEMA_NAME; matches.push(Match { field: "schema_name".into(), snippet: safe_snippet(&schema.name, term, opts.case_sensitive), }); matched_terms += 1; } } if raw_score > 0.0 { let coverage_boost = 1.0 + (matched_terms as f64 / total_terms.max(1) as f64); let final_score = raw_score * coverage_boost; let quantized = (final_score * 100.0).round() as u32; results.push(SearchResult { result_type: SearchResultType::Schema, name: schema.name.clone(), method: None, summary: None, rank: 0, score: quantized, matches, }); } } } // Deterministic sort: score DESC, type ordinal ASC, name ASC, method_rank ASC results.sort_by(|a, b| { b.score .cmp(&a.score) .then_with(|| a.result_type.ordinal().cmp(&b.result_type.ordinal())) .then_with(|| a.name.cmp(&b.name)) .then_with(|| { let a_rank = a.method.as_deref().map(method_rank).unwrap_or(u8::MAX); let b_rank = b.method.as_deref().map(method_rank).unwrap_or(u8::MAX); a_rank.cmp(&b_rank) }) }); // Assign 1-based ranks and apply limit results.truncate(opts.limit); for (i, result) in results.iter_mut().enumerate() { result.rank = i + 1; } results } } // --------------------------------------------------------------------------- // Helpers // --------------------------------------------------------------------------- fn tokenize(query: &str, exact: bool) -> Vec { if exact { vec![query.to_string()] } else { query.split_whitespace().map(String::from).collect() } } /// Build a Unicode-safe snippet around the first occurrence of `needle` in /// `haystack`. The context window is 50 characters. Ellipses are added when /// the snippet is truncated. fn safe_snippet(haystack: &str, needle: &str, case_sensitive: bool) -> String { // Find the match position using char-based search to avoid byte-position // mismatches between the original and lowercased strings (which can differ // in byte length for certain Unicode characters, causing panics). let haystack_chars: Vec = haystack.chars().collect(); let needle_chars: Vec = if case_sensitive { needle.chars().collect() } else { needle.chars().flat_map(char::to_lowercase).collect() }; let char_start = if needle_chars.is_empty() { 0 } else { let mut found = None; let search_chars: Vec = if case_sensitive { haystack_chars.clone() } else { haystack_chars .iter() .flat_map(|c| c.to_lowercase()) .collect() }; // Scan through search_chars for the needle 'outer: for i in 0..search_chars.len().saturating_sub(needle_chars.len() - 1) { for (j, nc) in needle_chars.iter().enumerate() { if search_chars[i + j] != *nc { continue 'outer; } } // Map position in search_chars back to position in haystack_chars. // When case-insensitive, lowercasing can expand characters (e.g. // U+0130 -> 'i' + U+0307), so we need to walk both iterators in // parallel to find the corresponding haystack_chars index. if case_sensitive { found = Some(i); } else { let mut search_idx = 0; for (hay_idx, hay_char) in haystack_chars.iter().enumerate() { if search_idx >= i { found = Some(hay_idx); break; } search_idx += hay_char.to_lowercase().count(); } if found.is_none() && search_idx >= i { found = Some(haystack_chars.len()); } } break; } match found { Some(pos) => pos, None => return haystack_chars.iter().take(50).collect(), } }; let needle_char_len = needle.chars().count(); let total_chars = haystack_chars.len(); const WINDOW: usize = 50; // Centre the window around the match. let context_budget = WINDOW.saturating_sub(needle_char_len); let left_context = context_budget / 2; let snippet_start = char_start.saturating_sub(left_context); let snippet_end = (snippet_start + WINDOW).min(total_chars); let prefix = if snippet_start > 0 { "..." } else { "" }; let suffix = if snippet_end < total_chars { "..." } else { "" }; let snippet_body: String = haystack_chars[snippet_start..snippet_end].iter().collect(); format!("{prefix}{snippet_body}{suffix}") } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; use crate::core::spec::{ IndexInfo, IndexedEndpoint, IndexedParam, IndexedSchema, IndexedTag, SpecIndex, }; fn petstore_index() -> SpecIndex { SpecIndex { index_version: 1, generation: 1, content_hash: "sha256:test".into(), openapi: "3.0.3".into(), info: IndexInfo { title: "Petstore".into(), version: "1.0.0".into(), }, endpoints: vec![ IndexedEndpoint { path: "/pets".into(), method: "GET".into(), summary: Some("List all pets".into()), description: Some("Returns a list of pets from the store".into()), operation_id: Some("listPets".into()), tags: vec!["pets".into()], deprecated: false, parameters: vec![IndexedParam { name: "limit".into(), location: "query".into(), required: false, description: Some("Max items".into()), }], request_body_required: false, request_body_content_types: vec![], security_schemes: vec![], security_required: false, operation_ptr: "/paths/~1pets/get".into(), }, IndexedEndpoint { path: "/pets".into(), method: "POST".into(), summary: Some("Create a pet".into()), description: None, operation_id: Some("createPet".into()), tags: vec!["pets".into()], deprecated: false, parameters: vec![], request_body_required: true, request_body_content_types: vec!["application/json".into()], security_schemes: vec![], security_required: false, operation_ptr: "/paths/~1pets/post".into(), }, IndexedEndpoint { path: "/pets/{petId}".into(), method: "GET".into(), summary: Some("Info for a specific pet".into()), description: Some("Detailed information about a single pet".into()), operation_id: Some("showPetById".into()), tags: vec!["pets".into()], deprecated: false, parameters: vec![IndexedParam { name: "petId".into(), location: "path".into(), required: true, description: Some("The id of the pet".into()), }], request_body_required: false, request_body_content_types: vec![], security_schemes: vec![], security_required: false, operation_ptr: "/paths/~1pets~1{petId}/get".into(), }, IndexedEndpoint { path: "/store/inventory".into(), method: "GET".into(), summary: Some("Returns store inventory".into()), description: None, operation_id: Some("getInventory".into()), tags: vec!["store".into()], deprecated: false, parameters: vec![], request_body_required: false, request_body_content_types: vec![], security_schemes: vec![], security_required: false, operation_ptr: "/paths/~1store~1inventory/get".into(), }, ], schemas: vec![ IndexedSchema { name: "Pet".into(), schema_ptr: "/components/schemas/Pet".into(), }, IndexedSchema { name: "Error".into(), schema_ptr: "/components/schemas/Error".into(), }, IndexedSchema { name: "PetList".into(), schema_ptr: "/components/schemas/PetList".into(), }, ], tags: vec![ IndexedTag { name: "pets".into(), description: Some("Pet operations".into()), endpoint_count: 3, }, IndexedTag { name: "store".into(), description: Some("Store operations".into()), endpoint_count: 1, }, ], } } #[test] fn test_search_basic() { let index = petstore_index(); let engine = SearchEngine::new(&index); let opts = SearchOptions::default(); let results = engine.search("pet", &opts); assert!( !results.is_empty(), "should find 'pet' in petstore endpoints" ); // All results should mention pet somewhere for r in &results { let has_pet = r .matches .iter() .any(|m| m.snippet.to_lowercase().contains("pet")); assert!(has_pet, "result {:?} should match 'pet'", r.name); } // Ranks should be sequential 1-based for (i, r) in results.iter().enumerate() { assert_eq!(r.rank, i + 1, "rank should be 1-based sequential"); } } #[test] fn test_search_scores_deterministic() { let index = petstore_index(); let engine = SearchEngine::new(&index); let opts = SearchOptions::default(); let run1 = engine.search("pet", &opts); let run2 = engine.search("pet", &opts); assert_eq!(run1.len(), run2.len()); for (a, b) in run1.iter().zip(run2.iter()) { assert_eq!(a.score, b.score, "scores should be identical across runs"); assert_eq!(a.rank, b.rank, "ranks should be identical across runs"); assert_eq!(a.name, b.name, "names should be identical across runs"); assert_eq!( a.method, b.method, "methods should be identical across runs" ); } } #[test] fn test_search_exact_mode() { let index = petstore_index(); let engine = SearchEngine::new(&index); // "list all" as two tokens: should match broadly let loose_opts = SearchOptions { exact: false, ..SearchOptions::default() }; let loose = engine.search("list all", &loose_opts); // "list all" as exact phrase: only matches if that exact phrase appears let exact_opts = SearchOptions { exact: true, ..SearchOptions::default() }; let exact = engine.search("list all", &exact_opts); // Exact should be a subset of (or equal to) loose results assert!( exact.len() <= loose.len(), "exact mode should return fewer or equal results" ); // The exact match should find "List all pets" summary assert!( !exact.is_empty(), "exact 'list all' should match 'List all pets'" ); } #[test] fn test_search_case_sensitive() { let index = petstore_index(); let engine = SearchEngine::new(&index); // Case-insensitive (default): "PET" matches "pet", "/pets", etc. let insensitive = SearchOptions { case_sensitive: false, ..SearchOptions::default() }; let results_insensitive = engine.search("PET", &insensitive); // Case-sensitive: "PET" should NOT match lowercase "pet" or "/pets" let sensitive = SearchOptions { case_sensitive: true, ..SearchOptions::default() }; let results_sensitive = engine.search("PET", &sensitive); assert!( results_sensitive.len() < results_insensitive.len(), "case-sensitive 'PET' should match fewer results than case-insensitive" ); } #[test] fn test_safe_snippet_unicode() { // Emoji and multi-byte characters let haystack = "Hello \u{1F600} world of pets and \u{1F431} cats everywhere"; let snippet = safe_snippet(haystack, "pets", false); assert!( snippet.contains("pets"), "snippet should contain the search term" ); // Must not panic on multi-byte boundaries } #[test] fn test_safe_snippet_truncation() { let long = "a".repeat(200); let haystack = format!("{long}needle{long}"); let snippet = safe_snippet(&haystack, "needle", false); assert!(snippet.contains("needle")); assert!( snippet.contains("..."), "should have ellipsis for truncation" ); // Snippet should be around 50 chars + ellipsis markers let body_len = snippet.replace("...", "").chars().count(); assert!(body_len <= 50, "snippet body should be at most 50 chars"); } #[test] fn test_empty_query_returns_empty() { let index = petstore_index(); let engine = SearchEngine::new(&index); let opts = SearchOptions::default(); assert!(engine.search("", &opts).is_empty()); assert!(engine.search(" ", &opts).is_empty()); } #[test] fn test_search_limit() { let index = petstore_index(); let engine = SearchEngine::new(&index); let opts = SearchOptions { limit: 2, ..SearchOptions::default() }; let results = engine.search("pet", &opts); assert!(results.len() <= 2, "should respect limit"); } #[test] fn test_search_schemas_only() { let index = petstore_index(); let engine = SearchEngine::new(&index); let opts = SearchOptions { search_paths: false, search_descriptions: false, search_schemas: true, ..SearchOptions::default() }; let results = engine.search("Pet", &opts); assert!(!results.is_empty()); for r in &results { assert_eq!( r.result_type, SearchResultType::Schema, "should only return schemas" ); } } #[test] fn test_search_paths_only() { let index = petstore_index(); let engine = SearchEngine::new(&index); let opts = SearchOptions { search_paths: true, search_descriptions: false, search_schemas: false, ..SearchOptions::default() }; let results = engine.search("store", &opts); assert!(!results.is_empty()); for r in &results { assert_eq!( r.result_type, SearchResultType::Endpoint, "should only return endpoints" ); } } #[test] fn test_multi_term_coverage_boost() { let index = petstore_index(); let engine = SearchEngine::new(&index); let opts = SearchOptions::default(); // "pets store" has two terms; an endpoint matching both gets higher coverage let results = engine.search("pets list", &opts); if results.len() >= 2 { // The first result should have a higher score due to more term matches assert!( results[0].score >= results[1].score, "results should be sorted by score descending" ); } } #[test] fn test_no_match_returns_empty() { let index = petstore_index(); let engine = SearchEngine::new(&index); let opts = SearchOptions::default(); let results = engine.search("zzzznotfound", &opts); assert!( results.is_empty(), "gibberish query should return no results" ); } }