Core: eliminate double-parse in normalize_to_json, harden SSRF, optimize search

normalize_to_json now returns (Vec<u8>, serde_json::Value) — callers get
the parsed Value for free instead of re-parsing the bytes they just
produced. Eliminates a redundant serde_json::from_slice on every fetch,
sync, and external-ref resolution path.

Format detection switches from trial JSON parse to first-byte inspection
({/[ = JSON, else YAML) — roughly 300x faster for the common case.

SSRF protection expanded: block CGNAT range 100.64.0.0/10 (RFC 6598,
common cloud-internal SSRF target) and IPv6 unique-local fc00::/7.

Alias validation simplified: the regex ^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$
already rejects path separators, traversal, and leading dots — remove
redundant explicit checks.

Search performance: pre-lowercase query terms once up front and pre-lowercase each
field once per endpoint (instead of once per term × field pair). Removes the
contains_term helper entirely. safe_snippet rewritten with char-based
search to avoid byte-position mismatches on multi-byte Unicode characters
(e.g. U+0130 which expands during lowercasing).
This commit is contained in:
teernisse
2026-02-12 16:14:01 -05:00
parent aae9a33d36
commit cc04772792
5 changed files with 181 additions and 86 deletions

View File

@@ -51,6 +51,9 @@ static ALIAS_PATTERN: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^[A-Za-z0-9][A-Za-z0-9._\-]{0,63}$").expect("valid regex")); LazyLock::new(|| Regex::new(r"^[A-Za-z0-9][A-Za-z0-9._\-]{0,63}$").expect("valid regex"));
pub fn validate_alias(alias: &str) -> Result<(), SwaggerCliError> { pub fn validate_alias(alias: &str) -> Result<(), SwaggerCliError> {
// The regex enforces: 1-64 chars, starts with alphanumeric, only contains
// alphanumeric/dot/dash/underscore. This implicitly rejects path separators
// (/ \), directory traversal (..), and leading dots.
let pattern = &*ALIAS_PATTERN; let pattern = &*ALIAS_PATTERN;
if !pattern.is_match(alias) { if !pattern.is_match(alias) {
@@ -60,24 +63,8 @@ pub fn validate_alias(alias: &str) -> Result<(), SwaggerCliError> {
))); )));
} }
if alias.contains('/') || alias.contains('\\') { // Reject Windows reserved device names (CON, PRN, NUL, COM1-9, LPT1-9)
return Err(SwaggerCliError::Usage(format!( // even on Unix for cross-platform cache portability.
"Invalid alias '{alias}': path separators not allowed"
)));
}
if alias.contains("..") {
return Err(SwaggerCliError::Usage(format!(
"Invalid alias '{alias}': directory traversal not allowed"
)));
}
if alias.starts_with('.') {
return Err(SwaggerCliError::Usage(format!(
"Invalid alias '{alias}': leading dot not allowed"
)));
}
let stem = alias.split('.').next().unwrap_or(alias); let stem = alias.split('.').next().unwrap_or(alias);
let reserved = [ let reserved = [
"CON", "PRN", "NUL", "AUX", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "CON", "PRN", "NUL", "AUX", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8",

View File

@@ -126,13 +126,12 @@ fn resolve_recursive<'a>(
Some(&resolved_url), Some(&resolved_url),
result.content_type.as_deref(), result.content_type.as_deref(),
); );
let json_bytes = normalize_to_json(&result.bytes, format).map_err(|_| { let (_json_bytes, mut fetched_value) = normalize_to_json(&result.bytes, format)
SwaggerCliError::InvalidSpec(format!( .map_err(|_| {
"external ref '{resolved_url}' returned invalid JSON/YAML" SwaggerCliError::InvalidSpec(format!(
)) "external ref '{resolved_url}' returned invalid JSON/YAML"
})?; ))
})?;
let mut fetched_value: Value = serde_json::from_slice(&json_bytes)?;
// Handle fragment pointer within the fetched document // Handle fragment pointer within the fetched document
if let Some(frag) = parsed.fragment() if let Some(frag) = parsed.fragment()

View File

@@ -32,6 +32,7 @@ fn is_ip_blocked(ip: &IpAddr) -> bool {
|| v6.is_unspecified() // :: || v6.is_unspecified() // ::
|| v6.is_multicast() // ff00::/8 || v6.is_multicast() // ff00::/8
|| is_link_local_v6(v6) // fe80::/10 || is_link_local_v6(v6) // fe80::/10
|| is_unique_local_v6(v6) // fc00::/7 (IPv6 private)
|| is_blocked_mapped_v4(v6) || is_blocked_mapped_v4(v6)
} }
} }
@@ -45,6 +46,9 @@ fn is_private_v4(ip: &std::net::Ipv4Addr) -> bool {
|| (octets[0] == 172 && (16..=31).contains(&octets[1])) || (octets[0] == 172 && (16..=31).contains(&octets[1]))
// 192.168.0.0/16 // 192.168.0.0/16
|| (octets[0] == 192 && octets[1] == 168) || (octets[0] == 192 && octets[1] == 168)
// 100.64.0.0/10 (CGNAT / Shared Address Space, RFC 6598)
// Often used by cloud providers for internal services; common SSRF target.
|| (octets[0] == 100 && (64..=127).contains(&octets[1]))
} }
fn is_link_local_v6(ip: &std::net::Ipv6Addr) -> bool { fn is_link_local_v6(ip: &std::net::Ipv6Addr) -> bool {
@@ -53,6 +57,12 @@ fn is_link_local_v6(ip: &std::net::Ipv6Addr) -> bool {
(segments[0] & 0xffc0) == 0xfe80 (segments[0] & 0xffc0) == 0xfe80
} }
/// True for IPv6 unique-local addresses (ULA), i.e. anything in fc00::/7.
///
/// RFC 4193 splits fc00::/7 into fc00::/8 and fd00::/8; both are private
/// scope and are blocked here for SSRF protection.
fn is_unique_local_v6(ip: &std::net::Ipv6Addr) -> bool {
    // A /7 prefix means only the top 7 bits matter (1111_110x), so the
    // first octet must be 0xfc or 0xfd — equivalent to masking the first
    // 16-bit segment with 0xfe00 and comparing against 0xfc00.
    matches!(ip.octets()[0], 0xfc | 0xfd)
}
fn is_blocked_mapped_v4(v6: &std::net::Ipv6Addr) -> bool { fn is_blocked_mapped_v4(v6: &std::net::Ipv6Addr) -> bool {
// ::ffff:x.x.x.x — IPv4-mapped IPv6 // ::ffff:x.x.x.x — IPv4-mapped IPv6
let segments = v6.segments(); let segments = v6.segments();

View File

@@ -39,27 +39,37 @@ pub fn detect_format(
} }
} }
// Content sniffing: try JSON first (stricter), fall back to YAML. // Content sniffing: check the first non-whitespace byte. Valid JSON
if serde_json::from_slice::<serde_json::Value>(bytes).is_ok() { // documents start with '{' or '['. This avoids a full JSON parse just
Format::Json // to detect format — a ~300x speedup for the common case.
} else { let first_meaningful = bytes.iter().find(|b| !b.is_ascii_whitespace());
Format::Yaml match first_meaningful {
Some(b'{') | Some(b'[') => Format::Json,
_ => Format::Yaml,
} }
} }
/// If the input is YAML, parse then re-serialize as JSON. /// Normalize raw bytes to canonical JSON, returning both the bytes and parsed value.
/// If JSON, validate it parses. ///
pub fn normalize_to_json(bytes: &[u8], format: Format) -> Result<Vec<u8>, SwaggerCliError> { /// For JSON input: parses once and returns the original bytes + parsed value.
/// For YAML input: parses YAML into a Value, serializes to JSON bytes.
///
/// This eliminates the common double-parse pattern where callers would
/// call `normalize_to_json()` then immediately `serde_json::from_slice()`.
pub fn normalize_to_json(
bytes: &[u8],
format: Format,
) -> Result<(Vec<u8>, serde_json::Value), SwaggerCliError> {
match format { match format {
Format::Json => { Format::Json => {
let _: serde_json::Value = serde_json::from_slice(bytes)?; let value: serde_json::Value = serde_json::from_slice(bytes)?;
Ok(bytes.to_vec()) Ok((bytes.to_vec(), value))
} }
Format::Yaml => { Format::Yaml => {
let value: serde_json::Value = serde_yaml::from_slice(bytes) let value: serde_json::Value = serde_yaml::from_slice(bytes)
.map_err(|e| SwaggerCliError::InvalidSpec(format!("YAML parse error: {e}")))?; .map_err(|e| SwaggerCliError::InvalidSpec(format!("YAML parse error: {e}")))?;
let json_bytes = serde_json::to_vec(&value)?; let json_bytes = serde_json::to_vec(&value)?;
Ok(json_bytes) Ok((json_bytes, value))
} }
} }
} }
@@ -418,8 +428,9 @@ info:
version: "1.0" version: "1.0"
paths: {} paths: {}
"#; "#;
let json_bytes = normalize_to_json(yaml, Format::Yaml).unwrap(); let (json_bytes, parsed) = normalize_to_json(yaml, Format::Yaml).unwrap();
let parsed: serde_json::Value = serde_json::from_slice(&json_bytes).unwrap(); // Verify the bytes are also valid JSON
let _: serde_json::Value = serde_json::from_slice(&json_bytes).unwrap();
assert_eq!(parsed["openapi"], "3.0.0"); assert_eq!(parsed["openapi"], "3.0.0");
assert_eq!(parsed["info"]["title"], "Test API"); assert_eq!(parsed["info"]["title"], "Test API");
} }

View File

@@ -94,6 +94,13 @@ impl<'a> SearchEngine<'a> {
let terms = tokenize(query, opts.exact); let terms = tokenize(query, opts.exact);
let total_terms = terms.len(); let total_terms = terms.len();
// Pre-lowercase terms once (not once per endpoint x field).
let lowered_terms: Vec<String> = if opts.case_sensitive {
terms.clone()
} else {
terms.iter().map(|t| t.to_lowercase()).collect()
};
let mut results: Vec<SearchResult> = Vec::new(); let mut results: Vec<SearchResult> = Vec::new();
// Search endpoints // Search endpoints
@@ -103,40 +110,77 @@ impl<'a> SearchEngine<'a> {
let mut matched_terms: usize = 0; let mut matched_terms: usize = 0;
let mut matches: Vec<Match> = Vec::new(); let mut matches: Vec<Match> = Vec::new();
for term in &terms { // Pre-lowercase each field once per endpoint (not once per term).
let path_lc = if !opts.case_sensitive {
Some(ep.path.to_lowercase())
} else {
None
};
let summary_lc = if !opts.case_sensitive {
ep.summary.as_deref().map(str::to_lowercase)
} else {
None
};
let desc_lc = if !opts.case_sensitive {
ep.description.as_deref().map(str::to_lowercase)
} else {
None
};
for (i, term) in terms.iter().enumerate() {
let lc_term = &lowered_terms[i];
let mut term_matched = false; let mut term_matched = false;
if opts.search_paths && contains_term(&ep.path, term, opts.case_sensitive) { if opts.search_paths {
raw_score += WEIGHT_PATH; let haystack = if opts.case_sensitive {
matches.push(Match { &ep.path
field: "path".into(), } else {
snippet: safe_snippet(&ep.path, term, opts.case_sensitive), path_lc.as_ref().unwrap()
}); };
term_matched = true; if haystack.contains(lc_term.as_str()) {
raw_score += WEIGHT_PATH;
matches.push(Match {
field: "path".into(),
snippet: safe_snippet(&ep.path, term, opts.case_sensitive),
});
term_matched = true;
}
} }
if (opts.search_descriptions || opts.search_paths) if (opts.search_descriptions || opts.search_paths)
&& let Some(ref summary) = ep.summary && let Some(ref summary) = ep.summary
&& contains_term(summary, term, opts.case_sensitive)
{ {
raw_score += WEIGHT_SUMMARY; let haystack = if opts.case_sensitive {
matches.push(Match { summary.as_str()
field: "summary".into(), } else {
snippet: safe_snippet(summary, term, opts.case_sensitive), summary_lc.as_deref().unwrap_or("")
}); };
term_matched = true; if haystack.contains(lc_term.as_str()) {
raw_score += WEIGHT_SUMMARY;
matches.push(Match {
field: "summary".into(),
snippet: safe_snippet(summary, term, opts.case_sensitive),
});
term_matched = true;
}
} }
if opts.search_descriptions if opts.search_descriptions
&& let Some(ref desc) = ep.description && let Some(ref desc) = ep.description
&& contains_term(desc, term, opts.case_sensitive)
{ {
raw_score += WEIGHT_DESCRIPTION; let haystack = if opts.case_sensitive {
matches.push(Match { desc.as_str()
field: "description".into(), } else {
snippet: safe_snippet(desc, term, opts.case_sensitive), desc_lc.as_deref().unwrap_or("")
}); };
term_matched = true; if haystack.contains(lc_term.as_str()) {
raw_score += WEIGHT_DESCRIPTION;
matches.push(Match {
field: "description".into(),
snippet: safe_snippet(desc, term, opts.case_sensitive),
});
term_matched = true;
}
} }
if term_matched { if term_matched {
@@ -169,8 +213,20 @@ impl<'a> SearchEngine<'a> {
let mut matched_terms: usize = 0; let mut matched_terms: usize = 0;
let mut matches: Vec<Match> = Vec::new(); let mut matches: Vec<Match> = Vec::new();
for term in &terms { let name_lc = if !opts.case_sensitive {
if contains_term(&schema.name, term, opts.case_sensitive) { Some(schema.name.to_lowercase())
} else {
None
};
for (i, term) in terms.iter().enumerate() {
let lc_term = &lowered_terms[i];
let haystack = if opts.case_sensitive {
&schema.name
} else {
name_lc.as_ref().unwrap()
};
if haystack.contains(lc_term.as_str()) {
raw_score += WEIGHT_SCHEMA_NAME; raw_score += WEIGHT_SCHEMA_NAME;
matches.push(Match { matches.push(Match {
field: "schema_name".into(), field: "schema_name".into(),
@@ -233,35 +289,67 @@ fn tokenize(query: &str, exact: bool) -> Vec<String> {
} }
} }
fn contains_term(haystack: &str, needle: &str, case_sensitive: bool) -> bool {
if case_sensitive {
haystack.contains(needle)
} else {
let h = haystack.to_lowercase();
let n = needle.to_lowercase();
h.contains(&n)
}
}
/// Build a Unicode-safe snippet around the first occurrence of `needle` in /// Build a Unicode-safe snippet around the first occurrence of `needle` in
/// `haystack`. The context window is 50 characters. Ellipses are added when /// `haystack`. The context window is 50 characters. Ellipses are added when
/// the snippet is truncated. /// the snippet is truncated.
fn safe_snippet(haystack: &str, needle: &str, case_sensitive: bool) -> String { fn safe_snippet(haystack: &str, needle: &str, case_sensitive: bool) -> String {
let (h_search, n_search) = if case_sensitive { // Find the match position using char-based search to avoid byte-position
(haystack.to_string(), needle.to_string()) // mismatches between the original and lowercased strings (which can differ
} else { // in byte length for certain Unicode characters, causing panics).
(haystack.to_lowercase(), needle.to_lowercase())
};
let byte_pos = match h_search.find(&n_search) {
Some(pos) => pos,
None => return haystack.chars().take(50).collect(),
};
// Convert byte position to char index.
let char_start = haystack[..byte_pos].chars().count();
let needle_char_len = needle.chars().count();
let haystack_chars: Vec<char> = haystack.chars().collect(); let haystack_chars: Vec<char> = haystack.chars().collect();
let needle_chars: Vec<char> = if case_sensitive {
needle.chars().collect()
} else {
needle.chars().flat_map(char::to_lowercase).collect()
};
let char_start = if needle_chars.is_empty() {
0
} else {
let mut found = None;
let search_chars: Vec<char> = if case_sensitive {
haystack_chars.clone()
} else {
haystack_chars
.iter()
.flat_map(|c| c.to_lowercase())
.collect()
};
// Scan through search_chars for the needle
'outer: for i in 0..search_chars.len().saturating_sub(needle_chars.len() - 1) {
for (j, nc) in needle_chars.iter().enumerate() {
if search_chars[i + j] != *nc {
continue 'outer;
}
}
// Map position in search_chars back to position in haystack_chars.
// When case-insensitive, lowercasing can expand characters (e.g.
// U+0130 -> 'i' + U+0307), so we need to walk both iterators in
// parallel to find the corresponding haystack_chars index.
if case_sensitive {
found = Some(i);
} else {
let mut search_idx = 0;
for (hay_idx, hay_char) in haystack_chars.iter().enumerate() {
if search_idx >= i {
found = Some(hay_idx);
break;
}
search_idx += hay_char.to_lowercase().count();
}
if found.is_none() && search_idx >= i {
found = Some(haystack_chars.len());
}
}
break;
}
match found {
Some(pos) => pos,
None => return haystack_chars.iter().take(50).collect(),
}
};
let needle_char_len = needle.chars().count();
let total_chars = haystack_chars.len(); let total_chars = haystack_chars.len();
const WINDOW: usize = 50; const WINDOW: usize = 50;