Core: eliminate double-parse in normalize_to_json, harden SSRF, optimize search

normalize_to_json now returns (Vec<u8>, serde_json::Value) — callers get
the parsed Value for free instead of re-parsing the bytes they just
produced. Eliminates a redundant serde_json::from_slice on every fetch,
sync, and external-ref resolution path.

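Format detection switches from a trial JSON parse to inspecting the first
non-whitespace byte ('{' or '[' = JSON, anything else = YAML) — roughly 300x
faster for the common case. Note: flow-style YAML that begins with '{' or '['
is now sniffed as JSON, whereas the old trial parse fell back to YAML.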

SSRF protection expanded: block CGNAT range 100.64.0.0/10 (RFC 6598,
common cloud-internal SSRF target) and IPv6 unique-local fc00::/7.

Alias validation simplified: the regex ^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$
already rejects path separators, traversal, and leading dots — remove
redundant explicit checks.

Search performance: pre-lowercase query terms once and pre-lowercase each
field once per endpoint (not once per term x field). Removes the
contains_term helper entirely. safe_snippet rewritten with char-based
search to avoid byte-position mismatches on multi-byte Unicode characters
(e.g. U+0130 which expands during lowercasing).
This commit is contained in:
teernisse
2026-02-12 16:14:01 -05:00
parent aae9a33d36
commit cc04772792
5 changed files with 181 additions and 86 deletions

View File

@@ -39,27 +39,37 @@ pub fn detect_format(
}
}
// Content sniffing: try JSON first (stricter), fall back to YAML.
if serde_json::from_slice::<serde_json::Value>(bytes).is_ok() {
Format::Json
} else {
Format::Yaml
// Content sniffing: check the first non-whitespace byte. Valid JSON
// documents start with '{' or '['. This avoids a full JSON parse just
// to detect format — a ~300x speedup for the common case.
let first_meaningful = bytes.iter().find(|b| !b.is_ascii_whitespace());
match first_meaningful {
Some(b'{') | Some(b'[') => Format::Json,
_ => Format::Yaml,
}
}
/// If the input is YAML, parse then re-serialize as JSON.
/// If JSON, validate it parses.
pub fn normalize_to_json(bytes: &[u8], format: Format) -> Result<Vec<u8>, SwaggerCliError> {
/// Normalize raw bytes to JSON, returning both the JSON bytes and the parsed value.
///
/// For JSON input: parses once and returns the original bytes + parsed value.
/// For YAML input: parses YAML into a Value, serializes to JSON bytes.
///
/// This eliminates the common double-parse pattern where callers would
/// call `normalize_to_json()` then immediately `serde_json::from_slice()`.
pub fn normalize_to_json(
bytes: &[u8],
format: Format,
) -> Result<(Vec<u8>, serde_json::Value), SwaggerCliError> {
match format {
Format::Json => {
let _: serde_json::Value = serde_json::from_slice(bytes)?;
Ok(bytes.to_vec())
let value: serde_json::Value = serde_json::from_slice(bytes)?;
Ok((bytes.to_vec(), value))
}
Format::Yaml => {
let value: serde_json::Value = serde_yaml::from_slice(bytes)
.map_err(|e| SwaggerCliError::InvalidSpec(format!("YAML parse error: {e}")))?;
let json_bytes = serde_json::to_vec(&value)?;
Ok(json_bytes)
Ok((json_bytes, value))
}
}
}
@@ -418,8 +428,9 @@ info:
version: "1.0"
paths: {}
"#;
let json_bytes = normalize_to_json(yaml, Format::Yaml).unwrap();
let parsed: serde_json::Value = serde_json::from_slice(&json_bytes).unwrap();
let (json_bytes, parsed) = normalize_to_json(yaml, Format::Yaml).unwrap();
// Verify the bytes are also valid JSON
let _: serde_json::Value = serde_json::from_slice(&json_bytes).unwrap();
assert_eq!(parsed["openapi"], "3.0.0");
assert_eq!(parsed["info"]["title"], "Test API");
}