Files
gitlore/crates/lore-tui/src/safety.rs
2026-02-18 12:47:10 -05:00

588 lines
19 KiB
Rust

//! Terminal safety: sanitize untrusted text, URL policy, credential redaction.
//!
//! GitLab content can contain ANSI escapes, bidi overrides, OSC hyperlinks,
//! and C1 control codes that could corrupt terminal rendering. This module
//! strips dangerous sequences while preserving a safe SGR subset for readability.
use std::fmt::Write;
// ---------------------------------------------------------------------------
// UrlPolicy
// ---------------------------------------------------------------------------
/// Policy for OSC 8 hyperlinks found in untrusted input.
///
/// The default is [`UrlPolicy::Strip`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum UrlPolicy {
/// Remove the OSC 8 escape sequences entirely, keeping only the link text.
#[default]
Strip,
/// Convert hyperlinks to numbered footnotes: the link becomes `text [1]`
/// and the list of URLs is appended after the sanitized text.
Footnote,
/// Re-emit hyperlink escape sequences as they appear in the input.
/// Only appropriate for trusted content.
Passthrough,
}
// ---------------------------------------------------------------------------
// RedactPattern
// ---------------------------------------------------------------------------
/// A set of regular expressions whose matches are replaced by `[REDACTED]`.
#[derive(Debug, Clone)]
pub struct RedactPattern {
    patterns: Vec<regex::Regex>,
}

impl RedactPattern {
    /// Build the built-in pattern set: GitLab personal access tokens,
    /// generic `token` / `bearer` / `api key` credentials, and email addresses.
    ///
    /// The patterns are applied by [`RedactPattern::redact`] in this order.
    #[must_use]
    pub fn defaults() -> Self {
        const SOURCES: [&str; 3] = [
            // GitLab personal access tokens
            r"glpat-[A-Za-z0-9_\-]{20,}",
            // Keyword (token / bearer / api key) followed by a separator and
            // at least eight non-whitespace characters
            r"(?i)(token|bearer|api[_-]?key)[\s:=]+\S{8,}",
            // Email addresses
            r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}",
        ];
        let patterns = SOURCES
            .iter()
            .map(|source| regex::Regex::new(source).expect("valid regex"))
            .collect();
        Self { patterns }
    }

    /// Return a copy of `input` with every match of every pattern replaced
    /// by `[REDACTED]`. Each pattern runs over the output of the previous one.
    #[must_use]
    pub fn redact(&self, input: &str) -> String {
        self.patterns.iter().fold(input.to_owned(), |text, pattern| {
            pattern.replace_all(&text, "[REDACTED]").into_owned()
        })
    }
}
// ---------------------------------------------------------------------------
// sanitize_for_terminal
// ---------------------------------------------------------------------------
/// Sanitize untrusted text for safe terminal display.
///
/// - Strips C1 control codes (0x80-0x9F)
/// - Strips OSC sequences (ESC ] ... ST)
/// - Strips cursor movement CSI sequences (CSI n A/B/C/D/E/F/G/H/J/K)
/// - Strips bidi overrides (U+202A-U+202E, U+2066-U+2069)
/// - Preserves safe SGR subset (bold, italic, underline, reset, standard colors)
///
/// `url_policy` controls handling of OSC 8 hyperlinks.
#[must_use]
pub fn sanitize_for_terminal(input: &str, url_policy: UrlPolicy) -> String {
let mut output = String::with_capacity(input.len());
let mut footnotes: Vec<String> = Vec::new();
let chars: Vec<char> = input.chars().collect();
let len = chars.len();
let mut i = 0;
while i < len {
let ch = chars[i];
// --- Bidi overrides ---
if is_bidi_override(ch) {
i += 1;
continue;
}
// --- C1 control codes (U+0080-U+009F) ---
if ('\u{0080}'..='\u{009F}').contains(&ch) {
i += 1;
continue;
}
// --- C0 control codes except tab, newline, carriage return ---
if ch.is_ascii_control() && ch != '\t' && ch != '\n' && ch != '\r' && ch != '\x1B' {
i += 1;
continue;
}
// --- ESC sequences ---
if ch == '\x1B' {
if i + 1 < len {
match chars[i + 1] {
// CSI sequence: ESC [
'[' => {
let (consumed, safe_seq) = parse_csi(&chars, i);
if let Some(seq) = safe_seq {
output.push_str(&seq);
}
i += consumed;
continue;
}
// OSC sequence: ESC ]
']' => {
let (consumed, link_text, link_url) = parse_osc(&chars, i);
match url_policy {
UrlPolicy::Strip => {
if let Some(text) = link_text {
output.push_str(&text);
}
}
UrlPolicy::Footnote => {
if let (Some(text), Some(url)) = (link_text, link_url) {
footnotes.push(url);
let _ = write!(output, "{text} [{n}]", n = footnotes.len());
}
}
UrlPolicy::Passthrough => {
// Reproduce the raw OSC sequence
for &ch_raw in &chars[i..len.min(i + consumed)] {
output.push(ch_raw);
}
}
}
i += consumed;
continue;
}
_ => {
// Unknown ESC sequence — skip ESC + next char
i += 2;
continue;
}
}
} else {
// Trailing ESC at end of input
i += 1;
continue;
}
}
// --- Normal character ---
output.push(ch);
i += 1;
}
// Append footnotes if any
if !footnotes.is_empty() {
output.push('\n');
for (idx, url) in footnotes.iter().enumerate() {
let _ = write!(output, "\n[{}] {url}", idx + 1);
}
}
output
}
// ---------------------------------------------------------------------------
// Bidi check
// ---------------------------------------------------------------------------
/// Report whether `ch` is one of the Unicode bidirectional formatting
/// controls that get stripped from untrusted text.
fn is_bidi_override(ch: char) -> bool {
    match ch {
        // LRE, RLE, PDF, LRO, RLO
        '\u{202A}'..='\u{202E}' => true,
        // LRI, RLI, FSI, PDI
        '\u{2066}'..='\u{2069}' => true,
        _ => false,
    }
}
// ---------------------------------------------------------------------------
// CSI parser
// ---------------------------------------------------------------------------
/// Parse a CSI sequence starting at `chars[start]` (which should be ESC,
/// followed by `[`).
///
/// Returns `(chars_consumed, Option<safe_sequence_string>)`.
///
/// - A well-formed SGR sequence (final byte `m`, no intermediate bytes) whose
///   parameters all pass `is_safe_sgr` is returned verbatim so the caller can
///   keep it.
/// - Any other complete CSI sequence (cursor movement, erase, private or
///   intermediate-qualified functions, unsafe SGR) is consumed and `None` is
///   returned.
/// - If no final byte (0x40-0x7E) follows the parameter/intermediate bytes,
///   only `ESC [` plus those bytes are consumed and `None` is returned; the
///   offending character is left for the caller.
fn parse_csi(chars: &[char], start: usize) -> (usize, Option<String>) {
    // Minimum: ESC [ <final_byte>
    debug_assert!(chars[start] == '\x1B');
    debug_assert!(start + 1 < chars.len() && chars[start + 1] == '[');
    let len = chars.len();
    let param_start = start + 2; // skip ESC [

    // Scan parameter bytes (0x30-0x3F) and intermediate bytes (0x20-0x2F) in
    // a single pass so that even out-of-order mixes are swallowed as one unit.
    let mut i = param_start;
    while i < len && matches!(chars[i] as u32, 0x20..=0x3F) {
        i += 1;
    }

    // Final byte (0x40-0x7E)
    let final_byte = match chars.get(i) {
        Some(&c) if matches!(c as u32, 0x40..=0x7E) => c,
        // Malformed — consume what we've seen and strip
        _ => return (i.saturating_sub(start).max(2), None),
    };
    let consumed = i + 1 - start;

    // An intermediate byte selects a different control function, even when
    // the final byte is `m`, so such a sequence is never treated as SGR.
    // Without this check `ESC [ + 1 m` or `ESC [ 1 SP m` would be preserved.
    let body = &chars[param_start..i];
    let has_intermediate = body.iter().any(|&c| (c as u32) < 0x30);

    // Only preserve SGR sequences (final byte 'm') with safe parameters.
    if final_byte == 'm' && !has_intermediate {
        let param_str: String = body.iter().collect();
        if is_safe_sgr(&param_str) {
            let full_seq: String = chars[start..start + consumed].iter().collect();
            return (consumed, Some(full_seq));
        }
    }

    // Anything else (cursor movement A-H, erase J/K, etc.) is stripped
    (consumed, None)
}
/// Check if all SGR parameters in a sequence are in the safe subset.
///
/// `params` is the text between `ESC [` and the final `m`, i.e. zero or more
/// `;`-separated parameters. An empty parameter is treated as 0 (reset), so
/// the empty string (`ESC[m`) is safe.
///
/// Every non-empty parameter must consist of ASCII digits only and name a
/// code accepted by `is_safe_sgr_code`. Signs, whitespace, `:` sub-parameters
/// and values that do not fit in a `u32` make the whole sequence unsafe.
///
/// Safe: 0 (reset), 1 (bold), 3 (italic), 4 (underline), 22 (normal intensity),
/// 23 (not italic), 24 (not underline), 39 (default fg), 49 (default bg),
/// 30-37 (standard fg), 40-47 (standard bg), 90-97 (bright fg), 100-107 (bright bg).
fn is_safe_sgr(params: &str) -> bool {
    params.split(';').all(|param| {
        if param.is_empty() {
            return true; // treat empty as 0
        }
        // `str::parse::<u32>` on its own would also accept a leading `+`,
        // so insist on plain digits before parsing.
        param.bytes().all(|b| b.is_ascii_digit())
            && matches!(param.parse::<u32>(), Ok(n) if is_safe_sgr_code(n))
    })
}
/// Report whether a single SGR parameter value belongs to the allowed subset.
fn is_safe_sgr_code(n: u32) -> bool {
    match n {
        // reset, bold, italic, underline
        0 | 1 | 3 | 4 => true,
        // normal intensity, not italic, not underline
        22..=24 => true,
        // standard foreground colors and default foreground
        30..=37 | 39 => true,
        // standard background colors and default background
        40..=47 | 49 => true,
        // bright foreground and bright background colors
        90..=97 | 100..=107 => true,
        _ => false,
    }
}
// ---------------------------------------------------------------------------
// OSC parser
// ---------------------------------------------------------------------------
/// Parse an OSC sequence starting at `chars[start]` (ESC ]).
///
/// Returns `(chars_consumed, link_text, link_url)`.
///
/// - Non-OSC-8 sequence: consumed up to and including its terminator (or to
///   the end of input if unterminated); returns `(n, None, None)`.
/// - OSC 8 without a URL (the `ESC ] 8 ; ; ST` close marker, or a malformed
///   OSC 8): only that sequence is consumed; returns `(n, Some(""), None)`.
///   The text after it is ordinary text and is left for the caller.
/// - OSC 8 with a URL: `ESC ] 8 ; params ; url ST text [ESC ] 8 ; ; ST]`.
///   `link_text` is the raw, unsanitized text that follows, up to the next
///   OSC introducer or the end of input. If that next OSC is an OSC 8 close
///   marker it is consumed as part of the link; any other OSC (for example
///   the opener of the next link) is left in place for the caller to parse.
fn parse_osc(chars: &[char], start: usize) -> (usize, Option<String>, Option<String>) {
    debug_assert!(chars[start] == '\x1B');
    debug_assert!(start + 1 < chars.len() && chars[start + 1] == ']');
    let len = chars.len();
    let content_start = start + 2; // skip ESC ]

    // Find ST (String Terminator): ESC \ or BEL (0x07)
    let (content_end, first_consumed) = find_st(chars, content_start);
    let header_len = 2 + first_consumed;

    // Non-OSC-8: just consume and strip
    if !chars[content_start..].starts_with(&['8', ';']) {
        return (header_len, None, None);
    }

    // Extract URL from "8;params;url"
    let osc_content: String = chars[content_start..content_end].iter().collect();
    let url = match extract_osc8_url(&osc_content) {
        Some(url) => url.to_owned(),
        // No URL: this is a close marker, not an opener. It has no link text,
        // so it must not swallow whatever follows it.
        None => return (header_len, Some(String::new()), None),
    };

    // The link text runs from the end of the opener to the next OSC introducer.
    let text_start = content_start + first_consumed;
    let next_osc =
        (text_start..len).find(|&j| chars[j] == '\x1B' && chars.get(j + 1) == Some(&']'));
    let (text_end, link_end) = match next_osc {
        // Reached end without closing OSC 8
        None => (len, len),
        Some(j) => {
            let (next_end, next_consumed) = find_st(chars, j + 2);
            let next_content: String = chars[j + 2..next_end].iter().collect();
            let is_close =
                next_content.starts_with("8;") && extract_osc8_url(&next_content).is_none();
            // Swallow only a real close marker; a new opener implicitly ends
            // this link and has to be parsed on its own to keep its URL.
            (j, if is_close { j + 2 + next_consumed } else { j })
        }
    };

    let text: String = chars[text_start..text_end].iter().collect();
    (link_end - start, Some(text), Some(url))
}
/// Locate the String Terminator (ST) that ends an OSC payload beginning at
/// index `from`.
///
/// ST is either BEL (0x07, one char) or ESC \ (two chars).
/// Returns `(terminator_index, chars_from_payload_start_through_terminator)`.
/// When no terminator exists the payload is taken to run to the end of
/// `chars`: the result is `(chars.len(), chars.len() - from)`.
fn find_st(chars: &[char], from: usize) -> (usize, usize) {
    let total = chars.len();
    let terminator = (from..total).find_map(|pos| match chars[pos] {
        '\x07' => Some((pos, pos - from + 1)),
        '\x1B' if chars.get(pos + 1) == Some(&'\\') => Some((pos, pos - from + 2)),
        _ => None,
    });
    match terminator {
        Some(found) => found,
        // Unterminated — consume everything
        None => (total, total - from),
    }
}
/// Pull the URL out of an OSC 8 payload of the form `8;params;url`.
///
/// Returns `None` when the payload does not start with `8;`, has no `;`
/// after the params field, or carries an empty URL.
fn extract_osc8_url(content: &str) -> Option<&str> {
    // Everything after the first `;` that follows the params is the URL,
    // including any further `;` characters.
    let (_params, url) = content.strip_prefix("8;")?.split_once(';')?;
    (!url.is_empty()).then_some(url)
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
// Unit tests for `sanitize_for_terminal` and `RedactPattern`, grouped by the
// kind of input they exercise.
// --- CSI / cursor movement ---
#[test]
fn test_strips_cursor_movement() {
// CSI 5A = cursor up 5
let input = "before\x1B[5Aafter";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
assert_eq!(result, "beforeafter");
}
#[test]
fn test_strips_cursor_movement_all_directions() {
// Final bytes A-H are all cursor positioning functions.
for dir in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'] {
let input = format!("x\x1B[3{dir}y");
let result = sanitize_for_terminal(&input, UrlPolicy::Strip);
assert_eq!(result, "xy", "failed for direction {dir}");
}
}
#[test]
fn test_strips_erase_sequences() {
// CSI 2J = erase display
let input = "before\x1B[2Jafter";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
assert_eq!(result, "beforeafter");
}
// --- SGR preservation ---
#[test]
fn test_preserves_bold_italic_underline_reset() {
// SGR 1, 3, 4 and 0 are all in the safe subset, so the input is unchanged.
let input = "\x1B[1mbold\x1B[0m \x1B[3mitalic\x1B[0m \x1B[4munderline\x1B[0m";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
assert_eq!(result, input);
}
#[test]
fn test_preserves_standard_colors() {
// Red foreground, green background
let input = "\x1B[31mred\x1B[42m on green\x1B[0m";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
assert_eq!(result, input);
}
#[test]
fn test_preserves_bright_colors() {
// SGR 91 = bright red foreground
let input = "\x1B[91mbright red\x1B[0m";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
assert_eq!(result, input);
}
#[test]
fn test_preserves_combined_safe_sgr() {
// Bold + red foreground in one sequence
let input = "\x1B[1;31mbold red\x1B[0m";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
assert_eq!(result, input);
}
#[test]
fn test_strips_unsafe_sgr() {
// SGR 8 = hidden text (not in safe list)
let input = "\x1B[8mhidden\x1B[0m";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
// SGR 8 stripped, SGR 0 preserved
assert_eq!(result, "hidden\x1B[0m");
}
// --- C1 control codes ---
#[test]
fn test_strips_c1_control_codes() {
// U+008D = Reverse Index, U+009B = CSI (8-bit)
let input = format!("before{}middle{}after", '\u{008D}', '\u{009B}');
let result = sanitize_for_terminal(&input, UrlPolicy::Strip);
assert_eq!(result, "beforemiddleafter");
}
// --- Bidi overrides ---
#[test]
fn test_strips_bidi_overrides() {
let input = format!(
"normal{}reversed{}end",
'\u{202E}', // RLO
'\u{202C}' // PDF
);
let result = sanitize_for_terminal(&input, UrlPolicy::Strip);
assert_eq!(result, "normalreversedend");
}
#[test]
fn test_strips_all_bidi_chars() {
// Every code point that `is_bidi_override` recognizes.
let bidi_chars = [
'\u{202A}', '\u{202B}', '\u{202C}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}',
'\u{2068}', '\u{2069}',
];
for ch in bidi_chars {
let input = format!("a{ch}b");
let result = sanitize_for_terminal(&input, UrlPolicy::Strip);
assert_eq!(result, "ab", "failed for U+{:04X}", ch as u32);
}
}
// --- OSC sequences ---
#[test]
fn test_strips_osc_sequences() {
// OSC 0 (set title): ESC ] 0 ; title BEL
let input = "before\x1B]0;My Title\x07after";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
assert_eq!(result, "beforeafter");
}
// --- OSC 8 hyperlinks ---
#[test]
fn test_url_policy_strip() {
// OSC 8 hyperlink: ESC]8;;url ST text ESC]8;; ST
let input = "click \x1B]8;;https://example.com\x07here\x1B]8;;\x07 done";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
assert_eq!(result, "click here done");
}
#[test]
fn test_url_policy_footnote() {
// The link text gets a `[1]` marker and the URL is listed as footnote 1.
let input = "click \x1B]8;;https://example.com\x07here\x1B]8;;\x07 done";
let result = sanitize_for_terminal(input, UrlPolicy::Footnote);
assert!(result.contains("here [1]"));
assert!(result.contains("[1] https://example.com"));
}
// --- Redaction ---
#[test]
fn test_redact_gitlab_token() {
let redactor = RedactPattern::defaults();
let input = "My token is glpat-AbCdEfGhIjKlMnOpQrStUvWx";
let result = redactor.redact(input);
assert_eq!(result, "My token is [REDACTED]");
}
#[test]
fn test_redact_email() {
let redactor = RedactPattern::defaults();
let input = "Contact user@example.com for details";
let result = redactor.redact(input);
assert_eq!(result, "Contact [REDACTED] for details");
}
#[test]
fn test_redact_bearer_token() {
let redactor = RedactPattern::defaults();
let input = "Authorization: Bearer eyJhbGciOiJSUzI1NiIsInR5cCI";
let result = redactor.redact(input);
// Only checks that the secret is gone; the keyword itself is part of the
// match, so the exact output is not pinned here.
assert!(result.contains("[REDACTED]"));
assert!(!result.contains("eyJ"));
}
// --- Edge cases ---
#[test]
fn test_empty_input() {
assert_eq!(sanitize_for_terminal("", UrlPolicy::Strip), "");
}
#[test]
fn test_safe_content_passthrough() {
// Plain text with newline and tab must come back unchanged.
let input = "Hello, world! This is normal text.\nWith newlines\tand tabs.";
assert_eq!(sanitize_for_terminal(input, UrlPolicy::Strip), input);
}
#[test]
fn test_trailing_esc() {
// A lone ESC at the very end of the input is dropped.
let input = "text\x1B";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
assert_eq!(result, "text");
}
#[test]
fn test_malformed_csi_does_not_eat_text() {
// ESC [ immediately followed by a letter, with no parameters.
let input = "a\x1B[b";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
// Despite the test name, this input is not malformed: 'b' (0x62) lies in
// the CSI final-byte range 0x40-0x7E, so `ESC [ b` is a complete CSI
// sequence. It is not an SGR sequence, so all three characters are stripped.
assert_eq!(result, "a");
}
#[test]
fn test_utf8_adjacent_to_escapes() {
// Multi-byte characters next to SGR sequences must survive intact.
let input = "\x1B[1m日本語\x1B[0m text";
let result = sanitize_for_terminal(input, UrlPolicy::Strip);
assert_eq!(result, "\x1B[1m日本語\x1B[0m text");
}
#[test]
fn test_fuzz_no_panic() {
// 1000 deterministic 50-byte sequences derived from the seed — must not panic
for seed in 0u16..1000 {
let mut bytes = Vec::new();
for j in 0..50 {
bytes.push(((seed.wrapping_mul(31).wrapping_add(j)) & 0xFF) as u8);
}
// Best-effort UTF-8
let input = String::from_utf8_lossy(&bytes);
let _ = sanitize_for_terminal(&input, UrlPolicy::Strip);
}
}
}