588 lines
19 KiB
Rust
588 lines
19 KiB
Rust
//! Terminal safety: sanitize untrusted text, URL policy, credential redaction.
|
|
//!
|
|
//! GitLab content can contain ANSI escapes, bidi overrides, OSC hyperlinks,
|
|
//! and C1 control codes that could corrupt terminal rendering. This module
|
|
//! strips dangerous sequences while preserving a safe SGR subset for readability.
|
|
|
|
use std::fmt::Write;
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// UrlPolicy
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Policy applied to OSC 8 hyperlinks found in the text being sanitized.
///
/// The default is [`UrlPolicy::Strip`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum UrlPolicy {
    /// Drop the hyperlink escape sequences and keep only the visible link text.
    #[default]
    Strip,
    /// Replace each hyperlink with `text [n]` and append a numbered URL list
    /// at the end of the output.
    Footnote,
    /// Emit hyperlinks exactly as they appeared in the input.
    ///
    /// Intended for trusted content only.
    Passthrough,
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// RedactPattern
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Common patterns for PII/secret redaction.
|
|
#[derive(Debug, Clone)]
|
|
pub struct RedactPattern {
|
|
patterns: Vec<regex::Regex>,
|
|
}
|
|
|
|
impl RedactPattern {
|
|
/// Create a default set of redaction patterns (tokens, emails, etc.).
|
|
#[must_use]
|
|
pub fn defaults() -> Self {
|
|
let patterns = vec![
|
|
// GitLab personal access tokens
|
|
regex::Regex::new(r"glpat-[A-Za-z0-9_\-]{20,}").expect("valid regex"),
|
|
// Generic bearer/API tokens (long hex or base64-ish strings after common prefixes)
|
|
regex::Regex::new(r"(?i)(token|bearer|api[_-]?key)[\s:=]+\S{8,}").expect("valid regex"),
|
|
// Email addresses
|
|
regex::Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
|
|
.expect("valid regex"),
|
|
];
|
|
Self { patterns }
|
|
}
|
|
|
|
/// Apply all redaction patterns to the input string.
|
|
#[must_use]
|
|
pub fn redact(&self, input: &str) -> String {
|
|
let mut result = input.to_string();
|
|
for pattern in &self.patterns {
|
|
result = pattern.replace_all(&result, "[REDACTED]").into_owned();
|
|
}
|
|
result
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// sanitize_for_terminal
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Sanitize untrusted text for safe terminal display.
|
|
///
|
|
/// - Strips C1 control codes (0x80-0x9F)
|
|
/// - Strips OSC sequences (ESC ] ... ST)
|
|
/// - Strips cursor movement CSI sequences (CSI n A/B/C/D/E/F/G/H/J/K)
|
|
/// - Strips bidi overrides (U+202A-U+202E, U+2066-U+2069)
|
|
/// - Preserves safe SGR subset (bold, italic, underline, reset, standard colors)
|
|
///
|
|
/// `url_policy` controls handling of OSC 8 hyperlinks.
|
|
#[must_use]
|
|
pub fn sanitize_for_terminal(input: &str, url_policy: UrlPolicy) -> String {
|
|
let mut output = String::with_capacity(input.len());
|
|
let mut footnotes: Vec<String> = Vec::new();
|
|
let chars: Vec<char> = input.chars().collect();
|
|
let len = chars.len();
|
|
let mut i = 0;
|
|
|
|
while i < len {
|
|
let ch = chars[i];
|
|
|
|
// --- Bidi overrides ---
|
|
if is_bidi_override(ch) {
|
|
i += 1;
|
|
continue;
|
|
}
|
|
|
|
// --- C1 control codes (U+0080-U+009F) ---
|
|
if ('\u{0080}'..='\u{009F}').contains(&ch) {
|
|
i += 1;
|
|
continue;
|
|
}
|
|
|
|
// --- C0 control codes except tab, newline, carriage return ---
|
|
if ch.is_ascii_control() && ch != '\t' && ch != '\n' && ch != '\r' && ch != '\x1B' {
|
|
i += 1;
|
|
continue;
|
|
}
|
|
|
|
// --- ESC sequences ---
|
|
if ch == '\x1B' {
|
|
if i + 1 < len {
|
|
match chars[i + 1] {
|
|
// CSI sequence: ESC [
|
|
'[' => {
|
|
let (consumed, safe_seq) = parse_csi(&chars, i);
|
|
if let Some(seq) = safe_seq {
|
|
output.push_str(&seq);
|
|
}
|
|
i += consumed;
|
|
continue;
|
|
}
|
|
// OSC sequence: ESC ]
|
|
']' => {
|
|
let (consumed, link_text, link_url) = parse_osc(&chars, i);
|
|
match url_policy {
|
|
UrlPolicy::Strip => {
|
|
if let Some(text) = link_text {
|
|
output.push_str(&text);
|
|
}
|
|
}
|
|
UrlPolicy::Footnote => {
|
|
if let (Some(text), Some(url)) = (link_text, link_url) {
|
|
footnotes.push(url);
|
|
let _ = write!(output, "{text} [{n}]", n = footnotes.len());
|
|
}
|
|
}
|
|
UrlPolicy::Passthrough => {
|
|
// Reproduce the raw OSC sequence
|
|
for &ch_raw in &chars[i..len.min(i + consumed)] {
|
|
output.push(ch_raw);
|
|
}
|
|
}
|
|
}
|
|
i += consumed;
|
|
continue;
|
|
}
|
|
_ => {
|
|
// Unknown ESC sequence — skip ESC + next char
|
|
i += 2;
|
|
continue;
|
|
}
|
|
}
|
|
} else {
|
|
// Trailing ESC at end of input
|
|
i += 1;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// --- Normal character ---
|
|
output.push(ch);
|
|
i += 1;
|
|
}
|
|
|
|
// Append footnotes if any
|
|
if !footnotes.is_empty() {
|
|
output.push('\n');
|
|
for (idx, url) in footnotes.iter().enumerate() {
|
|
let _ = write!(output, "\n[{}] {url}", idx + 1);
|
|
}
|
|
}
|
|
|
|
output
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Bidi check
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Report whether `ch` is a bidirectional embedding, override or isolate
/// control: U+202A-U+202E (LRE, RLE, PDF, LRO, RLO) or U+2066-U+2069
/// (LRI, RLI, FSI, PDI).
fn is_bidi_override(ch: char) -> bool {
    matches!(ch, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}')
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// CSI parser
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Parse a CSI sequence starting at `chars[start]` (which should be ESC).
|
|
///
|
|
/// Returns `(chars_consumed, Option<safe_sequence_string>)`.
|
|
/// If the CSI is a safe SGR, returns the full sequence string to preserve.
|
|
/// Otherwise returns None (strip it).
|
|
fn parse_csi(chars: &[char], start: usize) -> (usize, Option<String>) {
|
|
// Minimum: ESC [ <final_byte>
|
|
debug_assert!(chars[start] == '\x1B');
|
|
debug_assert!(start + 1 < chars.len() && chars[start + 1] == '[');
|
|
|
|
let mut i = start + 2; // skip ESC [
|
|
let len = chars.len();
|
|
|
|
// Collect parameter bytes (0x30-0x3F) and intermediate bytes (0x20-0x2F)
|
|
let param_start = i;
|
|
while i < len && (chars[i] as u32) >= 0x20 && (chars[i] as u32) <= 0x3F {
|
|
i += 1;
|
|
}
|
|
|
|
// Collect intermediate bytes
|
|
while i < len && (chars[i] as u32) >= 0x20 && (chars[i] as u32) <= 0x2F {
|
|
i += 1;
|
|
}
|
|
|
|
// Final byte (0x40-0x7E)
|
|
if i >= len || (chars[i] as u32) < 0x40 || (chars[i] as u32) > 0x7E {
|
|
// Malformed — consume what we've seen and strip
|
|
return (i.saturating_sub(start).max(2), None);
|
|
}
|
|
|
|
let final_byte = chars[i];
|
|
let consumed = i + 1 - start;
|
|
|
|
// Only preserve SGR sequences (final byte 'm')
|
|
if final_byte == 'm' {
|
|
let param_str: String = chars[param_start..i].iter().collect();
|
|
if is_safe_sgr(¶m_str) {
|
|
let full_seq: String = chars[start..start + consumed].iter().collect();
|
|
return (consumed, Some(full_seq));
|
|
}
|
|
}
|
|
|
|
// Anything else (cursor movement A-H, erase J/K, etc.) is stripped
|
|
(consumed, None)
|
|
}
|
|
|
|
/// Check if all SGR parameters in a sequence are in the safe subset.
///
/// `params` is the text between `ESC [` and the final `m`. It must consist of
/// `;`-separated decimal numbers; an empty parameter (including an entirely
/// empty string, i.e. `ESC[m`) counts as 0 (reset).
///
/// Anything that is not a plain run of ASCII digits is rejected. This matters
/// because the caller's scan also admits intermediate bytes (0x20-0x2F) such as
/// space or `+`, and a CSI sequence carrying intermediate bytes is not SGR.
/// Integer parsing alone would accept a leading `+`, so digits are checked
/// explicitly.
///
/// Safe: 0 (reset), 1 (bold), 3 (italic), 4 (underline), 22 (normal intensity),
/// 23 (not italic), 24 (not underline), 39 (default fg), 49 (default bg),
/// 30-37 (standard fg), 40-47 (standard bg), 90-97 (bright fg), 100-107 (bright bg).
fn is_safe_sgr(params: &str) -> bool {
    params
        .split(';')
        // An omitted parameter is treated as 0, which is always safe.
        .filter(|param| !param.is_empty())
        .all(|param| {
            param.bytes().all(|b| b.is_ascii_digit())
                // Values too large for u32 fail to parse and are rejected.
                && matches!(param.parse::<u32>(), Ok(code) if is_safe_sgr_code(code))
        })
}

/// Allow-list of individual SGR codes that only affect text styling/colors.
fn is_safe_sgr_code(n: u32) -> bool {
    matches!(
        n,
        0 // reset
            | 1 // bold
            | 3 // italic
            | 4 // underline
            | 22 // normal intensity (turn off bold)
            | 23 // not italic
            | 24 // not underline
            | 39 // default foreground
            | 49 // default background
            | 30..=37 // standard foreground colors
            | 40..=47 // standard background colors
            | 90..=97 // bright foreground colors
            | 100..=107 // bright background colors
    )
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// OSC parser
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Parse an OSC sequence starting at `chars[start]` (ESC ]).
|
|
///
|
|
/// Returns `(chars_consumed, link_text, link_url)`.
|
|
/// For OSC 8 hyperlinks: `ESC ] 8 ; params ; url ST text ESC ] 8 ; ; ST`
|
|
/// For other OSC: consumed without extracting link data.
|
|
fn parse_osc(chars: &[char], start: usize) -> (usize, Option<String>, Option<String>) {
|
|
debug_assert!(chars[start] == '\x1B');
|
|
debug_assert!(start + 1 < chars.len() && chars[start + 1] == ']');
|
|
|
|
let len = chars.len();
|
|
let i = start + 2; // skip ESC ]
|
|
|
|
// Find ST (String Terminator): ESC \ or BEL (0x07)
|
|
let osc_end = find_st(chars, i);
|
|
|
|
// Check if this is OSC 8 (hyperlink)
|
|
if i < len && chars[i] == '8' && i + 1 < len && chars[i + 1] == ';' {
|
|
// OSC 8 hyperlink: ESC ] 8 ; params ; url ST ... ESC ] 8 ; ; ST
|
|
let osc_content: String = chars[i..osc_end.0].iter().collect();
|
|
let first_consumed = osc_end.1;
|
|
|
|
// Extract URL from "8;params;url"
|
|
let url = extract_osc8_url(&osc_content);
|
|
|
|
// Now find the link text (between first ST and second OSC 8)
|
|
let after_first_st = start + 2 + first_consumed;
|
|
let mut text = String::new();
|
|
let mut j = after_first_st;
|
|
|
|
// Collect text until we hit the closing OSC 8 or end of input
|
|
while j < len {
|
|
if j + 1 < len && chars[j] == '\x1B' && chars[j + 1] == ']' {
|
|
// Found another OSC — this should be the closing OSC 8
|
|
let close_end = find_st(chars, j + 2);
|
|
return (
|
|
j + close_end.1 - start + 2,
|
|
Some(text),
|
|
url.map(String::from),
|
|
);
|
|
}
|
|
text.push(chars[j]);
|
|
j += 1;
|
|
}
|
|
|
|
// Reached end without closing OSC 8
|
|
return (j - start, Some(text), url.map(String::from));
|
|
}
|
|
|
|
// Non-OSC-8: just consume and strip
|
|
(osc_end.1 + (start + 2 - start), None, None)
|
|
}
|
|
|
|
/// Locate the String Terminator (ST) of an OSC body that starts at `from`.
///
/// ST is either BEL (0x07, one char) or `ESC \` (two chars).
///
/// Returns `(content_end_index, total_consumed_from_content_start)`: the index
/// of the first terminator char, and the number of chars from `from` through
/// the end of the terminator. When no terminator exists, the content is taken
/// to run to the end of `chars`.
fn find_st(chars: &[char], from: usize) -> (usize, usize) {
    let total = chars.len();

    for idx in from..total {
        match chars[idx] {
            '\x07' => return (idx, idx - from + 1),
            '\x1B' if chars.get(idx + 1) == Some(&'\\') => return (idx, idx - from + 2),
            _ => {}
        }
    }

    // Unterminated — consume everything
    (total, total - from)
}
|
|
|
|
/// Pull the URL out of an OSC 8 body of the form `8;params;url`.
///
/// Returns `None` when the `8;` prefix is missing, when there is no `;`
/// separating the params from the URL, or when the URL is empty. Only the
/// first `;` after the prefix separates params from the URL, so the URL itself
/// may contain `;`.
fn extract_osc8_url(content: &str) -> Option<&str> {
    let (_params, url) = content.strip_prefix("8;")?.split_once(';')?;
    (!url.is_empty()).then_some(url)
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Tests
|
|
// ---------------------------------------------------------------------------
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Sanitize `raw` with the `Strip` hyperlink policy.
    fn strip(raw: &str) -> String {
        sanitize_for_terminal(raw, UrlPolicy::Strip)
    }

    /// A hyperlink to example.com with link text "here", BEL-terminated.
    const LINKED: &str = "click \x1B]8;;https://example.com\x07here\x1B]8;;\x07 done";

    // --- CSI / cursor movement ---

    #[test]
    fn test_strips_cursor_movement() {
        // CSI 5A moves the cursor up five rows.
        assert_eq!(strip("before\x1B[5Aafter"), "beforeafter");
    }

    #[test]
    fn test_strips_cursor_movement_all_directions() {
        for dir in "ABCDEFGH".chars() {
            let sanitized = strip(&format!("x\x1B[3{dir}y"));
            assert_eq!(sanitized, "xy", "failed for direction {dir}");
        }
    }

    #[test]
    fn test_strips_erase_sequences() {
        // CSI 2J erases the display.
        assert_eq!(strip("before\x1B[2Jafter"), "beforeafter");
    }

    // --- SGR preservation ---

    #[test]
    fn test_preserves_bold_italic_underline_reset() {
        let styled = "\x1B[1mbold\x1B[0m \x1B[3mitalic\x1B[0m \x1B[4munderline\x1B[0m";
        assert_eq!(strip(styled), styled);
    }

    #[test]
    fn test_preserves_standard_colors() {
        // Red foreground followed by green background.
        let styled = "\x1B[31mred\x1B[42m on green\x1B[0m";
        assert_eq!(strip(styled), styled);
    }

    #[test]
    fn test_preserves_bright_colors() {
        let styled = "\x1B[91mbright red\x1B[0m";
        assert_eq!(strip(styled), styled);
    }

    #[test]
    fn test_preserves_combined_safe_sgr() {
        // Bold and red foreground in a single sequence.
        let styled = "\x1B[1;31mbold red\x1B[0m";
        assert_eq!(strip(styled), styled);
    }

    #[test]
    fn test_strips_unsafe_sgr() {
        // SGR 8 (hidden text) is not allow-listed; the trailing reset is.
        assert_eq!(strip("\x1B[8mhidden\x1B[0m"), "hidden\x1B[0m");
    }

    // --- C1 control codes ---

    #[test]
    fn test_strips_c1_control_codes() {
        // U+008D is Reverse Index, U+009B is the 8-bit CSI.
        let raw = format!("before{}middle{}after", '\u{008D}', '\u{009B}');
        assert_eq!(strip(&raw), "beforemiddleafter");
    }

    // --- Bidi overrides ---

    #[test]
    fn test_strips_bidi_overrides() {
        let rlo = '\u{202E}';
        let pdf = '\u{202C}';
        let raw = format!("normal{rlo}reversed{pdf}end");
        assert_eq!(strip(&raw), "normalreversedend");
    }

    #[test]
    fn test_strips_all_bidi_chars() {
        let controls = ('\u{202A}'..='\u{202E}').chain('\u{2066}'..='\u{2069}');
        for ch in controls {
            let sanitized = strip(&format!("a{ch}b"));
            assert_eq!(sanitized, "ab", "failed for U+{:04X}", ch as u32);
        }
    }

    // --- OSC sequences ---

    #[test]
    fn test_strips_osc_sequences() {
        // OSC 0 sets the window title: ESC ] 0 ; title BEL
        assert_eq!(strip("before\x1B]0;My Title\x07after"), "beforeafter");
    }

    // --- OSC 8 hyperlinks ---

    #[test]
    fn test_url_policy_strip() {
        // ESC]8;;url ST text ESC]8;; ST keeps only the text.
        assert_eq!(strip(LINKED), "click here done");
    }

    #[test]
    fn test_url_policy_footnote() {
        let rendered = sanitize_for_terminal(LINKED, UrlPolicy::Footnote);
        assert!(rendered.contains("here [1]"));
        assert!(rendered.contains("[1] https://example.com"));
    }

    // --- Redaction ---

    #[test]
    fn test_redact_gitlab_token() {
        let masked = RedactPattern::defaults().redact("My token is glpat-AbCdEfGhIjKlMnOpQrStUvWx");
        assert_eq!(masked, "My token is [REDACTED]");
    }

    #[test]
    fn test_redact_email() {
        let masked = RedactPattern::defaults().redact("Contact user@example.com for details");
        assert_eq!(masked, "Contact [REDACTED] for details");
    }

    #[test]
    fn test_redact_bearer_token() {
        let masked =
            RedactPattern::defaults().redact("Authorization: Bearer eyJhbGciOiJSUzI1NiIsInR5cCI");
        assert!(masked.contains("[REDACTED]"));
        assert!(!masked.contains("eyJ"));
    }

    // --- Edge cases ---

    #[test]
    fn test_empty_input() {
        assert_eq!(strip(""), "");
    }

    #[test]
    fn test_safe_content_passthrough() {
        let plain = "Hello, world! This is normal text.\nWith newlines\tand tabs.";
        assert_eq!(strip(plain), plain);
    }

    #[test]
    fn test_trailing_esc() {
        assert_eq!(strip("text\x1B"), "text");
    }

    #[test]
    fn test_malformed_csi_does_not_eat_text() {
        // In "ESC [ b" the 'b' (0x62) lies in the final-byte range 0x40-0x7E,
        // so the three chars form a complete CSI sequence. It is not SGR and
        // is therefore stripped as a whole; nothing after it is consumed.
        assert_eq!(strip("a\x1B[b"), "a");
    }

    #[test]
    fn test_utf8_adjacent_to_escapes() {
        let sanitized = strip("\x1B[1m日本語\x1B[0m text");
        assert_eq!(sanitized, "\x1B[1m日本語\x1B[0m text");
    }

    #[test]
    fn test_fuzz_no_panic() {
        // 1000 deterministic pseudo-random byte strings; none may panic.
        for seed in 0u16..1000 {
            let bytes: Vec<u8> = (0u16..50)
                .map(|offset| (seed.wrapping_mul(31).wrapping_add(offset) & 0xFF) as u8)
                .collect();
            // Best-effort UTF-8
            let lossy = String::from_utf8_lossy(&bytes);
            let _ = sanitize_for_terminal(&lossy, UrlPolicy::Strip);
        }
    }
}
|