// gitlore/crates/lore-tui/src/safety.rs

//! Terminal safety: sanitize untrusted text, URL policy, credential redaction.
//!
//! GitLab content can contain ANSI escapes, bidi overrides, OSC hyperlinks,
//! and C1 control codes that could corrupt terminal rendering. This module
//! strips dangerous sequences while preserving a safe SGR subset for readability.

use std::fmt::Write;

// ---------------------------------------------------------------------------
// UrlPolicy
// ---------------------------------------------------------------------------

/// Controls how OSC 8 hyperlinks in input are handled.
///
/// Passed to [`sanitize_for_terminal`]. The `Default` value is
/// [`UrlPolicy::Strip`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum UrlPolicy {
    /// Remove OSC 8 hyperlinks entirely, keeping only the link text.
    ///
    /// This is the default policy.
    #[default]
    Strip,
    /// Convert hyperlinks to numbered footnotes: `text [1]` with URL list appended.
    ///
    /// Footnote numbers start at 1 and the `[n] url` lines are appended after
    /// the sanitized text.
    Footnote,
    /// Pass hyperlinks through unchanged (only for trusted content).
    ///
    /// The original escape sequence is copied to the output without being
    /// inspected, so this must never be used on untrusted input.
    Passthrough,
}

// ---------------------------------------------------------------------------
// RedactPattern
// ---------------------------------------------------------------------------

/// A compiled set of regular expressions matching secrets and PII that
/// should not be displayed.
#[derive(Debug, Clone)]
pub struct RedactPattern {
    patterns: Vec<regex::Regex>,
}

impl RedactPattern {
    /// Build the stock pattern set: GitLab personal access tokens, generic
    /// `token` / `bearer` / `api key` credentials, and email addresses.
    ///
    /// # Panics
    ///
    /// Panics if one of the built-in expressions fails to compile, which
    /// would be a programming error in this module.
    #[must_use]
    pub fn defaults() -> Self {
        const SOURCES: [&str; 3] = [
            // GitLab personal access tokens
            r"glpat-[A-Za-z0-9_\-]{20,}",
            // Keyword, separator(s), then at least eight non-space characters
            r"(?i)(token|bearer|api[_-]?key)[\s:=]+\S{8,}",
            // Email addresses
            r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}",
        ];

        Self {
            patterns: SOURCES
                .iter()
                .copied()
                .map(|source| regex::Regex::new(source).expect("valid regex"))
                .collect(),
        }
    }

    /// Replace every match of every pattern with the literal `[REDACTED]`.
    ///
    /// Patterns run in declaration order, each one over the output of the
    /// previous pattern.
    #[must_use]
    pub fn redact(&self, input: &str) -> String {
        self.patterns.iter().fold(input.to_string(), |text, re| {
            re.replace_all(&text, "[REDACTED]").into_owned()
        })
    }
}

// ---------------------------------------------------------------------------
// sanitize_for_terminal
// ---------------------------------------------------------------------------

/// Sanitize untrusted text for safe terminal display.
///
/// - Strips C0 control codes and DEL, except tab, newline and carriage return
/// - Strips C1 control codes (0x80-0x9F)
/// - Strips OSC sequences (ESC ] ... ST)
/// - Strips cursor movement CSI sequences (CSI n A/B/C/D/E/F/G/H/J/K) and
///   every other CSI that is not a safe SGR
/// - Strips bidi overrides (U+202A-U+202E, U+2066-U+2069)
/// - Preserves safe SGR subset (bold, italic, underline, reset, standard colors)
///
/// `url_policy` controls handling of OSC 8 hyperlinks only; any other OSC
/// sequence is removed regardless of the policy:
///
/// - [`UrlPolicy::Strip`]: the link text is kept, the URL is dropped.
/// - [`UrlPolicy::Footnote`]: the link text is followed by ` [n]` and a
///   `[n] url` list is appended after a blank line. A hyperlink without a
///   usable URL keeps its text and gets no footnote.
/// - [`UrlPolicy::Passthrough`]: the hyperlink is copied verbatim.
///
/// Under `Strip` and `Footnote` the link text is itself sanitized, and
/// footnote URLs have control characters and bidi overrides removed, so
/// wrapping a payload in a hyperlink cannot smuggle it past the filter.
#[must_use]
pub fn sanitize_for_terminal(input: &str, url_policy: UrlPolicy) -> String {
    let mut output = String::with_capacity(input.len());
    let mut footnotes: Vec<String> = Vec::new();
    let chars: Vec<char> = input.chars().collect();
    let len = chars.len();
    let mut i = 0;

    while i < len {
        let ch = chars[i];

        // Bidi overrides and C1 control codes are always dropped.
        if is_bidi_override(ch) || ('\u{0080}'..='\u{009F}').contains(&ch) {
            i += 1;
            continue;
        }

        if ch != '\x1B' {
            // C0 controls (and DEL) are dropped except tab, newline and CR.
            if !ch.is_ascii_control() || matches!(ch, '\t' | '\n' | '\r') {
                output.push(ch);
            }
            i += 1;
            continue;
        }

        // `ch` is ESC: decide by the introducer that follows it.
        match chars.get(i + 1).copied() {
            // Trailing ESC at end of input.
            None => i += 1,
            // CSI sequence: only a safe SGR comes back as `Some`.
            Some('[') => {
                let (consumed, safe_seq) = parse_csi(&chars, i);
                if let Some(seq) = safe_seq {
                    output.push_str(&seq);
                }
                i += consumed;
            }
            // OSC sequence: `link_text` is `Some` only for OSC 8 hyperlinks.
            Some(']') => {
                let (consumed, link_text, link_url) = parse_osc(&chars, i);
                match (url_policy, link_text) {
                    // Not a hyperlink (title, clipboard, ...): always strip.
                    (_, None) => {}
                    (UrlPolicy::Passthrough, Some(_)) => {
                        // Reproduce the raw hyperlink (trusted content only).
                        output.extend(&chars[i..len.min(i + consumed)]);
                    }
                    (UrlPolicy::Strip, Some(text)) => {
                        // The link text is untrusted too. It never contains
                        // `ESC ]`, so this recursion is one level deep.
                        output.push_str(&sanitize_for_terminal(&text, UrlPolicy::Strip));
                    }
                    (UrlPolicy::Footnote, Some(text)) => {
                        output.push_str(&sanitize_for_terminal(&text, UrlPolicy::Strip));
                        // The URL is printed as plain text, so it must not
                        // carry control characters or bidi overrides.
                        let url: Option<String> = link_url
                            .map(|raw| {
                                raw.chars()
                                    .filter(|&c| !c.is_control() && !is_bidi_override(c))
                                    .collect::<String>()
                            })
                            .filter(|clean| !clean.is_empty());
                        if let Some(url) = url {
                            footnotes.push(url);
                            let _ = write!(output, " [{n}]", n = footnotes.len());
                        }
                    }
                }
                i += consumed;
            }
            // Unknown ESC sequence: skip ESC + next char.
            Some(_) => i += 2,
        }
    }

    // Append footnotes if any, separated from the text by a blank line.
    if !footnotes.is_empty() {
        output.push('\n');
        for (idx, url) in footnotes.iter().enumerate() {
            let _ = write!(output, "\n[{}] {url}", idx + 1);
        }
    }

    output
}

// ---------------------------------------------------------------------------
// Bidi check
// ---------------------------------------------------------------------------

/// Report whether `ch` is a bidirectional embedding, override or isolate
/// control character.
///
/// Two contiguous ranges are covered:
/// - U+202A..=U+202E: LRE, RLE, PDF, LRO, RLO
/// - U+2066..=U+2069: LRI, RLI, FSI, PDI
fn is_bidi_override(ch: char) -> bool {
    let embeddings_and_overrides = '\u{202A}'..='\u{202E}';
    let isolates = '\u{2066}'..='\u{2069}';
    embeddings_and_overrides.contains(&ch) || isolates.contains(&ch)
}

// ---------------------------------------------------------------------------
// CSI parser
// ---------------------------------------------------------------------------

/// Consume one CSI sequence whose ESC sits at `chars[start]` (followed by `[`).
///
/// Returns `(chars_consumed, preserved)`:
/// - `chars_consumed` counts from the ESC and is always at least 2.
/// - `preserved` is `Some(full sequence)` only for an SGR (final byte `m`)
///   whose parameters pass `is_safe_sgr`; every other sequence yields `None`
///   and is meant to be stripped.
///
/// Any run of bytes in 0x20-0x3F (parameter and intermediate bytes, in any
/// order) is accepted as the body. If no final byte in 0x40-0x7E follows,
/// the sequence is treated as malformed: only what was scanned is consumed.
fn parse_csi(chars: &[char], start: usize) -> (usize, Option<String>) {
    // Minimum: ESC [ <final_byte>
    debug_assert!(chars[start] == '\x1B');
    debug_assert!(start + 1 < chars.len() && chars[start + 1] == '[');

    let body_start = start + 2; // first char after ESC [
    let body_len = chars
        .iter()
        .skip(body_start)
        .take_while(|&&c| matches!(c as u32, 0x20..=0x3F))
        .count();
    let final_idx = body_start + body_len;

    // The char after the body must be a final byte (0x40-0x7E).
    let Some(&final_byte) = chars
        .get(final_idx)
        .filter(|&&c| matches!(c as u32, 0x40..=0x7E))
    else {
        // Malformed: drop ESC [ plus whatever body was scanned.
        return (final_idx.saturating_sub(start).max(2), None);
    };

    let consumed = final_idx + 1 - start;

    // Cursor movement (A-H), erase (J/K) and everything else is stripped.
    if final_byte != 'm' {
        return (consumed, None);
    }

    // SGR: keep it only when every parameter is on the allowlist.
    let params: String = chars[body_start..final_idx].iter().collect();
    let preserved = is_safe_sgr(&params).then(|| chars[start..=final_idx].iter().collect());
    (consumed, preserved)
}

/// Check if all SGR parameters in a sequence are in the safe subset.
///
/// `params` is the text between `ESC [` and the final `m`. Parameters are
/// separated by `;`; an empty parameter means 0 (reset), so `""` and `";"`
/// are both safe.
///
/// Each non-empty parameter must consist solely of ASCII digits. Anything
/// else (spaces or `+`, which are CSI intermediate bytes; `:` sub-parameters;
/// private markers such as `?` or `>`) makes the whole sequence unsafe. This
/// matters because a sequence with intermediate bytes is not an SGR at all,
/// even though it ends in `m`.
///
/// Safe: 0 (reset), 1 (bold), 3 (italic), 4 (underline), 22 (normal intensity),
/// 23 (not italic), 24 (not underline), 39 (default fg), 49 (default bg),
/// 30-37 (standard fg), 40-47 (standard bg), 90-97 (bright fg), 100-107 (bright bg).
fn is_safe_sgr(params: &str) -> bool {
    params.split(';').all(|param| {
        if param.is_empty() {
            return true; // treat empty as 0 (ESC[m is reset)
        }
        // `str::parse::<u32>` alone would accept a leading '+', so the
        // digit check is what keeps intermediate bytes out.
        param.bytes().all(|b| b.is_ascii_digit())
            && matches!(param.parse::<u32>(), Ok(code) if is_safe_sgr_code(code))
    })
}

/// Report whether a single numeric SGR code is on the allowlist.
fn is_safe_sgr_code(n: u32) -> bool {
    matches!(
        n,
        0  // reset
        | 1  // bold
        | 3  // italic
        | 4  // underline
        | 22 // normal intensity (turn off bold)
        | 23 // not italic
        | 24 // not underline
        | 39 // default foreground
        | 49 // default background
        | 30..=37   // standard foreground colors
        | 40..=47   // standard background colors
        | 90..=97   // bright foreground colors
        | 100..=107 // bright background colors
    )
}

// ---------------------------------------------------------------------------
// OSC parser
// ---------------------------------------------------------------------------

/// Consume one OSC sequence whose ESC sits at `chars[start]` (followed by `]`).
///
/// Returns `(chars_consumed, link_text, link_url)`, counted from the ESC.
///
/// - OSC 8 hyperlink (`ESC ] 8 ; params ; url ST text ESC ] 8 ; ; ST`):
///   `link_text` is `Some` with the raw characters between the opening
///   sequence and the next `ESC ]`, and `link_url` is the URL or `None` when
///   it is empty or missing. The next OSC is consumed as the closing
///   sequence without checking that it is an OSC 8. If no further OSC
///   exists, the text runs to the end of input.
/// - Any other OSC: consumed up to and including its terminator, with both
///   options `None`.
///
/// The returned text and URL are not sanitized.
fn parse_osc(chars: &[char], start: usize) -> (usize, Option<String>, Option<String>) {
    debug_assert!(chars[start] == '\x1B');
    debug_assert!(start + 1 < chars.len() && chars[start + 1] == ']');

    let body_start = start + 2; // first char after ESC ]

    // Locate the terminator (ESC \ or BEL) of this first sequence.
    let (body_end, body_consumed) = find_st(chars, body_start);

    // Hyperlinks are introduced by "8;".
    let is_hyperlink = matches!(chars.get(body_start..body_start + 2), Some(['8', ';']));
    if !is_hyperlink {
        // Non-OSC-8: just consume ESC ] + body + terminator.
        return (body_consumed + 2, None, None);
    }

    // Body looks like "8;params;url".
    let url = {
        let body: String = chars[body_start..body_end].iter().collect();
        extract_osc8_url(&body).map(String::from)
    };

    // Link text starts right after the first terminator and ends at the
    // next OSC introducer, if there is one.
    let text_start = body_start + body_consumed;
    let tail = &chars[text_start..];
    let next_osc = tail
        .windows(2)
        .position(|pair| matches!(pair, ['\x1B', ']']));

    match next_osc {
        Some(offset) => {
            let close_at = text_start + offset;
            let (_, close_consumed) = find_st(chars, close_at + 2);
            let text: String = tail[..offset].iter().collect();
            (close_at + close_consumed - start + 2, Some(text), url)
        }
        None => {
            // Reached end of input without a closing sequence.
            let text: String = tail.iter().collect();
            (chars.len() - start, Some(text), url)
        }
    }
}

/// Locate the String Terminator (ST) of an OSC body that begins at `from`.
///
/// ST is either BEL (0x07, one char) or `ESC \` (two chars).
///
/// Returns `(content_end_index, total_consumed_from_content_start)`: the
/// index of the terminator's first char, and the number of chars from `from`
/// through the end of the terminator. When no terminator exists, the rest of
/// the input is treated as the body: `(chars.len(), chars.len() - from)`.
fn find_st(chars: &[char], from: usize) -> (usize, usize) {
    let len = chars.len();

    (from..len)
        .find_map(|idx| {
            let terminator_width = match chars[idx] {
                '\x07' => 1,
                '\x1B' if chars.get(idx + 1) == Some(&'\\') => 2,
                _ => return None,
            };
            Some((idx, idx - from + terminator_width))
        })
        // Unterminated: consume everything
        .unwrap_or_else(|| (len, len - from))
}

/// Pull the URL out of an OSC 8 body of the form `8;params;url`.
///
/// Returns `None` when the body does not start with `8;`, has no `;` after
/// the params, or the URL part is empty. Everything after the params
/// separator belongs to the URL, including any further `;`.
fn extract_osc8_url(content: &str) -> Option<&str> {
    let after_tag = content.strip_prefix("8;")?;
    // Split at the first ';': left side is the params, right side the URL.
    let (_params, url) = after_tag.split_once(';')?;
    Some(url).filter(|candidate| !candidate.is_empty())
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Sanitize `input` with the `Strip` URL policy.
    fn strip(input: &str) -> String {
        sanitize_for_terminal(input, UrlPolicy::Strip)
    }

    /// Assert that sanitizing `input` leaves it untouched.
    fn assert_unchanged(input: &str) {
        assert_eq!(strip(input), input);
    }

    /// A complete OSC 8 hyperlink (BEL-terminated) embedded in plain text.
    const LINKED: &str = "click \x1B]8;;https://example.com\x07here\x1B]8;;\x07 done";

    // --- CSI / cursor movement ---

    #[test]
    fn test_strips_cursor_movement() {
        // CSI 5A = cursor up 5
        assert_eq!(strip("before\x1B[5Aafter"), "beforeafter");
    }

    #[test]
    fn test_strips_cursor_movement_all_directions() {
        for dir in "ABCDEFGH".chars() {
            let sanitized = strip(&format!("x\x1B[3{dir}y"));
            assert_eq!(sanitized, "xy", "failed for direction {dir}");
        }
    }

    #[test]
    fn test_strips_erase_sequences() {
        // CSI 2J = erase display
        assert_eq!(strip("before\x1B[2Jafter"), "beforeafter");
    }

    // --- SGR preservation ---

    #[test]
    fn test_preserves_bold_italic_underline_reset() {
        assert_unchanged("\x1B[1mbold\x1B[0m \x1B[3mitalic\x1B[0m \x1B[4munderline\x1B[0m");
    }

    #[test]
    fn test_preserves_standard_colors() {
        // Red foreground, green background
        assert_unchanged("\x1B[31mred\x1B[42m on green\x1B[0m");
    }

    #[test]
    fn test_preserves_bright_colors() {
        assert_unchanged("\x1B[91mbright red\x1B[0m");
    }

    #[test]
    fn test_preserves_combined_safe_sgr() {
        // Bold + red foreground in one sequence
        assert_unchanged("\x1B[1;31mbold red\x1B[0m");
    }

    #[test]
    fn test_strips_unsafe_sgr() {
        // SGR 8 = hidden text (not in safe list): stripped, while SGR 0 stays
        assert_eq!(strip("\x1B[8mhidden\x1B[0m"), "hidden\x1B[0m");
    }

    // --- C1 control codes ---

    #[test]
    fn test_strips_c1_control_codes() {
        // U+008D = Reverse Index, U+009B = CSI (8-bit)
        assert_eq!(
            strip("before\u{008D}middle\u{009B}after"),
            "beforemiddleafter"
        );
    }

    // --- Bidi overrides ---

    #[test]
    fn test_strips_bidi_overrides() {
        // U+202E = RLO, U+202C = PDF
        assert_eq!(
            strip("normal\u{202E}reversed\u{202C}end"),
            "normalreversedend"
        );
    }

    #[test]
    fn test_strips_all_bidi_chars() {
        let bidi_chars = ('\u{202A}'..='\u{202E}').chain('\u{2066}'..='\u{2069}');
        for ch in bidi_chars {
            let sanitized = strip(&format!("a{ch}b"));
            assert_eq!(sanitized, "ab", "failed for U+{:04X}", ch as u32);
        }
    }

    // --- OSC sequences ---

    #[test]
    fn test_strips_osc_sequences() {
        // OSC 0 (set title): ESC ] 0 ; title BEL
        assert_eq!(strip("before\x1B]0;My Title\x07after"), "beforeafter");
    }

    // --- OSC 8 hyperlinks ---

    #[test]
    fn test_url_policy_strip() {
        // OSC 8 hyperlink: ESC]8;;url ST text ESC]8;; ST
        assert_eq!(strip(LINKED), "click here done");
    }

    #[test]
    fn test_url_policy_footnote() {
        let rendered = sanitize_for_terminal(LINKED, UrlPolicy::Footnote);
        for expected in ["here [1]", "[1] https://example.com"] {
            assert!(rendered.contains(expected));
        }
    }

    // --- Redaction ---

    #[test]
    fn test_redact_gitlab_token() {
        let redacted =
            RedactPattern::defaults().redact("My token is glpat-AbCdEfGhIjKlMnOpQrStUvWx");
        assert_eq!(redacted, "My token is [REDACTED]");
    }

    #[test]
    fn test_redact_email() {
        let redacted = RedactPattern::defaults().redact("Contact user@example.com for details");
        assert_eq!(redacted, "Contact [REDACTED] for details");
    }

    #[test]
    fn test_redact_bearer_token() {
        let redacted =
            RedactPattern::defaults().redact("Authorization: Bearer eyJhbGciOiJSUzI1NiIsInR5cCI");
        assert!(redacted.contains("[REDACTED]"));
        assert!(!redacted.contains("eyJ"));
    }

    // --- Edge cases ---

    #[test]
    fn test_empty_input() {
        assert_eq!(strip(""), "");
    }

    #[test]
    fn test_safe_content_passthrough() {
        assert_unchanged("Hello, world! This is normal text.\nWith newlines\tand tabs.");
    }

    #[test]
    fn test_trailing_esc() {
        assert_eq!(strip("text\x1B"), "text");
    }

    #[test]
    fn test_malformed_csi_does_not_eat_text() {
        // Despite the test name, `ESC [ b` is a well-formed CSI: 'b' (0x62)
        // lies in the final-byte range 0x40-0x7E. The whole sequence is
        // therefore stripped and only "a" remains.
        assert_eq!(strip("a\x1B[b"), "a");
    }

    #[test]
    fn test_utf8_adjacent_to_escapes() {
        assert_unchanged("\x1B[1m日本語\x1B[0m text");
    }

    #[test]
    fn test_fuzz_no_panic() {
        // 1000 deterministic pseudo-random byte sequences: must not panic
        for seed in 0u16..1000 {
            let bytes: Vec<u8> = (0u16..50)
                .map(|j| (seed.wrapping_mul(31).wrapping_add(j) & 0xFF) as u8)
                .collect();
            // Best-effort UTF-8
            let text = String::from_utf8_lossy(&bytes);
            let _ = strip(&text);
        }
    }
}