Add shared type definitions and sensitive content redactor

Shared module consumed by both the Express server and the React client: types.ts: - ParsedMessage: the normalized message unit (uuid, category, content, toolName, toolInput, timestamp, rawIndex) that the parser emits and every downstream consumer (viewer, filter, export) operates on - MessageCategory: 9-value union covering user_message, assistant_text, thinking, tool_call, tool_result, system_message, hook_progress, file_snapshot, and summary - SessionEntry / SessionListResponse / SessionDetailResponse / ExportRequest: API contract types for the sessions list, session detail, and HTML export endpoints - ALL_CATEGORIES, CATEGORY_LABELS, DEFAULT_HIDDEN_CATEGORIES: constants for the filter panel UI and presets (thinking + hook_progress hidden by default) sensitive-redactor.ts: - 34 regex patterns derived from gitleaks production config, organized into Tier 1 (known secret formats: AWS, GitHub, GitLab, OpenAI, Anthropic, HuggingFace, Perplexity, Stripe, Slack, SendGrid, Twilio, GCP, Azure AD, Heroku, npm, PyPI, Sentry, JWT, PEM private keys, generic API key assignments) and Tier 2 (PII/system info: home directory paths, connection strings, URLs with credentials, email addresses, IPv4 addresses, Bearer tokens, env var secret assignments) - Keyword pre-filtering: each pattern declares keywords that must appear in the text before the expensive regex is evaluated, following the gitleaks performance optimization approach - False-positive allowlists: example/test email domains, localhost/documentation IPs (RFC 5737), noreply@anthropic.com - Pure functions: redactSensitiveContent returns {sanitized, redactionCount, categories}, redactString returns just the string, redactMessage returns a new ParsedMessage with content and toolInput redacted Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 22:55:48 -05:00
parent 7e15c36e2f
commit c4e15bf082
2 changed files with 514 additions and 0 deletions
--- a/src/shared/sensitive-redactor.ts
+++ b/src/shared/sensitive-redactor.ts
@@ -0,0 +1,437 @@
+import type { ParsedMessage } from "./types.js";
+
+/**
+ * Sensitive information detection and redaction module.
+ *
+ * Uses a curated set of 34 regex patterns derived from gitleaks' production
+ * config, focused on patterns relevant to Claude Code session logs.
+ * Employs keyword pre-filtering (gitleaks technique) to skip expensive regex
+ * evaluation for messages that contain no potential secrets.
+ *
+ * Shared between client (display redaction) and server (export redaction).
+ */
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+/**
+ * One detection rule: a regex plus the metadata the redactor needs to apply it.
+ */
+export interface SensitivePattern {
+  /** Identifier reported in `RedactionResult.categories` when the rule fires. */
+  id: string;
+  label: string; // Placeholder text substituted for each match, e.g. "[AWS_KEY]"
+  /** Expression run against the text; every non-allowlisted match is replaced. */
+  regex: RegExp;
+  keywords: string[]; // Pre-filter: regex only runs if at least one keyword occurs in the text
+  /** Optional allowlist hook: return true to leave `match` unredacted. */
+  falsePositiveCheck?: (match: string) => boolean;
+}
+
+/**
+ * Outcome of a redaction pass, as returned by redactSensitiveContent.
+ */
+export interface RedactionResult {
+  /** Input text with every redacted match replaced by its pattern label. */
+  sanitized: string;
+  /** Number of replacements performed (allowlisted matches are not counted). */
+  redactionCount: number;
+  /** Distinct `SensitivePattern.id` values that produced a replacement. */
+  categories: string[];
+}
+
+// ---------------------------------------------------------------------------
+// False-positive helpers
+// ---------------------------------------------------------------------------
+
+// Email domains and addresses that isAllowlistedEmail exempts from [EMAIL]
+// redaction. All entries are lower-case; the match is lower-cased before
+// comparison.
+const ALLOWLISTED_EMAILS = [
+  "@example.com",
+  "@example.org",
+  "@test.com",
+  "@test.org",
+  "@localhost",
+  "noreply@anthropic.com",
+];
+
+// Exact IPv4 addresses (loopback and the unspecified address) exempt from
+// [IP_ADDR] redaction.
+const ALLOWLISTED_IPS = ["127.0.0.1", "0.0.0.0"];
+
+// Documentation-only IPv4 ranges (RFC 5737), matched by string prefix.
+const DOCUMENTATION_IP_PREFIXES = ["192.0.2.", "198.51.100.", "203.0.113."];
+
+/**
+ * Reports whether an email match is allowlisted and should stay unredacted.
+ *
+ * Entries of ALLOWLISTED_EMAILS that begin with "@" are domain suffixes. Any
+ * other entry is a complete address and must match exactly, so an address
+ * such as "x-noreply@anthropic.com" is not covered by the
+ * "noreply@anthropic.com" entry.
+ *
+ * @param match - Candidate address as found in the text; compared
+ *   case-insensitively.
+ * @returns true when the address is on the allowlist.
+ */
+function isAllowlistedEmail(match: string): boolean {
+  const lower = match.toLowerCase();
+  return ALLOWLISTED_EMAILS.some((entry) =>
+    entry.startsWith("@") ? lower.endsWith(entry) : lower === entry
+  );
+}
+
+/**
+ * Reports whether an IPv4 match is allowlisted and should stay unredacted.
+ *
+ * @param match - Dotted-quad address as found in the text.
+ * @returns true when the address starts with one of
+ *   DOCUMENTATION_IP_PREFIXES or equals an entry of ALLOWLISTED_IPS.
+ */
+function isAllowlistedIp(match: string): boolean {
+  for (const docPrefix of DOCUMENTATION_IP_PREFIXES) {
+    if (match.startsWith(docPrefix)) {
+      return true;
+    }
+  }
+  return ALLOWLISTED_IPS.includes(match);
+}
+
+// ---------------------------------------------------------------------------
+// Pattern Definitions
+// ---------------------------------------------------------------------------
+
+/**
+ * Ordered rule table applied by redactSensitiveContent. Rules run top to
+ * bottom against the progressively redacted text, so an earlier rule wins
+ * over a later one when their matches overlap.
+ */
+export const SENSITIVE_PATTERNS: SensitivePattern[] = [
+  // ---- Tier 1: Known Secret Formats ----
+
+  // #1 AWS Access Key
+  {
+    id: "aws_access_key",
+    label: "[AWS_KEY]",
+    regex: /\b(?:A3T[A-Z0-9]|AKIA|ASIA|ABIA|ACCA)[A-Z2-7]{16}\b/g,
+    keywords: ["AKIA", "ASIA", "ABIA", "ACCA", "A3T"],
+  },
+
+  // #2 AWS Bedrock
+  {
+    id: "aws_bedrock",
+    label: "[AWS_BEDROCK_KEY]",
+    regex: /\bABSK[A-Za-z0-9+/]{109,269}={0,2}\b/g,
+    keywords: ["ABSK"],
+  },
+
+  // #3 GitHub PAT
+  {
+    id: "github_pat",
+    label: "[GITHUB_TOKEN]",
+    regex: /\bghp_[0-9a-zA-Z]{36}\b/g,
+    keywords: ["ghp_"],
+  },
+
+  // #4 GitHub Fine-Grained PAT
+  {
+    id: "github_fine_grained_pat",
+    label: "[GITHUB_TOKEN]",
+    regex: /\bgithub_pat_\w{82}\b/g,
+    keywords: ["github_pat_"],
+  },
+
+  // #5 GitHub App Token
+  {
+    id: "github_app_token",
+    label: "[GITHUB_TOKEN]",
+    regex: /\b(?:ghu|ghs)_[0-9a-zA-Z]{36}\b/g,
+    keywords: ["ghu_", "ghs_"],
+  },
+
+  // #6 GitLab PAT
+  {
+    id: "gitlab_pat",
+    label: "[GITLAB_TOKEN]",
+    regex: /\bglpat-[\w-]{20}\b/g,
+    keywords: ["glpat-"],
+  },
+
+  // #7 GitLab Runner Token
+  {
+    id: "gitlab_runner",
+    label: "[GITLAB_TOKEN]",
+    regex: /\bglrt-[0-9a-zA-Z_-]{20}\b/g,
+    keywords: ["glrt-"],
+  },
+
+  // #8 OpenAI Key
+  {
+    id: "openai_key",
+    label: "[OPENAI_KEY]",
+    regex:
+      /\b(?:sk-(?:proj|svcacct|admin)-[A-Za-z0-9_-]{58,74}T3BlbkFJ[A-Za-z0-9_-]{58,74}|sk-[a-zA-Z0-9]{20}T3BlbkFJ[a-zA-Z0-9]{20})\b/g,
+    keywords: ["sk-proj-", "sk-svcacct-", "sk-admin-", "T3BlbkFJ"],
+  },
+
+  // #9 Anthropic Key
+  {
+    id: "anthropic_key",
+    label: "[ANTHROPIC_KEY]",
+    regex: /\bsk-ant-api03-[a-zA-Z0-9_-]{93}AA\b/g,
+    keywords: ["sk-ant-api"],
+  },
+
+  // #10 Anthropic Admin Key
+  {
+    id: "anthropic_admin_key",
+    label: "[ANTHROPIC_KEY]",
+    regex: /\bsk-ant-admin01-[a-zA-Z0-9_-]{93}AA\b/g,
+    keywords: ["sk-ant-admin"],
+  },
+
+  // #11 HuggingFace Token
+  {
+    id: "huggingface_token",
+    label: "[HF_TOKEN]",
+    regex: /\bhf_[a-zA-Z]{34}\b/g,
+    keywords: ["hf_"],
+  },
+
+  // #12 Perplexity Key
+  {
+    id: "perplexity_key",
+    label: "[PERPLEXITY_KEY]",
+    regex: /\bpplx-[a-zA-Z0-9]{48}\b/g,
+    keywords: ["pplx-"],
+  },
+
+  // #13 Stripe Key
+  {
+    id: "stripe_key",
+    label: "[STRIPE_KEY]",
+    regex: /\b(?:sk|rk)_(?:test|live|prod)_[a-zA-Z0-9]{10,99}\b/g,
+    keywords: ["sk_live_", "sk_test_", "sk_prod_", "rk_live_", "rk_test_", "rk_prod_"],
+  },
+
+  // #14 Slack Bot Token
+  {
+    id: "slack_bot_token",
+    label: "[SLACK_TOKEN]",
+    regex: /\bxoxb-[0-9]{10,13}-[0-9]{10,13}[a-zA-Z0-9-]*\b/g,
+    keywords: ["xoxb-"],
+  },
+
+  // #15 Slack User Token
+  {
+    id: "slack_user_token",
+    label: "[SLACK_TOKEN]",
+    regex: /\bxox[pe](?:-[0-9]{10,13}){3}-[a-zA-Z0-9-]{28,34}\b/g,
+    keywords: ["xoxp-", "xoxe-"],
+  },
+
+  // #16 Slack Webhook
+  {
+    id: "slack_webhook",
+    label: "[SLACK_WEBHOOK]",
+    regex:
+      /(?:https?:\/\/)?hooks\.slack\.com\/(?:services|workflows|triggers)\/[A-Za-z0-9+/]{43,56}/g,
+    keywords: ["hooks.slack.com"],
+  },
+
+  // #17 SendGrid Token
+  {
+    id: "sendgrid_token",
+    label: "[SENDGRID_TOKEN]",
+    regex: /\bSG\.[a-z0-9=_\-.]{66}\b/gi,
+    keywords: ["SG."],
+  },
+
+  // #18 Twilio Key
+  {
+    id: "twilio_key",
+    label: "[TWILIO_KEY]",
+    regex: /\bSK[0-9a-fA-F]{32}\b/g,
+    keywords: ["SK"],
+  },
+
+  // #19 GCP API Key
+  {
+    id: "gcp_api_key",
+    label: "[GCP_KEY]",
+    regex: /\bAIza[\w-]{35}\b/g,
+    keywords: ["AIza"],
+  },
+
+  // #20 Azure AD Client Secret
+  {
+    id: "azure_ad_secret",
+    label: "[AZURE_SECRET]",
+    regex: /[a-zA-Z0-9_~.]{3}\dQ~[a-zA-Z0-9_~.-]{31,34}/g,
+    keywords: ["Q~"],
+  },
+
+  // #21 Heroku Key
+  {
+    id: "heroku_key",
+    label: "[HEROKU_KEY]",
+    regex: /\bHRKU-AA[0-9a-zA-Z_-]{58}\b/g,
+    keywords: ["HRKU-"],
+  },
+
+  // #22 npm Token
+  {
+    id: "npm_token",
+    label: "[NPM_TOKEN]",
+    regex: /\bnpm_[a-z0-9]{36}\b/gi,
+    keywords: ["npm_"],
+  },
+
+  // #23 PyPI Token
+  {
+    id: "pypi_token",
+    label: "[PYPI_TOKEN]",
+    regex: /\bpypi-AgEIcHlwaS5vcmc[\w-]{50,1000}\b/g,
+    keywords: ["pypi-"],
+  },
+
+  // #24 Sentry Token
+  {
+    id: "sentry_token",
+    label: "[SENTRY_TOKEN]",
+    regex: /\b(?:sntrys_eyJpYXQiO[a-zA-Z0-9+/]{10,200}|sntryu_[a-f0-9]{64})\b/g,
+    keywords: ["sntrys_", "sntryu_"],
+  },
+
+  // #25 JWT
+  {
+    id: "jwt",
+    label: "[JWT]",
+    regex:
+      /\bey[a-zA-Z0-9]{17,}\.ey[a-zA-Z0-9/\\_-]{17,}\.(?:[a-zA-Z0-9/\\_-]{10,}={0,2})\b/g,
+    keywords: ["eyJ"],
+  },
+
+  // #26 Private Key (PEM)
+  {
+    id: "private_key",
+    label: "[PRIVATE_KEY]",
+    regex:
+      /-----BEGIN[ A-Z0-9_-]{0,100}PRIVATE KEY(?:\s+BLOCK)?-----[\s\S]{64,}?-----END[ A-Z0-9_-]{0,100}PRIVATE KEY(?:\s+BLOCK)?-----/g,
+    keywords: ["PRIVATE KEY"],
+  },
+
+  // #27 Generic API Key (contextual: secret-like variable name + value)
+  {
+    id: "generic_api_key",
+    label: "[API_KEY]",
+    regex:
+      /(?:access|auth|api|credential|creds|key|passw(?:or)?d|secret|token)(?:[\t \w.-]{0,20})[\s'"]{0,3}(?:=|>|:{1,3}=|\|\||:|=>|\?=|,)[\x60'"\s=]{0,5}([\w.=\-/+]{10,150})/gi,
+    keywords: [
+      "secret",
+      "token",
+      "password",
+      "passwd",
+      "api_key",
+      "apikey",
+      "access_key",
+      "auth",
+      "credential",
+    ],
+  },
+
+  // ---- Tier 2: PII / System Info ----
+
+  // #28 Home directory paths
+  {
+    id: "home_directory",
+    label: "[HOME_PATH]",
+    regex:
+      /(?:\/home\/[a-zA-Z0-9_.-]+|\/Users\/[a-zA-Z0-9_.-]+|C:\\Users\\[a-zA-Z0-9_.-]+)(?:[/\\][^\s"'`<>)}\]]*)?/g,
+    keywords: ["/home/", "/Users/", "C:\\Users\\"],
+  },
+
+  // #29 Connection strings
+  {
+    id: "connection_string",
+    label: "[CONNECTION_STRING]",
+    regex:
+      /\b(?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp|mssql):\/\/[^\s"'`]+/gi,
+    keywords: ["postgres", "mysql", "mongodb", "redis", "amqp", "mssql"],
+  },
+
+  // #30 URLs with credentials
+  {
+    id: "url_with_creds",
+    label: "[URL_WITH_CREDS]",
+    regex: /https?:\/\/[^\s:@]+:[^\s:@]+@[^\s"'`]+/g,
+    keywords: ["://"],
+  },
+
+  // #31 Email addresses
+  // The TLD class is [A-Za-z]: a "|" inside a character class is a literal
+  // pipe, not alternation, so it must not appear there.
+  {
+    id: "email",
+    label: "[EMAIL]",
+    regex: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g,
+    keywords: ["@"],
+    falsePositiveCheck: isAllowlistedEmail,
+  },
+
+  // #32 IPv4 addresses
+  {
+    id: "ipv4",
+    label: "[IP_ADDR]",
+    regex:
+      /\b(?:(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\b/g,
+    keywords: ["."],
+    falsePositiveCheck: isAllowlistedIp,
+  },
+
+  // #33 Bearer tokens
+  {
+    id: "bearer_token",
+    label: "[BEARER_TOKEN]",
+    regex: /Bearer\s+[A-Za-z0-9\-._~+/]+=*/g,
+    keywords: ["Bearer"],
+  },
+
+  // #34 Env var secret assignments
+  // NOTE(review): the trailing \b forces the match to end on a word
+  // character, so a value ending in punctuation (e.g. PASSWORD="hunter2!")
+  // keeps its last non-word characters unredacted. Confirm whether intended.
+  {
+    id: "env_var_secret",
+    label: "[ENV_SECRET]",
+    regex:
+      /\b[A-Z_]*(?:SECRET|TOKEN|PASSWORD|PASSWD|PRIVATE)[A-Z_]*\s*[=:]\s*["']?[^\s"']{4,}["']?\b/g,
+    keywords: ["SECRET", "TOKEN", "PASSWORD", "PASSWD", "PRIVATE"],
+  },
+];
+
+// ---------------------------------------------------------------------------
+// Core redaction functions
+// ---------------------------------------------------------------------------
+
+/**
+ * Cheap pre-filter: reports whether at least one keyword occurs in `content`.
+ *
+ * @param content - Text to scan.
+ * @param keywords - Candidate substrings; an empty list never matches.
+ * @param ignoreCase - When true, compare case-insensitively. Defaults to
+ *   false, i.e. an exact, case-sensitive substring search.
+ * @returns true if any keyword is a substring of `content`.
+ */
+function hasKeyword(
+  content: string,
+  keywords: string[],
+  ignoreCase = false
+): boolean {
+  if (!ignoreCase) {
+    return keywords.some((kw) => content.includes(kw));
+  }
+  const haystack = content.toLowerCase();
+  return keywords.some((kw) => haystack.includes(kw.toLowerCase()));
+}
+
+/**
+ * Replaces sensitive content in the input string with placeholder labels.
+ * Pure function — no side effects.
+ *
+ * Patterns from SENSITIVE_PATTERNS are applied in order, each against the
+ * output of the previous one. Matches accepted by a pattern's
+ * `falsePositiveCheck` are left in place and not counted.
+ *
+ * @param input - Text to sanitize; an empty string yields an empty result.
+ * @returns The sanitized text, the number of replacements made, and the ids
+ *   of the patterns that produced at least one replacement.
+ */
+export function redactSensitiveContent(input: string): RedactionResult {
+  if (!input) {
+    return { sanitized: "", redactionCount: 0, categories: [] };
+  }
+
+  let result = input;
+  let count = 0;
+  const matchedCategories = new Set<string>();
+
+  for (const pattern of SENSITIVE_PATTERNS) {
+    // Keyword pre-filter: skip the expensive regex if no keyword is present.
+    // The filter must be at least as permissive as the regex, so patterns
+    // flagged /i get a case-insensitive keyword check; otherwise text such as
+    // "API_KEY=..." or "POSTGRES://..." would never reach its regex.
+    if (!hasKeyword(result, pattern.keywords, pattern.regex.ignoreCase)) {
+      continue;
+    }
+
+    // Work on a copy so the shared, exported RegExp objects (and their
+    // lastIndex state under the /g flag) are never touched.
+    const regex = new RegExp(pattern.regex.source, pattern.regex.flags);
+
+    result = result.replace(regex, (match: string) => {
+      // Allowlisted matches are returned unchanged and not counted.
+      if (pattern.falsePositiveCheck && pattern.falsePositiveCheck(match)) {
+        return match;
+      }
+      count++;
+      matchedCategories.add(pattern.id);
+      return pattern.label;
+    });
+  }
+
+  return {
+    sanitized: result,
+    redactionCount: count,
+    categories: [...matchedCategories],
+  };
+}
+
+/**
+ * Shorthand for callers that only need the redacted text and not the
+ * replacement count or matched categories.
+ *
+ * @param input - Text to sanitize.
+ * @returns The `sanitized` field of redactSensitiveContent(input).
+ */
+export function redactString(input: string): string {
+  const { sanitized } = redactSensitiveContent(input);
+  return sanitized;
+}
+
+/**
+ * Builds a redacted copy of a message; the argument itself is left untouched.
+ *
+ * `content` is always redacted. `toolInput` is redacted only when it is a
+ * non-empty string; otherwise its original value is carried over as-is.
+ * All remaining fields, including `toolName`, are copied unchanged.
+ *
+ * @param msg - Message to sanitize.
+ * @returns A new ParsedMessage with redacted `content` and `toolInput`.
+ */
+export function redactMessage(msg: ParsedMessage): ParsedMessage {
+  const { toolInput } = msg;
+  const redacted: ParsedMessage = { ...msg };
+  redacted.content = redactString(msg.content);
+  // toolName is typically safe (e.g. "Bash", "Read") — pass through unchanged
+  redacted.toolInput = toolInput ? redactString(toolInput) : toolInput;
+  return redacted;
+}
--- a/src/shared/types.ts
+++ b/src/shared/types.ts
@@ -0,0 +1,77 @@
+/**
+ * Category tag carried by every ParsedMessage; a closed union of nine
+ * string literals.
+ */
+export type MessageCategory =
+  | "user_message"
+  | "assistant_text"
+  | "thinking"
+  | "tool_call"
+  | "tool_result"
+  | "system_message"
+  | "hook_progress"
+  | "file_snapshot"
+  | "summary";
+
+/**
+ * Normalized message unit. Per the commit description, this is what the
+ * parser emits and what the viewer, filter, and export code operate on.
+ */
+export interface ParsedMessage {
+  uuid: string;
+  category: MessageCategory;
+  /** Message text. */
+  content: string;
+  // presumably only set for tool-related categories — TODO confirm against the parser
+  toolName?: string;
+  // presumably a serialized form of the tool arguments — TODO confirm
+  toolInput?: string;
+  // NOTE(review): string format is not specified here — confirm (ISO 8601?)
+  timestamp?: string;
+  // presumably the position of the entry in the raw session log — TODO confirm
+  rawIndex: number;
+}
+
+/**
+ * One row of the sessions list (API contract type).
+ */
+export interface SessionEntry {
+  id: string;
+  summary: string;
+  firstPrompt: string;
+  project: string;
+  // NOTE(review): `created` / `modified` are strings; format not specified here — confirm
+  created: string;
+  modified: string;
+  messageCount: number;
+  // presumably the location of the session file on the server — TODO confirm
+  path: string;
+}
+
+/**
+ * Response body for the sessions list endpoint: a wrapper around the entries.
+ */
+export interface SessionListResponse {
+  sessions: SessionEntry[];
+}
+
+/**
+ * Response body for the session detail endpoint: one session's identity
+ * plus its parsed messages.
+ */
+export interface SessionDetailResponse {
+  id: string;
+  project: string;
+  messages: ParsedMessage[];
+}
+
+/**
+ * Request body for the HTML export endpoint.
+ */
+export interface ExportRequest {
+  /** Full session payload to export. */
+  session: SessionDetailResponse;
+  // presumably the uuids of messages to include in the export — TODO confirm
+  visibleMessageUuids: string[];
+  // presumably the uuids of messages the user chose to redact — TODO confirm
+  redactedMessageUuids: string[];
+  // presumably enables pattern-based redaction on export; semantics when omitted are not visible here
+  autoRedactEnabled?: boolean;
+}
+
+/**
+ * Every MessageCategory value, listed in the same order as the union
+ * declaration. Keep in sync with MessageCategory when adding a category.
+ */
+export const ALL_CATEGORIES: MessageCategory[] = [
+  "user_message",
+  "assistant_text",
+  "thinking",
+  "tool_call",
+  "tool_result",
+  "system_message",
+  "hook_progress",
+  "file_snapshot",
+  "summary",
+];
+
+/**
+ * Human-readable label for each category. Typed as a Record over
+ * MessageCategory, so the compiler rejects a missing or misspelled key.
+ */
+export const CATEGORY_LABELS: Record<MessageCategory, string> = {
+  user_message: "User Messages",
+  assistant_text: "Assistant Text",
+  thinking: "Thinking Blocks",
+  tool_call: "Tool Calls",
+  tool_result: "Tool Results",
+  system_message: "System Messages",
+  hook_progress: "Hook/Progress",
+  file_snapshot: "File Snapshots",
+  summary: "Summaries",
+};
+
+/**
+ * Categories hidden by default (thinking blocks and hook/progress entries),
+ * per the commit description's filter presets.
+ */
+export const DEFAULT_HIDDEN_CATEGORIES: MessageCategory[] = [
+  "thinking",
+  "hook_progress",
+];