From c65335bd21110b93046ca1e8ea81e6e7ac88085a Mon Sep 17 00:00:00 2001
From: teernisse <teernisse@visiostack.com>
Date: Fri, 20 Feb 2026 16:06:53 -0500
Subject: [PATCH] test: add comprehensive parser unit tests and fuzz test

Cover the core JSONL byte-level parser with targeted tests and a fuzz
harness to prevent regressions and catch panics on malformed input:

Unit tests:
- TestParseFile_UserMessages: verifies user message counting and
  ProjectPath extraction from cwd field
- TestParseFile_AssistantDedup: confirms message-ID-based deduplication
  where the last entry wins (handles edits/retries)
- TestParseFile_TimeRange: validates StartTime/EndTime tracking across
  out-of-order timestamps
- TestParseFile_SystemDuration: tests turn_duration aggregation from
  system entries (durationMs -> DurationSecs conversion)
- TestParseFile_EmptyFile: ensures zero stats without errors on empty input
- TestParseFile_MalformedLines: confirms graceful skip of unparseable
  lines without aborting the entire file
- TestParseFile_CacheTokens: validates extraction of cache_read,
  cache_creation_5m, and cache_creation_1h token fields
- TestExtractTopLevelType: table-driven tests for the byte-level type
  extractor covering user, assistant, system, nested-type-ignored,
  unknown, no-type, and empty cases

Fuzz test:
- FuzzExtractTopLevelType: seeds with realistic patterns plus edge cases
  (unterminated strings, non-string type values, empty input). Asserts
  the parser never panics and only returns known type strings or empty.

Uses a writeSession helper that creates temp JSONL files for each test,
keeping tests isolated and cleanup automatic via t.TempDir().
---
 internal/source/parser_test.go | 210 +++++++++++++++++++++++++++++++++
 1 file changed, 210 insertions(+)
 create mode 100644 internal/source/parser_test.go

diff --git a/internal/source/parser_test.go b/internal/source/parser_test.go
new file mode 100644
index 0000000..352bffb
--- /dev/null
+++ b/internal/source/parser_test.go
@@ -0,0 +1,210 @@
+package source
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+)
+
+// writeSession creates a temp JSONL file and returns a DiscoveredFile for it.
+func writeSession(t *testing.T, lines ...string) DiscoveredFile {
+	t.Helper()
+	dir := t.TempDir()
+	path := filepath.Join(dir, "session.jsonl")
+	if err := os.WriteFile(path, []byte(strings.Join(lines, "\n")+"\n"), 0o600); err != nil {
+		t.Fatal(err)
+	}
+	return DiscoveredFile{
+		Path:      path,
+		SessionID: "test-session",
+		Project:   "test-project",
+	}
+}
+
+func TestParseFile_UserMessages(t *testing.T) {
+	df := writeSession(t,
+		`{"type":"user","timestamp":"2025-06-01T10:00:00Z","cwd":"/tmp/proj"}`,
+		`{"type":"user","timestamp":"2025-06-01T10:05:00Z"}`,
+		`{"type":"user","timestamp":"2025-06-01T10:10:00Z"}`,
+	)
+
+	result := ParseFile(df)
+	if result.Err != nil {
+		t.Fatalf("unexpected error: %v", result.Err)
+	}
+
+	if result.Stats.UserMessages != 3 {
+		t.Errorf("UserMessages = %d, want 3", result.Stats.UserMessages)
+	}
+	if result.Stats.ProjectPath != "/tmp/proj" {
+		t.Errorf("ProjectPath = %q, want /tmp/proj", result.Stats.ProjectPath)
+	}
+}
+
+func TestParseFile_AssistantDedup(t *testing.T) {
+	// Two entries with same message ID — second should win (deduplication).
+	df := writeSession(t,
+		`{"type":"assistant","timestamp":"2025-06-01T10:00:00Z","message":{"id":"msg1","model":"claude-sonnet-4-6-20250514","usage":{"input_tokens":100,"output_tokens":50}}}`,
+		`{"type":"assistant","timestamp":"2025-06-01T10:00:01Z","message":{"id":"msg1","model":"claude-sonnet-4-6-20250514","usage":{"input_tokens":200,"output_tokens":80}}}`,
+	)
+
+	result := ParseFile(df)
+	if result.Err != nil {
+		t.Fatalf("unexpected error: %v", result.Err)
+	}
+
+	if result.Stats.APICalls != 1 {
+		t.Errorf("APICalls = %d, want 1 (dedup)", result.Stats.APICalls)
+	}
+	if result.Stats.InputTokens != 200 {
+		t.Errorf("InputTokens = %d, want 200 (last wins)", result.Stats.InputTokens)
+	}
+	if result.Stats.OutputTokens != 80 {
+		t.Errorf("OutputTokens = %d, want 80 (last wins)", result.Stats.OutputTokens)
+	}
+}
+
+func TestParseFile_TimeRange(t *testing.T) {
+	df := writeSession(t,
+		`{"type":"user","timestamp":"2025-06-01T08:00:00Z"}`,
+		`{"type":"user","timestamp":"2025-06-01T12:00:00Z"}`,
+		`{"type":"user","timestamp":"2025-06-01T10:00:00Z"}`,
+	)
+
+	result := ParseFile(df)
+	if result.Err != nil {
+		t.Fatalf("unexpected error: %v", result.Err)
+	}
+
+	wantStart := time.Date(2025, 6, 1, 8, 0, 0, 0, time.UTC)
+	wantEnd := time.Date(2025, 6, 1, 12, 0, 0, 0, time.UTC)
+
+	if !result.Stats.StartTime.Equal(wantStart) {
+		t.Errorf("StartTime = %v, want %v", result.Stats.StartTime, wantStart)
+	}
+	if !result.Stats.EndTime.Equal(wantEnd) {
+		t.Errorf("EndTime = %v, want %v", result.Stats.EndTime, wantEnd)
+	}
+}
+
+func TestParseFile_SystemDuration(t *testing.T) {
+	df := writeSession(t,
+		`{"type":"system","subtype":"turn_duration","timestamp":"2025-06-01T10:00:00Z","durationMs":5000}`,
+		`{"type":"system","subtype":"turn_duration","timestamp":"2025-06-01T10:01:00Z","durationMs":3000}`,
+	)
+
+	result := ParseFile(df)
+	if result.Err != nil {
+		t.Fatalf("unexpected error: %v", result.Err)
+	}
+
+	if result.Stats.DurationSecs != 8 { // (5000+3000)/1000
+		t.Errorf("DurationSecs = %d, want 8", result.Stats.DurationSecs)
+	}
+}
+
+func TestParseFile_EmptyFile(t *testing.T) {
+	df := writeSession(t)
+	result := ParseFile(df)
+	if result.Err != nil {
+		t.Fatalf("unexpected error on empty file: %v", result.Err)
+	}
+	if result.Stats.UserMessages != 0 || result.Stats.APICalls != 0 {
+		t.Error("expected zero stats for empty file")
+	}
+}
+
+func TestParseFile_MalformedLines(t *testing.T) {
+	df := writeSession(t,
+		`not json at all`,
+		`{"type":"user","timestamp":"2025-06-01T10:00:00Z"}`,
+		`{"type":"assistant","broken json`,
+	)
+
+	result := ParseFile(df)
+	if result.Err != nil {
+		t.Fatalf("unexpected error: %v", result.Err)
+	}
+	// Malformed lines should be skipped, not cause a fatal error.
+	if result.Stats.UserMessages != 1 {
+		t.Errorf("UserMessages = %d, want 1", result.Stats.UserMessages)
+	}
+}
+
+func TestParseFile_CacheTokens(t *testing.T) {
+	df := writeSession(t,
+		`{"type":"assistant","timestamp":"2025-06-01T10:00:00Z","message":{"id":"msg1","model":"claude-sonnet-4-6","usage":{"input_tokens":100,"output_tokens":50,"cache_read_input_tokens":500,"cache_creation":{"ephemeral_5m_input_tokens":200,"ephemeral_1h_input_tokens":300}}}}`,
+	)
+
+	result := ParseFile(df)
+	if result.Err != nil {
+		t.Fatalf("unexpected error: %v", result.Err)
+	}
+
+	s := result.Stats
+	if s.CacheReadTokens != 500 {
+		t.Errorf("CacheReadTokens = %d, want 500", s.CacheReadTokens)
+	}
+	if s.CacheCreation5mTokens != 200 {
+		t.Errorf("CacheCreation5mTokens = %d, want 200", s.CacheCreation5mTokens)
+	}
+	if s.CacheCreation1hTokens != 300 {
+		t.Errorf("CacheCreation1hTokens = %d, want 300", s.CacheCreation1hTokens)
+	}
+}
+
+func TestExtractTopLevelType(t *testing.T) {
+	tests := []struct {
+		name  string
+		input string
+		want  string
+	}{
+		{"user", `{"type":"user","foo":"bar"}`, "user"},
+		{"assistant", `{"type":"assistant","message":{}}`, "assistant"},
+		{"system", `{"type": "system","subtype":"turn_duration"}`, "system"},
+		{"nested type ignored", `{"data":{"type":"progress"},"type":"user"}`, "user"},
+		{"unknown type", `{"type":"progress","data":{}}`, ""},
+		{"no type field", `{"message":"hello"}`, ""},
+		{"empty", `{}`, ""},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := extractTopLevelType([]byte(tt.input))
+			if got != tt.want {
+				t.Errorf("extractTopLevelType(%q) = %q, want %q", tt.input, got, tt.want)
+			}
+		})
+	}
+}
+
+// FuzzExtractTopLevelType tests that the byte-level parser never panics
+// on arbitrary input, which is important since it processes untrusted files.
+func FuzzExtractTopLevelType(f *testing.F) {
+	// Seed corpus with realistic patterns
+	f.Add([]byte(`{"type":"user","timestamp":"2025-06-01T10:00:00Z"}`))
+	f.Add([]byte(`{"type":"assistant","message":{"id":"x","usage":{}}}`))
+	f.Add([]byte(`{"type":"system","subtype":"turn_duration","durationMs":5000}`))
+	f.Add([]byte(`{"data":{"type":"nested"},"type":"user"}`))
+	f.Add([]byte(`not json`))
+	f.Add([]byte(`{}`))
+	f.Add([]byte(`{"type":null}`))
+	f.Add([]byte(`{"type":123}`))
+	f.Add([]byte(``))
+	f.Add([]byte(`{"type":"user`)) // unterminated string
+
+	f.Fuzz(func(t *testing.T, data []byte) {
+		// Must never panic
+		result := extractTopLevelType(data)
+
+		// Result must be one of the known types or empty
+		switch result {
+		case "", "user", "assistant", "system":
+			// ok
+		default:
+			t.Errorf("unexpected type %q from input %q", result, data)
+		}
+	})
+}