diff --git a/internal/source/parser.go b/internal/source/parser.go new file mode 100644 index 0000000..071eee1 --- /dev/null +++ b/internal/source/parser.go @@ -0,0 +1,367 @@ +package source + +import ( + "bufio" + "bytes" + "encoding/json" + "os" + "time" + + "cburn/internal/config" + "cburn/internal/model" +) + +// Byte patterns for field extraction. +var ( + patTurnDuration = []byte(`"turn_duration"`) + patDurationMs = []byte(`"durationMs":`) + patTimestamp1 = []byte(`"timestamp":"`) + patTimestamp2 = []byte(`"timestamp": "`) + patCwd1 = []byte(`"cwd":"`) + patCwd2 = []byte(`"cwd": "`) +) + +// ParseResult holds the output of parsing a single JSONL file. +type ParseResult struct { + Stats model.SessionStats + ParseErrors int + Err error +} + +// ParseFile reads a JSONL session file and produces deduplicated session statistics. +// It deduplicates by message.id, keeping only the last entry per ID (final billed usage). +// +// Entry routing by top-level "type" field: +// - "user" → byte-level extraction (timestamp, cwd, count) +// - "system" → byte-level extraction (timestamp, cwd, durationMs) +// - "assistant" → full JSON parse (token usage, model, costs) +// - everything else → skip +func ParseFile(df DiscoveredFile) ParseResult { + f, err := os.Open(df.Path) + if err != nil { + return ParseResult{Err: err} + } + defer f.Close() + + calls := make(map[string]*model.APICall) + + var ( + userMessages int + parseErrors int + totalDuration int64 + minTime time.Time + maxTime time.Time + cwd string + ) + + scanner := bufio.NewScanner(f) + scanner.Buffer(make([]byte, 0, 256*1024), 2*1024*1024) + + for scanner.Scan() { + line := scanner.Bytes() + + entryType := extractTopLevelType(line) + if entryType == "" { + continue + } + + switch entryType { + case "user": + userMessages++ + if ts, ok := extractTimestampBytes(line); ok { + updateTimeRange(&minTime, &maxTime, ts) + } + if cwd == "" { + if c := extractCwdBytes(line); c != "" { + cwd = c + } + } + + case "system": + 
if ts, ok := extractTimestampBytes(line); ok { + updateTimeRange(&minTime, &maxTime, ts) + } + if cwd == "" { + if c := extractCwdBytes(line); c != "" { + cwd = c + } + } + if bytes.Contains(line, patTurnDuration) { + if ms, ok := extractDurationMs(line); ok { + totalDuration += ms + } + } + + case "assistant": + var entry RawEntry + if err := json.Unmarshal(line, &entry); err != nil { + parseErrors++ + continue + } + + if entry.Timestamp != "" { + ts, err := time.Parse(time.RFC3339Nano, entry.Timestamp) + if err == nil { + updateTimeRange(&minTime, &maxTime, ts) + } + } + if cwd == "" && entry.Cwd != "" { + cwd = entry.Cwd + } + if entry.DurationMs > 0 { + totalDuration += entry.DurationMs + } else if entry.Data != nil && entry.Data.DurationMs > 0 { + totalDuration += entry.Data.DurationMs + } + + if entry.Message == nil || entry.Message.ID == "" { + continue + } + msg := entry.Message + if msg.Usage == nil { + continue + } + + u := msg.Usage + var cache5m, cache1h int64 + if u.CacheCreation != nil { + cache5m = u.CacheCreation.Ephemeral5mInputTokens + cache1h = u.CacheCreation.Ephemeral1hInputTokens + } else if u.CacheCreationInputTokens > 0 { + cache5m = u.CacheCreationInputTokens + } + + ts, _ := time.Parse(time.RFC3339Nano, entry.Timestamp) + + calls[msg.ID] = &model.APICall{ + MessageID: msg.ID, + Model: msg.Model, + Timestamp: ts, + InputTokens: u.InputTokens, + OutputTokens: u.OutputTokens, + CacheCreation5mTokens: cache5m, + CacheCreation1hTokens: cache1h, + CacheReadTokens: u.CacheReadInputTokens, + ServiceTier: u.ServiceTier, + } + } + } + + if err := scanner.Err(); err != nil { + return ParseResult{Err: err} + } + + stats := model.SessionStats{ + SessionID: df.SessionID, + Project: df.Project, + ProjectPath: cwd, + FilePath: df.Path, + IsSubagent: df.IsSubagent, + ParentSession: df.ParentSession, + StartTime: minTime, + EndTime: maxTime, + UserMessages: userMessages, + APICalls: len(calls), + Models: make(map[string]*model.ModelUsage), + } + + if 
totalDuration > 0 { + stats.DurationSecs = totalDuration / 1000 + } else if !minTime.IsZero() && !maxTime.IsZero() { + stats.DurationSecs = int64(maxTime.Sub(minTime).Seconds()) + } + + for _, call := range calls { + call.EstimatedCost = config.CalculateCost( + call.Model, + call.InputTokens, + call.OutputTokens, + call.CacheCreation5mTokens, + call.CacheCreation1hTokens, + call.CacheReadTokens, + ) + + stats.InputTokens += call.InputTokens + stats.OutputTokens += call.OutputTokens + stats.CacheCreation5mTokens += call.CacheCreation5mTokens + stats.CacheCreation1hTokens += call.CacheCreation1hTokens + stats.CacheReadTokens += call.CacheReadTokens + stats.EstimatedCost += call.EstimatedCost + + normalized := config.NormalizeModelName(call.Model) + mu, ok := stats.Models[normalized] + if !ok { + mu = &model.ModelUsage{} + stats.Models[normalized] = mu + } + mu.APICalls++ + mu.InputTokens += call.InputTokens + mu.OutputTokens += call.OutputTokens + mu.CacheCreation5mTokens += call.CacheCreation5mTokens + mu.CacheCreation1hTokens += call.CacheCreation1hTokens + mu.CacheReadTokens += call.CacheReadTokens + mu.EstimatedCost += call.EstimatedCost + } + + totalCacheInput := stats.CacheReadTokens + stats.CacheCreation5mTokens + + stats.CacheCreation1hTokens + stats.InputTokens + if totalCacheInput > 0 { + stats.CacheHitRate = float64(stats.CacheReadTokens) / float64(totalCacheInput) + } + + return ParseResult{ + Stats: stats, + ParseErrors: parseErrors, + } +} + +// typeKey is the byte sequence for a JSON key named "type" (with quotes). +var typeKey = []byte(`"type"`) + +// extractTopLevelType finds the top-level "type" field in a JSONL line. +// Tracks brace depth and string boundaries so nested "type" keys are ignored. +// Early-exits once found (~400 bytes in), making cost O(1) vs line length. 
func extractTopLevelType(line []byte) string {
	// Self-contained pattern (mirrors the package-level typeKey) so this
	// scanner has no external dependencies.
	const typeKeyLit = `"type"`
	depth := 0
	for i := 0; i < len(line); {
		switch line[i] {
		case '"':
			if depth == 1 && bytes.HasPrefix(line[i:], []byte(typeKeyLit)) {
				val, isKey := classifyType(line, i+len(typeKeyLit))
				if isKey {
					return val // found the "type" key — done regardless of value
				}
				// "type" appeared as a string value, not a key. Continue scanning.
			}
			i = skipJSONString(line, i)
		case '{':
			depth++
			i++
		case '}':
			depth--
			i++
		default:
			i++
		}
	}
	return ""
}

// classifyType checks whether pos (just past a `"type"` match) follows a JSON
// key (expects : then value). Returns the type value and whether this was a
// valid key:value pair.
// isKey=false means "type" appeared as a value, not a key — caller should continue.
// isKey=true with val=="" means the key exists but its value is non-string,
// unterminated/overlong, or not an entry type the parser routes on.
func classifyType(line []byte, pos int) (val string, isKey bool) {
	i := skipSpaces(line, pos)
	if i >= len(line) || line[i] != ':' {
		return "", false // no colon — this was a value, not a key
	}
	i = skipSpaces(line, i+1)
	if i >= len(line) || line[i] != '"' {
		return "", true // key with non-string value (null, number, etc.)
	}
	i++ // past opening quote

	end := bytes.IndexByte(line[i:], '"')
	if end < 0 || end > 20 {
		return "", true // unterminated or implausibly long type value
	}
	v := string(line[i : i+end])
	switch v {
	case "assistant", "user", "system":
		return v, true
	}
	return "", true // valid key but irrelevant type (e.g., "progress")
}

// skipJSONString advances past a JSON string starting at the opening quote,
// honoring backslash escapes. Returns the index just past the closing quote
// (or end of line if the string is unterminated).
func skipJSONString(line []byte, i int) int {
	i++ // skip opening quote
	for i < len(line) {
		if line[i] == '\\' {
			i += 2
		} else if line[i] == '"' {
			return i + 1
		} else {
			i++
		}
	}
	return i
}

// skipSpaces advances past insignificant JSON whitespace. RFC 8259 allows
// space, tab, CR, and LF between tokens — the previous version skipped only
// ' ', so `"type":\t"user"` failed to classify.
func skipSpaces(line []byte, i int) int {
	for i < len(line) {
		switch line[i] {
		case ' ', '\t', '\r', '\n':
			i++
		default:
			return i
		}
	}
	return i
}

// extractTimestampBytes extracts the timestamp field via byte scanning.
+func extractTimestampBytes(line []byte) (time.Time, bool) { + for _, pat := range [][]byte{patTimestamp1, patTimestamp2} { + idx := bytes.Index(line, pat) + if idx < 0 { + continue + } + start := idx + len(pat) + end := bytes.IndexByte(line[start:], '"') + if end < 0 || end > 40 { + continue + } + ts, err := time.Parse(time.RFC3339Nano, string(line[start:start+end])) + if err != nil { + return time.Time{}, false + } + return ts, true + } + return time.Time{}, false +} + +// extractCwdBytes extracts the cwd field via byte scanning. +func extractCwdBytes(line []byte) string { + for _, pat := range [][]byte{patCwd1, patCwd2} { + idx := bytes.Index(line, pat) + if idx < 0 { + continue + } + start := idx + len(pat) + end := bytes.IndexByte(line[start:], '"') + if end < 0 || end > 1024 { + continue + } + return string(line[start : start+end]) + } + return "" +} + +// extractDurationMs extracts the durationMs integer via byte scanning. +func extractDurationMs(line []byte) (int64, bool) { + idx := bytes.Index(line, patDurationMs) + if idx < 0 { + return 0, false + } + start := idx + len(patDurationMs) + for start < len(line) && line[start] == ' ' { + start++ + } + end := start + for end < len(line) && line[end] >= '0' && line[end] <= '9' { + end++ + } + if end == start { + return 0, false + } + var n int64 + for i := start; i < end; i++ { + n = n*10 + int64(line[i]-'0') + } + return n, true +} + +func updateTimeRange(minTime, maxTime *time.Time, ts time.Time) { + if minTime.IsZero() || ts.Before(*minTime) { + *minTime = ts + } + if maxTime.IsZero() || ts.After(*maxTime) { + *maxTime = ts + } +} diff --git a/internal/source/scanner.go b/internal/source/scanner.go new file mode 100644 index 0000000..90f8d5a --- /dev/null +++ b/internal/source/scanner.go @@ -0,0 +1,121 @@ +package source + +import ( + "os" + "path/filepath" + "strings" +) + +// ScanDir walks the Claude projects directory and discovers all JSONL session files. 
+// It returns discovered files categorized as main sessions or subagent sessions. +func ScanDir(claudeDir string) ([]DiscoveredFile, error) { + projectsDir := filepath.Join(claudeDir, "projects") + + info, err := os.Stat(projectsDir) + if err != nil { + if os.IsNotExist(err) { + return nil, nil + } + return nil, err + } + if !info.IsDir() { + return nil, nil + } + + var files []DiscoveredFile + + err = filepath.WalkDir(projectsDir, func(path string, d os.DirEntry, err error) error { + if err != nil { + return nil // skip unreadable entries + } + if d.IsDir() { + return nil + } + if filepath.Ext(path) != ".jsonl" { + return nil + } + + // Skip sessions-index.json and other non-session files + name := d.Name() + if !strings.HasSuffix(name, ".jsonl") { + return nil + } + + rel, _ := filepath.Rel(projectsDir, path) + parts := strings.Split(rel, string(filepath.Separator)) + if len(parts) < 2 { + return nil + } + + projectDir := parts[0] + project := decodeProjectName(projectDir) + + df := DiscoveredFile{ + Path: path, + Project: project, + ProjectDir: projectDir, + } + + // Determine if this is a subagent file + // Pattern: //subagents/agent-.jsonl + if len(parts) >= 4 && parts[2] == "subagents" { + df.IsSubagent = true + df.ParentSession = parts[1] + // Use parent+agent to avoid collisions across sessions + df.SessionID = parts[1] + "/" + strings.TrimSuffix(name, ".jsonl") + } else { + // Main session: /.jsonl + df.SessionID = strings.TrimSuffix(name, ".jsonl") + } + + files = append(files, df) + return nil + }) + + return files, err +} + +// decodeProjectName extracts a human-readable project name from the encoded directory name. +// Claude Code encodes absolute paths by replacing "/" with "-", so: +// "-Users-tayloreernisse-projects-gitlore" -> "gitlore" +// "-Users-tayloreernisse-projects-my-cool-project" -> "my-cool-project" +// +// We find the last known path component ("projects", "repos", "src", "code", "home") +// and take everything after it. 
Falls back to the last non-empty segment. +func decodeProjectName(dirName string) string { + parts := strings.Split(dirName, "-") + + // Known parent directory names that commonly precede the project name + knownParents := map[string]bool{ + "projects": true, "repos": true, "src": true, + "code": true, "workspace": true, "dev": true, + } + + // Scan for the last known parent marker and join everything after it + for i := len(parts) - 2; i >= 0; i-- { + if knownParents[strings.ToLower(parts[i])] { + name := strings.Join(parts[i+1:], "-") + if name != "" { + return name + } + } + } + + // Fallback: return the last non-empty segment + for i := len(parts) - 1; i >= 0; i-- { + if parts[i] != "" { + return parts[i] + } + } + + return dirName +} + +// CountProjects returns the number of unique projects in a set of discovered files. +func CountProjects(files []DiscoveredFile) int { + seen := make(map[string]struct{}) + for _, f := range files { + seen[f.Project] = struct{}{} + } + return len(seen) +} diff --git a/internal/source/types.go b/internal/source/types.go new file mode 100644 index 0000000..a6bf843 --- /dev/null +++ b/internal/source/types.go @@ -0,0 +1,58 @@ +package source + +// RawEntry represents a single line in a Claude Code JSONL session file. +type RawEntry struct { + Type string `json:"type"` + Subtype string `json:"subtype,omitempty"` + Timestamp string `json:"timestamp,omitempty"` + SessionID string `json:"sessionId,omitempty"` + Cwd string `json:"cwd,omitempty"` + Version string `json:"version,omitempty"` + Message *RawMessage `json:"message,omitempty"` + + // For system entries with subtype "turn_duration" + DurationMs int64 `json:"durationMs,omitempty"` + + // For progress entries with turn_duration data + Data *RawProgressData `json:"data,omitempty"` +} + +// RawProgressData holds typed progress data from system/progress entries. 
type RawProgressData struct {
	Type       string `json:"type"`                  // payload discriminator (e.g. "turn_duration")
	DurationMs int64  `json:"durationMs,omitempty"`  // duration in milliseconds, when present
}

// RawMessage represents the assistant's message envelope.
type RawMessage struct {
	ID    string    `json:"id"`    // API message ID; ParseFile deduplicates on this
	Role  string    `json:"role"`
	Model string    `json:"model"` // model identifier as reported in the entry
	Usage *RawUsage `json:"usage,omitempty"`
}

// RawUsage holds token counts from the API response.
type RawUsage struct {
	InputTokens  int64 `json:"input_tokens"`
	OutputTokens int64 `json:"output_tokens"`
	// Legacy aggregate cache-write count; used only when CacheCreation is absent.
	CacheCreationInputTokens int64 `json:"cache_creation_input_tokens"`
	CacheReadInputTokens     int64 `json:"cache_read_input_tokens"`
	// Per-TTL cache-write breakdown; preferred over CacheCreationInputTokens.
	CacheCreation *CacheCreation `json:"cache_creation,omitempty"`
	ServiceTier   string         `json:"service_tier"`
}

// CacheCreation holds the breakdown of cache write tokens by TTL bucket.
type CacheCreation struct {
	Ephemeral5mInputTokens int64 `json:"ephemeral_5m_input_tokens"` // 5-minute-TTL cache writes
	Ephemeral1hInputTokens int64 `json:"ephemeral_1h_input_tokens"` // 1-hour-TTL cache writes
}

// DiscoveredFile represents a JSONL file found during directory scanning.
type DiscoveredFile struct {
	Path          string
	Project       string // decoded display name (e.g., "gitlore")
	ProjectDir    string // raw directory name
	SessionID     string // extracted from filename; for subagents, prefixed with the parent session
	IsSubagent    bool
	ParentSession string // for subagents: parent session UUID
}