feat: add data pipeline with parallel loading, aggregation, and cache integration

Implement the pipeline layer that orchestrates discovery, parsing, caching, and aggregation: - pipeline/loader.go: Load() discovers session files via ScanDir, optionally filters out subagent files, then parses all files in parallel using a bounded worker pool sized to GOMAXPROCS. Workers read from a pre-filled channel (no contention on dispatch) and report progress via an atomic counter and callback. LoadResult tracks total files, parsed files, parse errors, and file errors. - pipeline/aggregator.go: Five aggregation functions, all operating on time-filtered session slices: * Aggregate: computes SummaryStats across all sessions — total tokens (5 types), estimated cost, cache savings (summed per-model via config.CalculateCacheSavings), cache hit rate, and per-active- day rates (cost, tokens, sessions, prompts, minutes). * AggregateDays: groups sessions by local calendar date, sorted most-recent-first. * AggregateModels: groups by normalized model name with share percentages, sorted by cost descending. * AggregateProjects: groups by project name, sorted by cost. * AggregateHourly: distributes prompt/session/token counts across 24 hour buckets (attributed to session start hour). Also provides FilterByTime, FilterByProject, FilterByModel with case-insensitive substring matching. - pipeline/incremental.go: LoadWithCache() implements the incremental loading strategy — compares discovered files against the cache's file_tracker (mtime_ns + size), loads unchanged sessions from SQLite, and only reparses files that changed. Reparsed results are immediately saved back to cache. CacheDir/CachePath follow XDG_CACHE_HOME convention (~/.cache/cburn/metrics.db). - pipeline/bench_test.go: Benchmarks for ScanDir, ParseFile (worst- case largest file), full Load, and LoadWithCache to measure the incremental cache speedup. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 13:01:40 -05:00
parent 0e9091f56e
commit 24454247a3
4 changed files with 647 additions and 0 deletions
--- a/internal/pipeline/aggregator.go
+++ b/internal/pipeline/aggregator.go
@@ -0,0 +1,261 @@
+package pipeline
+
+import (
+	"sort"
+	"strings"
+	"time"
+
+	"cburn/internal/config"
+	"cburn/internal/model"
+)
+
+// Aggregate computes summary statistics from a slice of session stats,
+// filtered to sessions within the given time range.
+func Aggregate(sessions []model.SessionStats, since, until time.Time) model.SummaryStats {
+	filtered := FilterByTime(sessions, since, until)
+
+	var stats model.SummaryStats
+	activeDays := make(map[string]struct{})
+
+	for _, s := range filtered {
+		stats.TotalSessions++
+		stats.TotalPrompts += s.UserMessages
+		stats.TotalAPICalls += s.APICalls
+		stats.TotalDurationSecs += s.DurationSecs
+
+		stats.InputTokens += s.InputTokens
+		stats.OutputTokens += s.OutputTokens
+		stats.CacheCreation5mTokens += s.CacheCreation5mTokens
+		stats.CacheCreation1hTokens += s.CacheCreation1hTokens
+		stats.CacheReadTokens += s.CacheReadTokens
+		stats.EstimatedCost += s.EstimatedCost
+
+		if !s.StartTime.IsZero() {
+			day := s.StartTime.Local().Format("2006-01-02")
+			activeDays[day] = struct{}{}
+		}
+	}
+
+	stats.ActiveDays = len(activeDays)
+	stats.TotalBilledTokens = stats.InputTokens + stats.OutputTokens +
+		stats.CacheCreation5mTokens + stats.CacheCreation1hTokens
+
+	// Cache hit rate
+	totalCacheInput := stats.CacheReadTokens + stats.CacheCreation5mTokens +
+		stats.CacheCreation1hTokens + stats.InputTokens
+	if totalCacheInput > 0 {
+		stats.CacheHitRate = float64(stats.CacheReadTokens) / float64(totalCacheInput)
+	}
+
+	// Cache savings (sum across all models found in sessions)
+	for _, s := range filtered {
+		for modelName, mu := range s.Models {
+			stats.CacheSavings += config.CalculateCacheSavings(modelName, mu.CacheReadTokens)
+		}
+	}
+
+	// Per-active-day rates
+	if stats.ActiveDays > 0 {
+		days := float64(stats.ActiveDays)
+		stats.CostPerDay = stats.EstimatedCost / days
+		stats.TokensPerDay = int64(float64(stats.TotalBilledTokens) / days)
+		stats.SessionsPerDay = float64(stats.TotalSessions) / days
+		stats.PromptsPerDay = float64(stats.TotalPrompts) / days
+		stats.MinutesPerDay = float64(stats.TotalDurationSecs) / 60 / days
+	}
+
+	return stats
+}
+
+// AggregateDays computes per-day statistics from sessions.
+func AggregateDays(sessions []model.SessionStats, since, until time.Time) []model.DailyStats {
+	filtered := FilterByTime(sessions, since, until)
+
+	dayMap := make(map[string]*model.DailyStats)
+
+	for _, s := range filtered {
+		if s.StartTime.IsZero() {
+			continue
+		}
+		dayKey := s.StartTime.Local().Format("2006-01-02")
+		ds, ok := dayMap[dayKey]
+		if !ok {
+			t, _ := time.ParseInLocation("2006-01-02", dayKey, time.Local)
+			ds = &model.DailyStats{Date: t}
+			dayMap[dayKey] = ds
+		}
+
+		ds.Sessions++
+		ds.Prompts += s.UserMessages
+		ds.APICalls += s.APICalls
+		ds.DurationSecs += s.DurationSecs
+		ds.InputTokens += s.InputTokens
+		ds.OutputTokens += s.OutputTokens
+		ds.CacheCreation5m += s.CacheCreation5mTokens
+		ds.CacheCreation1h += s.CacheCreation1hTokens
+		ds.CacheReadTokens += s.CacheReadTokens
+		ds.EstimatedCost += s.EstimatedCost
+	}
+
+	// Convert to sorted slice (most recent first)
+	days := make([]model.DailyStats, 0, len(dayMap))
+	for _, ds := range dayMap {
+		days = append(days, *ds)
+	}
+	sort.Slice(days, func(i, j int) bool {
+		return days[i].Date.After(days[j].Date)
+	})
+
+	return days
+}
+
+// AggregateModels computes per-model statistics from sessions.
+func AggregateModels(sessions []model.SessionStats, since, until time.Time) []model.ModelStats {
+	filtered := FilterByTime(sessions, since, until)
+
+	modelMap := make(map[string]*model.ModelStats)
+	totalCalls := 0
+
+	for _, s := range filtered {
+		for modelName, mu := range s.Models {
+			ms, ok := modelMap[modelName]
+			if !ok {
+				ms = &model.ModelStats{Model: modelName}
+				modelMap[modelName] = ms
+			}
+			ms.APICalls += mu.APICalls
+			ms.InputTokens += mu.InputTokens
+			ms.OutputTokens += mu.OutputTokens
+			ms.CacheCreation5m += mu.CacheCreation5mTokens
+			ms.CacheCreation1h += mu.CacheCreation1hTokens
+			ms.CacheReadTokens += mu.CacheReadTokens
+			ms.EstimatedCost += mu.EstimatedCost
+			totalCalls += mu.APICalls
+		}
+	}
+
+	// Compute share percentages and sort by cost descending
+	models := make([]model.ModelStats, 0, len(modelMap))
+	for _, ms := range modelMap {
+		if totalCalls > 0 {
+			ms.SharePercent = float64(ms.APICalls) / float64(totalCalls) * 100
+		}
+		models = append(models, *ms)
+	}
+	sort.Slice(models, func(i, j int) bool {
+		return models[i].EstimatedCost > models[j].EstimatedCost
+	})
+
+	return models
+}
+
+// AggregateProjects computes per-project statistics from sessions.
+func AggregateProjects(sessions []model.SessionStats, since, until time.Time) []model.ProjectStats {
+	filtered := FilterByTime(sessions, since, until)
+
+	projMap := make(map[string]*model.ProjectStats)
+
+	for _, s := range filtered {
+		ps, ok := projMap[s.Project]
+		if !ok {
+			ps = &model.ProjectStats{Project: s.Project}
+			projMap[s.Project] = ps
+		}
+		ps.Sessions++
+		ps.Prompts += s.UserMessages
+		ps.TotalTokens += s.InputTokens + s.OutputTokens +
+			s.CacheCreation5mTokens + s.CacheCreation1hTokens
+		ps.EstimatedCost += s.EstimatedCost
+	}
+
+	// Sort by cost descending
+	projects := make([]model.ProjectStats, 0, len(projMap))
+	for _, ps := range projMap {
+		projects = append(projects, *ps)
+	}
+	sort.Slice(projects, func(i, j int) bool {
+		return projects[i].EstimatedCost > projects[j].EstimatedCost
+	})
+
+	return projects
+}
+
+// AggregateHourly computes prompt counts by hour of day.
+func AggregateHourly(sessions []model.SessionStats, since, until time.Time) []model.HourlyStats {
+	filtered := FilterByTime(sessions, since, until)
+
+	hours := make([]model.HourlyStats, 24)
+	for i := range hours {
+		hours[i].Hour = i
+	}
+
+	// We attribute all prompts and tokens to the session's start hour
+	for _, s := range filtered {
+		if s.StartTime.IsZero() {
+			continue
+		}
+		h := s.StartTime.Local().Hour()
+		hours[h].Prompts += s.UserMessages
+		hours[h].Sessions++
+		hours[h].Tokens += s.InputTokens + s.OutputTokens
+	}
+
+	return hours
+}
+
+// FilterByTime returns sessions whose start time falls within [since, until).
+func FilterByTime(sessions []model.SessionStats, since, until time.Time) []model.SessionStats {
+	if since.IsZero() && until.IsZero() {
+		return sessions
+	}
+
+	var result []model.SessionStats
+	for _, s := range sessions {
+		if s.StartTime.IsZero() {
+			continue
+		}
+		if !since.IsZero() && s.StartTime.Before(since) {
+			continue
+		}
+		if !until.IsZero() && !s.StartTime.Before(until) {
+			continue
+		}
+		result = append(result, s)
+	}
+	return result
+}
+
+// FilterByProject returns sessions matching the project substring.
+func FilterByProject(sessions []model.SessionStats, project string) []model.SessionStats {
+	if project == "" {
+		return sessions
+	}
+	var result []model.SessionStats
+	for _, s := range sessions {
+		if containsIgnoreCase(s.Project, project) {
+			result = append(result, s)
+		}
+	}
+	return result
+}
+
+// FilterByModel returns sessions that have at least one API call to the given model.
+func FilterByModel(sessions []model.SessionStats, modelFilter string) []model.SessionStats {
+	if modelFilter == "" {
+		return sessions
+	}
+	var result []model.SessionStats
+	for _, s := range sessions {
+		for m := range s.Models {
+			if containsIgnoreCase(m, modelFilter) {
+				result = append(result, s)
+				break
+			}
+		}
+	}
+	return result
+}
+
+func containsIgnoreCase(s, substr string) bool {
+	return strings.Contains(strings.ToLower(s), strings.ToLower(substr))
+}
--- a/internal/pipeline/bench_test.go
+++ b/internal/pipeline/bench_test.go
@@ -0,0 +1,92 @@
+package pipeline
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"cburn/internal/source"
+	"cburn/internal/store"
+)
+
+func BenchmarkLoad(b *testing.B) {
+	homeDir, _ := os.UserHomeDir()
+	claudeDir := filepath.Join(homeDir, ".claude")
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		result, err := Load(claudeDir, true, nil)
+		if err != nil {
+			b.Fatal(err)
+		}
+		_ = result
+	}
+}
+
+func BenchmarkParseFile(b *testing.B) {
+	homeDir, _ := os.UserHomeDir()
+	claudeDir := filepath.Join(homeDir, ".claude")
+
+	files, err := source.ScanDir(claudeDir)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	// Find the largest file for worst-case benchmarking
+	var biggest source.DiscoveredFile
+	var biggestSize int64
+	for _, f := range files {
+		info, err := os.Stat(f.Path)
+		if err != nil {
+			continue
+		}
+		if info.Size() > biggestSize {
+			biggestSize = info.Size()
+			biggest = f
+		}
+	}
+
+	b.Logf("Benchmarking largest file: %s (%.1f KB)", biggest.Path, float64(biggestSize)/1024)
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		result := source.ParseFile(biggest)
+		if result.Err != nil {
+			b.Fatal(result.Err)
+		}
+	}
+}
+
+func BenchmarkScanDir(b *testing.B) {
+	homeDir, _ := os.UserHomeDir()
+	claudeDir := filepath.Join(homeDir, ".claude")
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		files, err := source.ScanDir(claudeDir)
+		if err != nil {
+			b.Fatal(err)
+		}
+		_ = files
+	}
+}
+
+func BenchmarkLoadWithCache(b *testing.B) {
+	homeDir, _ := os.UserHomeDir()
+	claudeDir := filepath.Join(homeDir, ".claude")
+
+	cache, err := store.Open(CachePath())
+	if err != nil {
+		b.Fatal(err)
+	}
+	defer cache.Close()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		cr, err := LoadWithCache(claudeDir, true, cache, nil)
+		if err != nil {
+			b.Fatal(err)
+		}
+		_ = cr
+	}
+}
--- a/internal/pipeline/incremental.go
+++ b/internal/pipeline/incremental.go
@@ -0,0 +1,177 @@
+package pipeline
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"sync"
+	"sync/atomic"
+
+	"cburn/internal/source"
+	"cburn/internal/store"
+)
+
+// CachedLoadResult extends LoadResult with cache metadata.
+type CachedLoadResult struct {
+	LoadResult
+	CacheHits int
+	Reparsed  int
+}
+
+// LoadWithCache discovers, diffs against cache, parses only changed files,
+// and returns the combined result set.
+func LoadWithCache(claudeDir string, includeSubagents bool, cache *store.Cache, progressFn ProgressFunc) (*CachedLoadResult, error) {
+	// Discover files
+	files, err := source.ScanDir(claudeDir)
+	if err != nil {
+		return nil, fmt.Errorf("scanning %s: %w", claudeDir, err)
+	}
+
+	if len(files) == 0 {
+		return &CachedLoadResult{}, nil
+	}
+
+	// Filter subagents if requested
+	var toProcess []source.DiscoveredFile
+	if includeSubagents {
+		toProcess = files
+	} else {
+		for _, f := range files {
+			if !f.IsSubagent {
+				toProcess = append(toProcess, f)
+			}
+		}
+	}
+
+	result := &CachedLoadResult{
+		LoadResult: LoadResult{
+			TotalFiles:   len(toProcess),
+			ProjectCount: source.CountProjects(files),
+		},
+	}
+
+	if len(toProcess) == 0 {
+		return result, nil
+	}
+
+	// Get tracked files from cache
+	tracked, err := cache.GetTrackedFiles()
+	if err != nil {
+		return nil, fmt.Errorf("reading cache: %w", err)
+	}
+
+	// Diff: partition into changed and unchanged
+	var toReparse []source.DiscoveredFile
+	var unchanged []string // file paths that haven't changed
+
+	for _, f := range toProcess {
+		info, err := os.Stat(f.Path)
+		if err != nil {
+			continue
+		}
+
+		cached, ok := tracked[f.Path]
+		if ok && cached.MtimeNs == info.ModTime().UnixNano() && cached.SizeBytes == info.Size() {
+			unchanged = append(unchanged, f.Path)
+		} else {
+			toReparse = append(toReparse, f)
+		}
+	}
+
+	result.CacheHits = len(unchanged)
+	result.Reparsed = len(toReparse)
+
+	// Load cached sessions
+	if len(unchanged) > 0 {
+		cached, err := cache.LoadAllSessions()
+		if err != nil {
+			return nil, fmt.Errorf("loading cached sessions: %w", err)
+		}
+
+		// Filter to only sessions from unchanged files
+		unchangedSet := make(map[string]struct{}, len(unchanged))
+		for _, p := range unchanged {
+			unchangedSet[p] = struct{}{}
+		}
+		for _, s := range cached {
+			if _, ok := unchangedSet[s.FilePath]; ok {
+				result.Sessions = append(result.Sessions, s)
+				result.ParsedFiles++
+			}
+		}
+	}
+
+	// Parse changed files
+	if len(toReparse) > 0 {
+		numWorkers := runtime.GOMAXPROCS(0)
+		if numWorkers < 1 {
+			numWorkers = 4
+		}
+		if numWorkers > len(toReparse) {
+			numWorkers = len(toReparse)
+		}
+
+		work := make(chan int, len(toReparse))
+		results := make([]source.ParseResult, len(toReparse))
+		var wg sync.WaitGroup
+		var processed atomic.Int64
+
+		for i := range toReparse {
+			work <- i
+		}
+		close(work)
+
+		wg.Add(numWorkers)
+		for w := 0; w < numWorkers; w++ {
+			go func() {
+				defer wg.Done()
+				for idx := range work {
+					results[idx] = source.ParseFile(toReparse[idx])
+					n := processed.Add(1)
+					if progressFn != nil {
+						progressFn(int(n)+result.CacheHits, result.TotalFiles)
+					}
+				}
+			}()
+		}
+
+		wg.Wait()
+
+		// Collect and cache results
+		for i, pr := range results {
+			if pr.Err != nil {
+				result.FileErrors++
+				continue
+			}
+			result.ParsedFiles++
+			result.ParseErrors += pr.ParseErrors
+
+			if pr.Stats.APICalls > 0 || pr.Stats.UserMessages > 0 {
+				result.Sessions = append(result.Sessions, pr.Stats)
+
+				// Save to cache
+				info, err := os.Stat(toReparse[i].Path)
+				if err == nil {
+					_ = cache.SaveSession(pr.Stats, info.ModTime().UnixNano(), info.Size())
+				}
+			}
+		}
+	}
+
+	return result, nil
+}
+
+// CacheDir returns the platform-appropriate cache directory.
+func CacheDir() string {
+	if xdg := os.Getenv("XDG_CACHE_HOME"); xdg != "" {
+		return filepath.Join(xdg, "cburn")
+	}
+	home, _ := os.UserHomeDir()
+	return filepath.Join(home, ".cache", "cburn")
+}
+
+// CachePath returns the full path to the cache database.
+func CachePath() string {
+	return filepath.Join(CacheDir(), "metrics.db")
+}
--- a/internal/pipeline/loader.go
+++ b/internal/pipeline/loader.go
@@ -0,0 +1,117 @@
+package pipeline
+
+import (
+	"fmt"
+	"runtime"
+	"sync"
+	"sync/atomic"
+
+	"cburn/internal/model"
+	"cburn/internal/source"
+)
+
+// LoadResult holds the output of the full data loading pipeline.
+type LoadResult struct {
+	Sessions     []model.SessionStats
+	TotalFiles   int
+	ParsedFiles  int
+	ParseErrors  int
+	FileErrors   int
+	ProjectCount int
+}
+
+// ProgressFunc is called during loading to report progress.
+// current is the number of files processed so far, total is the total count.
+type ProgressFunc func(current, total int)
+
+// Load discovers and parses all session files from the Claude data directory.
+// It uses a bounded worker pool for parallel parsing.
+func Load(claudeDir string, includeSubagents bool, progressFn ProgressFunc) (*LoadResult, error) {
+	// Discover files
+	files, err := source.ScanDir(claudeDir)
+	if err != nil {
+		return nil, fmt.Errorf("scanning %s: %w", claudeDir, err)
+	}
+
+	if len(files) == 0 {
+		return &LoadResult{}, nil
+	}
+
+	// Filter subagents if requested
+	var toProcess []source.DiscoveredFile
+	if includeSubagents {
+		toProcess = files
+	} else {
+		for _, f := range files {
+			if !f.IsSubagent {
+				toProcess = append(toProcess, f)
+			}
+		}
+	}
+
+	result := &LoadResult{
+		TotalFiles:   len(toProcess),
+		ProjectCount: source.CountProjects(files),
+	}
+
+	if len(toProcess) == 0 {
+		return result, nil
+	}
+
+	// Parallel parsing with bounded worker pool
+	numWorkers := runtime.GOMAXPROCS(0)
+	if numWorkers < 1 {
+		numWorkers = 4
+	}
+	if numWorkers > len(toProcess) {
+		numWorkers = len(toProcess)
+	}
+
+	type indexedResult struct {
+		idx    int
+		result source.ParseResult
+	}
+
+	work := make(chan int, len(toProcess))
+	results := make([]source.ParseResult, len(toProcess))
+	var wg sync.WaitGroup
+	var processed atomic.Int64
+
+	// Feed work
+	for i := range toProcess {
+		work <- i
+	}
+	close(work)
+
+	// Spawn workers
+	wg.Add(numWorkers)
+	for w := 0; w < numWorkers; w++ {
+		go func() {
+			defer wg.Done()
+			for idx := range work {
+				results[idx] = source.ParseFile(toProcess[idx])
+				n := processed.Add(1)
+				if progressFn != nil {
+					progressFn(int(n), len(toProcess))
+				}
+			}
+		}()
+	}
+
+	wg.Wait()
+
+	// Collect results
+	for _, pr := range results {
+		if pr.Err != nil {
+			result.FileErrors++
+			continue
+		}
+		result.ParsedFiles++
+		result.ParseErrors += pr.ParseErrors
+		if pr.Stats.APICalls > 0 || pr.Stats.UserMessages > 0 {
+			result.Sessions = append(result.Sessions, pr.Stats)
+		}
+	}
+
+	return result, nil
+}