feat: add data pipeline with parallel loading, aggregation, and cache integration
Implement the pipeline layer that orchestrates discovery, parsing,
caching, and aggregation:
- pipeline/loader.go: Load() discovers session files via ScanDir,
optionally filters out subagent files, then parses all files in
parallel using a bounded worker pool sized to GOMAXPROCS. Workers
read from a pre-filled channel (no contention on dispatch) and
report progress via an atomic counter and callback. LoadResult
tracks total files, parsed files, parse errors, and file errors.
- pipeline/aggregator.go: Five aggregation functions, all operating
on time-filtered session slices:
* Aggregate: computes SummaryStats across all sessions — total
tokens (5 types), estimated cost, cache savings (summed per-model
via config.CalculateCacheSavings), cache hit rate, and per-active-
day rates (cost, tokens, sessions, prompts, minutes).
* AggregateDays: groups sessions by local calendar date, sorted
most-recent-first.
* AggregateModels: groups by normalized model name with share
percentages, sorted by cost descending.
* AggregateProjects: groups by project name, sorted by cost.
* AggregateHourly: distributes prompt/session/token counts across
24 hour buckets (attributed to session start hour).
Also provides FilterByTime, FilterByProject, FilterByModel with
case-insensitive substring matching.
- pipeline/incremental.go: LoadWithCache() implements the incremental
loading strategy — compares discovered files against the cache's
file_tracker (mtime_ns + size), loads unchanged sessions from
SQLite, and only reparses files that changed. Reparsed results
are immediately saved back to cache. CacheDir/CachePath follow
XDG_CACHE_HOME convention (~/.cache/cburn/metrics.db).
- pipeline/bench_test.go: Benchmarks for ScanDir, ParseFile (worst-
case largest file), full Load, and LoadWithCache to measure the
incremental cache speedup.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
117
internal/pipeline/loader.go
Normal file
117
internal/pipeline/loader.go
Normal file
@@ -0,0 +1,117 @@
|
||||
package pipeline
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"runtime"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"cburn/internal/model"
|
||||
"cburn/internal/source"
|
||||
)
|
||||
|
||||
// LoadResult holds the output of the full data loading pipeline.
type LoadResult struct {
	// Sessions contains one entry per parsed session that showed real
	// activity (at least one API call or user message).
	Sessions []model.SessionStats
	// TotalFiles is the number of files selected for parsing, after any
	// subagent filtering.
	TotalFiles int
	// ParsedFiles counts files that were processed without a file-level
	// error (they may still have contributed per-record parse errors).
	ParsedFiles int
	// ParseErrors is the sum of per-record parse errors accumulated
	// across all successfully read files.
	ParseErrors int
	// FileErrors counts files that could not be read/parsed at all.
	FileErrors int
	// ProjectCount is the project count over ALL discovered files
	// (computed before subagent filtering).
	ProjectCount int
}
|
||||
|
||||
// ProgressFunc is called during loading to report progress.
// current is the number of files processed so far, total is the total count.
//
// NOTE(review): Load invokes this from multiple worker goroutines
// concurrently — callers should pass a concurrency-safe callback; confirm
// existing callers satisfy this.
type ProgressFunc func(current, total int)
|
||||
|
||||
// Load discovers and parses all session files from the Claude data directory.
|
||||
// It uses a bounded worker pool for parallel parsing.
|
||||
func Load(claudeDir string, includeSubagents bool, progressFn ProgressFunc) (*LoadResult, error) {
|
||||
// Discover files
|
||||
files, err := source.ScanDir(claudeDir)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("scanning %s: %w", claudeDir, err)
|
||||
}
|
||||
|
||||
if len(files) == 0 {
|
||||
return &LoadResult{}, nil
|
||||
}
|
||||
|
||||
// Filter subagents if requested
|
||||
var toProcess []source.DiscoveredFile
|
||||
if includeSubagents {
|
||||
toProcess = files
|
||||
} else {
|
||||
for _, f := range files {
|
||||
if !f.IsSubagent {
|
||||
toProcess = append(toProcess, f)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result := &LoadResult{
|
||||
TotalFiles: len(toProcess),
|
||||
ProjectCount: source.CountProjects(files),
|
||||
}
|
||||
|
||||
if len(toProcess) == 0 {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// Parallel parsing with bounded worker pool
|
||||
numWorkers := runtime.GOMAXPROCS(0)
|
||||
if numWorkers < 1 {
|
||||
numWorkers = 4
|
||||
}
|
||||
if numWorkers > len(toProcess) {
|
||||
numWorkers = len(toProcess)
|
||||
}
|
||||
|
||||
type indexedResult struct {
|
||||
idx int
|
||||
result source.ParseResult
|
||||
}
|
||||
|
||||
work := make(chan int, len(toProcess))
|
||||
results := make([]source.ParseResult, len(toProcess))
|
||||
var wg sync.WaitGroup
|
||||
var processed atomic.Int64
|
||||
|
||||
// Feed work
|
||||
for i := range toProcess {
|
||||
work <- i
|
||||
}
|
||||
close(work)
|
||||
|
||||
// Spawn workers
|
||||
wg.Add(numWorkers)
|
||||
for w := 0; w < numWorkers; w++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for idx := range work {
|
||||
results[idx] = source.ParseFile(toProcess[idx])
|
||||
n := processed.Add(1)
|
||||
if progressFn != nil {
|
||||
progressFn(int(n), len(toProcess))
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
// Collect results
|
||||
for _, pr := range results {
|
||||
if pr.Err != nil {
|
||||
result.FileErrors++
|
||||
continue
|
||||
}
|
||||
result.ParsedFiles++
|
||||
result.ParseErrors += pr.ParseErrors
|
||||
if pr.Stats.APICalls > 0 || pr.Stats.UserMessages > 0 {
|
||||
result.Sessions = append(result.Sessions, pr.Stats)
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
Reference in New Issue
Block a user