feat: add data pipeline with parallel loading, aggregation, and cache integration
Implement the pipeline layer that orchestrates discovery, parsing,
caching, and aggregation:
- pipeline/loader.go: Load() discovers session files via ScanDir,
optionally filters out subagent files, then parses all files in
parallel using a bounded worker pool sized to GOMAXPROCS. Workers
read from a pre-filled channel (no contention on dispatch) and
report progress via an atomic counter and callback. LoadResult
tracks total files, parsed files, parse errors, and file errors.
- pipeline/aggregator.go: Five aggregation functions, all operating
on time-filtered session slices:
* Aggregate: computes SummaryStats across all sessions — total
tokens (5 types), estimated cost, cache savings (summed per-model
via config.CalculateCacheSavings), cache hit rate, and per-active-
day rates (cost, tokens, sessions, prompts, minutes).
* AggregateDays: groups sessions by local calendar date, sorted
most-recent-first.
* AggregateModels: groups by normalized model name with share
percentages, sorted by cost descending.
* AggregateProjects: groups by project name, sorted by cost.
* AggregateHourly: distributes prompt/session/token counts across
24 hour buckets (attributed to session start hour).
Also provides FilterByTime, FilterByProject, FilterByModel with
case-insensitive substring matching.
- pipeline/incremental.go: LoadWithCache() implements the incremental
loading strategy — compares discovered files against the cache's
file_tracker (mtime_ns + size), loads unchanged sessions from
SQLite, and only reparses files that changed. Reparsed results
are immediately saved back to cache. CacheDir/CachePath follow
XDG_CACHE_HOME convention (~/.cache/cburn/metrics.db).
- pipeline/bench_test.go: Benchmarks for ScanDir, ParseFile (worst-
case largest file), full Load, and LoadWithCache to measure the
incremental cache speedup.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
117
internal/pipeline/loader.go
Normal file
117
internal/pipeline/loader.go
Normal file
@@ -0,0 +1,117 @@
|
||||
package pipeline
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"runtime"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"cburn/internal/model"
|
||||
"cburn/internal/source"
|
||||
)
|
||||
|
||||
// LoadResult holds the output of the full data loading pipeline.
type LoadResult struct {
	// Sessions contains one entry per parsed session that showed real
	// activity (at least one API call or user message).
	Sessions []model.SessionStats
	// TotalFiles is the number of files selected for parsing, after any
	// subagent filtering.
	TotalFiles int
	// ParsedFiles counts files that were processed without a file-level
	// error (they may still have contributed per-record parse errors).
	ParsedFiles int
	// ParseErrors is the sum of per-record parse errors accumulated
	// across all successfully read files.
	ParseErrors int
	// FileErrors counts files that could not be read/parsed at all.
	FileErrors int
	// ProjectCount is the project count over ALL discovered files
	// (computed before subagent filtering).
	ProjectCount int
}
|
||||
|
||||
// ProgressFunc is called during loading to report progress.
// current is the number of files processed so far, total is the total count.
//
// NOTE(review): Load invokes this from multiple worker goroutines
// concurrently — callers should pass a concurrency-safe callback; confirm
// existing callers satisfy this.
type ProgressFunc func(current, total int)
|
||||
|
||||
// Load discovers and parses all session files from the Claude data directory.
|
||||
// It uses a bounded worker pool for parallel parsing.
|
||||
func Load(claudeDir string, includeSubagents bool, progressFn ProgressFunc) (*LoadResult, error) {
|
||||
// Discover files
|
||||
files, err := source.ScanDir(claudeDir)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("scanning %s: %w", claudeDir, err)
|
||||
}
|
||||
|
||||
if len(files) == 0 {
|
||||
return &LoadResult{}, nil
|
||||
}
|
||||
|
||||
// Filter subagents if requested
|
||||
var toProcess []source.DiscoveredFile
|
||||
if includeSubagents {
|
||||
toProcess = files
|
||||
} else {
|
||||
for _, f := range files {
|
||||
if !f.IsSubagent {
|
||||
toProcess = append(toProcess, f)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result := &LoadResult{
|
||||
TotalFiles: len(toProcess),
|
||||
ProjectCount: source.CountProjects(files),
|
||||
}
|
||||
|
||||
if len(toProcess) == 0 {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// Parallel parsing with bounded worker pool
|
||||
numWorkers := runtime.GOMAXPROCS(0)
|
||||
if numWorkers < 1 {
|
||||
numWorkers = 4
|
||||
}
|
||||
if numWorkers > len(toProcess) {
|
||||
numWorkers = len(toProcess)
|
||||
}
|
||||
|
||||
type indexedResult struct {
|
||||
idx int
|
||||
result source.ParseResult
|
||||
}
|
||||
|
||||
work := make(chan int, len(toProcess))
|
||||
results := make([]source.ParseResult, len(toProcess))
|
||||
var wg sync.WaitGroup
|
||||
var processed atomic.Int64
|
||||
|
||||
// Feed work
|
||||
for i := range toProcess {
|
||||
work <- i
|
||||
}
|
||||
close(work)
|
||||
|
||||
// Spawn workers
|
||||
wg.Add(numWorkers)
|
||||
for w := 0; w < numWorkers; w++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for idx := range work {
|
||||
results[idx] = source.ParseFile(toProcess[idx])
|
||||
n := processed.Add(1)
|
||||
if progressFn != nil {
|
||||
progressFn(int(n), len(toProcess))
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
// Collect results
|
||||
for _, pr := range results {
|
||||
if pr.Err != nil {
|
||||
result.FileErrors++
|
||||
continue
|
||||
}
|
||||
result.ParsedFiles++
|
||||
result.ParseErrors += pr.ParseErrors
|
||||
if pr.Stats.APICalls > 0 || pr.Stats.UserMessages > 0 {
|
||||
result.Sessions = append(result.Sessions, pr.Stats)
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
Reference in New Issue
Block a user