feat: add data pipeline with parallel loading, aggregation, and cache integration
Implement the pipeline layer that orchestrates discovery, parsing,
caching, and aggregation:
- pipeline/loader.go: Load() discovers session files via ScanDir,
optionally filters out subagent files, then parses all files in
parallel using a bounded worker pool sized to GOMAXPROCS. Workers
read from a pre-filled channel (no contention on dispatch) and
report progress via an atomic counter and callback. LoadResult
tracks total files, parsed files, parse errors, and file errors.
- pipeline/aggregator.go: Five aggregation functions, all operating
on time-filtered session slices:
* Aggregate: computes SummaryStats across all sessions — total
tokens (5 types), estimated cost, cache savings (summed per-model
via config.CalculateCacheSavings), cache hit rate, and per-active-
day rates (cost, tokens, sessions, prompts, minutes).
* AggregateDays: groups sessions by local calendar date, sorted
most-recent-first.
* AggregateModels: groups by normalized model name with share
percentages, sorted by cost descending.
* AggregateProjects: groups by project name, sorted by cost.
* AggregateHourly: distributes prompt/session/token counts across
24 hour buckets (attributed to session start hour).
Also provides FilterByTime, FilterByProject, FilterByModel with
case-insensitive substring matching.
- pipeline/incremental.go: LoadWithCache() implements the incremental
loading strategy — compares discovered files against the cache's
file_tracker (mtime_ns + size), loads unchanged sessions from
SQLite, and only reparses files that changed. Reparsed results
are immediately saved back to cache. CacheDir/CachePath follow
XDG_CACHE_HOME convention (~/.cache/cburn/metrics.db).
- pipeline/bench_test.go: Benchmarks for ScanDir, ParseFile (worst-
case largest file), full Load, and LoadWithCache to measure the
incremental cache speedup.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
261
internal/pipeline/aggregator.go
Normal file
261
internal/pipeline/aggregator.go
Normal file
@@ -0,0 +1,261 @@
|
|||||||
|
package pipeline
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"cburn/internal/config"
|
||||||
|
"cburn/internal/model"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Aggregate computes summary statistics from a slice of session stats,
|
||||||
|
// filtered to sessions within the given time range.
|
||||||
|
func Aggregate(sessions []model.SessionStats, since, until time.Time) model.SummaryStats {
|
||||||
|
filtered := FilterByTime(sessions, since, until)
|
||||||
|
|
||||||
|
var stats model.SummaryStats
|
||||||
|
activeDays := make(map[string]struct{})
|
||||||
|
|
||||||
|
for _, s := range filtered {
|
||||||
|
stats.TotalSessions++
|
||||||
|
stats.TotalPrompts += s.UserMessages
|
||||||
|
stats.TotalAPICalls += s.APICalls
|
||||||
|
stats.TotalDurationSecs += s.DurationSecs
|
||||||
|
|
||||||
|
stats.InputTokens += s.InputTokens
|
||||||
|
stats.OutputTokens += s.OutputTokens
|
||||||
|
stats.CacheCreation5mTokens += s.CacheCreation5mTokens
|
||||||
|
stats.CacheCreation1hTokens += s.CacheCreation1hTokens
|
||||||
|
stats.CacheReadTokens += s.CacheReadTokens
|
||||||
|
stats.EstimatedCost += s.EstimatedCost
|
||||||
|
|
||||||
|
if !s.StartTime.IsZero() {
|
||||||
|
day := s.StartTime.Local().Format("2006-01-02")
|
||||||
|
activeDays[day] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
stats.ActiveDays = len(activeDays)
|
||||||
|
stats.TotalBilledTokens = stats.InputTokens + stats.OutputTokens +
|
||||||
|
stats.CacheCreation5mTokens + stats.CacheCreation1hTokens
|
||||||
|
|
||||||
|
// Cache hit rate
|
||||||
|
totalCacheInput := stats.CacheReadTokens + stats.CacheCreation5mTokens +
|
||||||
|
stats.CacheCreation1hTokens + stats.InputTokens
|
||||||
|
if totalCacheInput > 0 {
|
||||||
|
stats.CacheHitRate = float64(stats.CacheReadTokens) / float64(totalCacheInput)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cache savings (sum across all models found in sessions)
|
||||||
|
for _, s := range filtered {
|
||||||
|
for modelName, mu := range s.Models {
|
||||||
|
stats.CacheSavings += config.CalculateCacheSavings(modelName, mu.CacheReadTokens)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Per-active-day rates
|
||||||
|
if stats.ActiveDays > 0 {
|
||||||
|
days := float64(stats.ActiveDays)
|
||||||
|
stats.CostPerDay = stats.EstimatedCost / days
|
||||||
|
stats.TokensPerDay = int64(float64(stats.TotalBilledTokens) / days)
|
||||||
|
stats.SessionsPerDay = float64(stats.TotalSessions) / days
|
||||||
|
stats.PromptsPerDay = float64(stats.TotalPrompts) / days
|
||||||
|
stats.MinutesPerDay = float64(stats.TotalDurationSecs) / 60 / days
|
||||||
|
}
|
||||||
|
|
||||||
|
return stats
|
||||||
|
}
|
||||||
|
|
||||||
|
// AggregateDays computes per-day statistics from sessions.
|
||||||
|
func AggregateDays(sessions []model.SessionStats, since, until time.Time) []model.DailyStats {
|
||||||
|
filtered := FilterByTime(sessions, since, until)
|
||||||
|
|
||||||
|
dayMap := make(map[string]*model.DailyStats)
|
||||||
|
|
||||||
|
for _, s := range filtered {
|
||||||
|
if s.StartTime.IsZero() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
dayKey := s.StartTime.Local().Format("2006-01-02")
|
||||||
|
ds, ok := dayMap[dayKey]
|
||||||
|
if !ok {
|
||||||
|
t, _ := time.ParseInLocation("2006-01-02", dayKey, time.Local)
|
||||||
|
ds = &model.DailyStats{Date: t}
|
||||||
|
dayMap[dayKey] = ds
|
||||||
|
}
|
||||||
|
|
||||||
|
ds.Sessions++
|
||||||
|
ds.Prompts += s.UserMessages
|
||||||
|
ds.APICalls += s.APICalls
|
||||||
|
ds.DurationSecs += s.DurationSecs
|
||||||
|
ds.InputTokens += s.InputTokens
|
||||||
|
ds.OutputTokens += s.OutputTokens
|
||||||
|
ds.CacheCreation5m += s.CacheCreation5mTokens
|
||||||
|
ds.CacheCreation1h += s.CacheCreation1hTokens
|
||||||
|
ds.CacheReadTokens += s.CacheReadTokens
|
||||||
|
ds.EstimatedCost += s.EstimatedCost
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert to sorted slice (most recent first)
|
||||||
|
days := make([]model.DailyStats, 0, len(dayMap))
|
||||||
|
for _, ds := range dayMap {
|
||||||
|
days = append(days, *ds)
|
||||||
|
}
|
||||||
|
sort.Slice(days, func(i, j int) bool {
|
||||||
|
return days[i].Date.After(days[j].Date)
|
||||||
|
})
|
||||||
|
|
||||||
|
return days
|
||||||
|
}
|
||||||
|
|
||||||
|
// AggregateModels computes per-model statistics from sessions.
|
||||||
|
func AggregateModels(sessions []model.SessionStats, since, until time.Time) []model.ModelStats {
|
||||||
|
filtered := FilterByTime(sessions, since, until)
|
||||||
|
|
||||||
|
modelMap := make(map[string]*model.ModelStats)
|
||||||
|
totalCalls := 0
|
||||||
|
|
||||||
|
for _, s := range filtered {
|
||||||
|
for modelName, mu := range s.Models {
|
||||||
|
ms, ok := modelMap[modelName]
|
||||||
|
if !ok {
|
||||||
|
ms = &model.ModelStats{Model: modelName}
|
||||||
|
modelMap[modelName] = ms
|
||||||
|
}
|
||||||
|
ms.APICalls += mu.APICalls
|
||||||
|
ms.InputTokens += mu.InputTokens
|
||||||
|
ms.OutputTokens += mu.OutputTokens
|
||||||
|
ms.CacheCreation5m += mu.CacheCreation5mTokens
|
||||||
|
ms.CacheCreation1h += mu.CacheCreation1hTokens
|
||||||
|
ms.CacheReadTokens += mu.CacheReadTokens
|
||||||
|
ms.EstimatedCost += mu.EstimatedCost
|
||||||
|
totalCalls += mu.APICalls
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute share percentages and sort by cost descending
|
||||||
|
models := make([]model.ModelStats, 0, len(modelMap))
|
||||||
|
for _, ms := range modelMap {
|
||||||
|
if totalCalls > 0 {
|
||||||
|
ms.SharePercent = float64(ms.APICalls) / float64(totalCalls) * 100
|
||||||
|
}
|
||||||
|
models = append(models, *ms)
|
||||||
|
}
|
||||||
|
sort.Slice(models, func(i, j int) bool {
|
||||||
|
return models[i].EstimatedCost > models[j].EstimatedCost
|
||||||
|
})
|
||||||
|
|
||||||
|
return models
|
||||||
|
}
|
||||||
|
|
||||||
|
// AggregateProjects computes per-project statistics from sessions.
|
||||||
|
func AggregateProjects(sessions []model.SessionStats, since, until time.Time) []model.ProjectStats {
|
||||||
|
filtered := FilterByTime(sessions, since, until)
|
||||||
|
|
||||||
|
projMap := make(map[string]*model.ProjectStats)
|
||||||
|
|
||||||
|
for _, s := range filtered {
|
||||||
|
ps, ok := projMap[s.Project]
|
||||||
|
if !ok {
|
||||||
|
ps = &model.ProjectStats{Project: s.Project}
|
||||||
|
projMap[s.Project] = ps
|
||||||
|
}
|
||||||
|
ps.Sessions++
|
||||||
|
ps.Prompts += s.UserMessages
|
||||||
|
ps.TotalTokens += s.InputTokens + s.OutputTokens +
|
||||||
|
s.CacheCreation5mTokens + s.CacheCreation1hTokens
|
||||||
|
ps.EstimatedCost += s.EstimatedCost
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort by cost descending
|
||||||
|
projects := make([]model.ProjectStats, 0, len(projMap))
|
||||||
|
for _, ps := range projMap {
|
||||||
|
projects = append(projects, *ps)
|
||||||
|
}
|
||||||
|
sort.Slice(projects, func(i, j int) bool {
|
||||||
|
return projects[i].EstimatedCost > projects[j].EstimatedCost
|
||||||
|
})
|
||||||
|
|
||||||
|
return projects
|
||||||
|
}
|
||||||
|
|
||||||
|
// AggregateHourly computes prompt counts by hour of day.
|
||||||
|
func AggregateHourly(sessions []model.SessionStats, since, until time.Time) []model.HourlyStats {
|
||||||
|
filtered := FilterByTime(sessions, since, until)
|
||||||
|
|
||||||
|
hours := make([]model.HourlyStats, 24)
|
||||||
|
for i := range hours {
|
||||||
|
hours[i].Hour = i
|
||||||
|
}
|
||||||
|
|
||||||
|
// We attribute all prompts and tokens to the session's start hour
|
||||||
|
for _, s := range filtered {
|
||||||
|
if s.StartTime.IsZero() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
h := s.StartTime.Local().Hour()
|
||||||
|
hours[h].Prompts += s.UserMessages
|
||||||
|
hours[h].Sessions++
|
||||||
|
hours[h].Tokens += s.InputTokens + s.OutputTokens
|
||||||
|
}
|
||||||
|
|
||||||
|
return hours
|
||||||
|
}
|
||||||
|
|
||||||
|
// FilterByTime returns sessions whose start time falls within [since, until).
|
||||||
|
func FilterByTime(sessions []model.SessionStats, since, until time.Time) []model.SessionStats {
|
||||||
|
if since.IsZero() && until.IsZero() {
|
||||||
|
return sessions
|
||||||
|
}
|
||||||
|
|
||||||
|
var result []model.SessionStats
|
||||||
|
for _, s := range sessions {
|
||||||
|
if s.StartTime.IsZero() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !since.IsZero() && s.StartTime.Before(since) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !until.IsZero() && !s.StartTime.Before(until) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result = append(result, s)
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// FilterByProject returns sessions matching the project substring.
|
||||||
|
func FilterByProject(sessions []model.SessionStats, project string) []model.SessionStats {
|
||||||
|
if project == "" {
|
||||||
|
return sessions
|
||||||
|
}
|
||||||
|
var result []model.SessionStats
|
||||||
|
for _, s := range sessions {
|
||||||
|
if containsIgnoreCase(s.Project, project) {
|
||||||
|
result = append(result, s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// FilterByModel returns sessions that have at least one API call to the given model.
|
||||||
|
func FilterByModel(sessions []model.SessionStats, modelFilter string) []model.SessionStats {
|
||||||
|
if modelFilter == "" {
|
||||||
|
return sessions
|
||||||
|
}
|
||||||
|
var result []model.SessionStats
|
||||||
|
for _, s := range sessions {
|
||||||
|
for m := range s.Models {
|
||||||
|
if containsIgnoreCase(m, modelFilter) {
|
||||||
|
result = append(result, s)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// containsIgnoreCase reports whether substr occurs within s, comparing
// both strings after lowering them with strings.ToLower.
func containsIgnoreCase(s, substr string) bool {
	haystack := strings.ToLower(s)
	needle := strings.ToLower(substr)
	return strings.Contains(haystack, needle)
}
|
||||||
92
internal/pipeline/bench_test.go
Normal file
92
internal/pipeline/bench_test.go
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
package pipeline
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"cburn/internal/source"
|
||||||
|
"cburn/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BenchmarkLoad(b *testing.B) {
|
||||||
|
homeDir, _ := os.UserHomeDir()
|
||||||
|
claudeDir := filepath.Join(homeDir, ".claude")
|
||||||
|
|
||||||
|
b.ResetTimer()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
result, err := Load(claudeDir, true, nil)
|
||||||
|
if err != nil {
|
||||||
|
b.Fatal(err)
|
||||||
|
}
|
||||||
|
_ = result
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkParseFile(b *testing.B) {
|
||||||
|
homeDir, _ := os.UserHomeDir()
|
||||||
|
claudeDir := filepath.Join(homeDir, ".claude")
|
||||||
|
|
||||||
|
files, err := source.ScanDir(claudeDir)
|
||||||
|
if err != nil {
|
||||||
|
b.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the largest file for worst-case benchmarking
|
||||||
|
var biggest source.DiscoveredFile
|
||||||
|
var biggestSize int64
|
||||||
|
for _, f := range files {
|
||||||
|
info, err := os.Stat(f.Path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if info.Size() > biggestSize {
|
||||||
|
biggestSize = info.Size()
|
||||||
|
biggest = f
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
b.Logf("Benchmarking largest file: %s (%.1f KB)", biggest.Path, float64(biggestSize)/1024)
|
||||||
|
b.ResetTimer()
|
||||||
|
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
result := source.ParseFile(biggest)
|
||||||
|
if result.Err != nil {
|
||||||
|
b.Fatal(result.Err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkScanDir(b *testing.B) {
|
||||||
|
homeDir, _ := os.UserHomeDir()
|
||||||
|
claudeDir := filepath.Join(homeDir, ".claude")
|
||||||
|
|
||||||
|
b.ResetTimer()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
files, err := source.ScanDir(claudeDir)
|
||||||
|
if err != nil {
|
||||||
|
b.Fatal(err)
|
||||||
|
}
|
||||||
|
_ = files
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkLoadWithCache(b *testing.B) {
|
||||||
|
homeDir, _ := os.UserHomeDir()
|
||||||
|
claudeDir := filepath.Join(homeDir, ".claude")
|
||||||
|
|
||||||
|
cache, err := store.Open(CachePath())
|
||||||
|
if err != nil {
|
||||||
|
b.Fatal(err)
|
||||||
|
}
|
||||||
|
defer cache.Close()
|
||||||
|
|
||||||
|
b.ResetTimer()
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
cr, err := LoadWithCache(claudeDir, true, cache, nil)
|
||||||
|
if err != nil {
|
||||||
|
b.Fatal(err)
|
||||||
|
}
|
||||||
|
_ = cr
|
||||||
|
}
|
||||||
|
}
|
||||||
177
internal/pipeline/incremental.go
Normal file
177
internal/pipeline/incremental.go
Normal file
@@ -0,0 +1,177 @@
|
|||||||
|
package pipeline
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
|
||||||
|
"cburn/internal/source"
|
||||||
|
"cburn/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CachedLoadResult extends LoadResult with cache metadata.
type CachedLoadResult struct {
	LoadResult
	// CacheHits is the number of discovered files served from the cache
	// (their tracked mtime_ns and size matched the file on disk).
	CacheHits int
	// Reparsed is the number of files that changed on disk (or were
	// untracked) and so were parsed again.
	Reparsed int
}
|
||||||
|
|
||||||
|
// LoadWithCache discovers, diffs against cache, parses only changed files,
|
||||||
|
// and returns the combined result set.
|
||||||
|
func LoadWithCache(claudeDir string, includeSubagents bool, cache *store.Cache, progressFn ProgressFunc) (*CachedLoadResult, error) {
|
||||||
|
// Discover files
|
||||||
|
files, err := source.ScanDir(claudeDir)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("scanning %s: %w", claudeDir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(files) == 0 {
|
||||||
|
return &CachedLoadResult{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter subagents if requested
|
||||||
|
var toProcess []source.DiscoveredFile
|
||||||
|
if includeSubagents {
|
||||||
|
toProcess = files
|
||||||
|
} else {
|
||||||
|
for _, f := range files {
|
||||||
|
if !f.IsSubagent {
|
||||||
|
toProcess = append(toProcess, f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
result := &CachedLoadResult{
|
||||||
|
LoadResult: LoadResult{
|
||||||
|
TotalFiles: len(toProcess),
|
||||||
|
ProjectCount: source.CountProjects(files),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(toProcess) == 0 {
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get tracked files from cache
|
||||||
|
tracked, err := cache.GetTrackedFiles()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("reading cache: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Diff: partition into changed and unchanged
|
||||||
|
var toReparse []source.DiscoveredFile
|
||||||
|
var unchanged []string // file paths that haven't changed
|
||||||
|
|
||||||
|
for _, f := range toProcess {
|
||||||
|
info, err := os.Stat(f.Path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
cached, ok := tracked[f.Path]
|
||||||
|
if ok && cached.MtimeNs == info.ModTime().UnixNano() && cached.SizeBytes == info.Size() {
|
||||||
|
unchanged = append(unchanged, f.Path)
|
||||||
|
} else {
|
||||||
|
toReparse = append(toReparse, f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
result.CacheHits = len(unchanged)
|
||||||
|
result.Reparsed = len(toReparse)
|
||||||
|
|
||||||
|
// Load cached sessions
|
||||||
|
if len(unchanged) > 0 {
|
||||||
|
cached, err := cache.LoadAllSessions()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("loading cached sessions: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter to only sessions from unchanged files
|
||||||
|
unchangedSet := make(map[string]struct{}, len(unchanged))
|
||||||
|
for _, p := range unchanged {
|
||||||
|
unchangedSet[p] = struct{}{}
|
||||||
|
}
|
||||||
|
for _, s := range cached {
|
||||||
|
if _, ok := unchangedSet[s.FilePath]; ok {
|
||||||
|
result.Sessions = append(result.Sessions, s)
|
||||||
|
result.ParsedFiles++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse changed files
|
||||||
|
if len(toReparse) > 0 {
|
||||||
|
numWorkers := runtime.GOMAXPROCS(0)
|
||||||
|
if numWorkers < 1 {
|
||||||
|
numWorkers = 4
|
||||||
|
}
|
||||||
|
if numWorkers > len(toReparse) {
|
||||||
|
numWorkers = len(toReparse)
|
||||||
|
}
|
||||||
|
|
||||||
|
work := make(chan int, len(toReparse))
|
||||||
|
results := make([]source.ParseResult, len(toReparse))
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
var processed atomic.Int64
|
||||||
|
|
||||||
|
for i := range toReparse {
|
||||||
|
work <- i
|
||||||
|
}
|
||||||
|
close(work)
|
||||||
|
|
||||||
|
wg.Add(numWorkers)
|
||||||
|
for w := 0; w < numWorkers; w++ {
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
for idx := range work {
|
||||||
|
results[idx] = source.ParseFile(toReparse[idx])
|
||||||
|
n := processed.Add(1)
|
||||||
|
if progressFn != nil {
|
||||||
|
progressFn(int(n)+result.CacheHits, result.TotalFiles)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
// Collect and cache results
|
||||||
|
for i, pr := range results {
|
||||||
|
if pr.Err != nil {
|
||||||
|
result.FileErrors++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result.ParsedFiles++
|
||||||
|
result.ParseErrors += pr.ParseErrors
|
||||||
|
|
||||||
|
if pr.Stats.APICalls > 0 || pr.Stats.UserMessages > 0 {
|
||||||
|
result.Sessions = append(result.Sessions, pr.Stats)
|
||||||
|
|
||||||
|
// Save to cache
|
||||||
|
info, err := os.Stat(toReparse[i].Path)
|
||||||
|
if err == nil {
|
||||||
|
_ = cache.SaveSession(pr.Stats, info.ModTime().UnixNano(), info.Size())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// CacheDir returns the platform-appropriate cache directory.
|
||||||
|
func CacheDir() string {
|
||||||
|
if xdg := os.Getenv("XDG_CACHE_HOME"); xdg != "" {
|
||||||
|
return filepath.Join(xdg, "cburn")
|
||||||
|
}
|
||||||
|
home, _ := os.UserHomeDir()
|
||||||
|
return filepath.Join(home, ".cache", "cburn")
|
||||||
|
}
|
||||||
|
|
||||||
|
// CachePath returns the full path to the cache database.
|
||||||
|
func CachePath() string {
|
||||||
|
return filepath.Join(CacheDir(), "metrics.db")
|
||||||
|
}
|
||||||
117
internal/pipeline/loader.go
Normal file
117
internal/pipeline/loader.go
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
package pipeline
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"runtime"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
|
||||||
|
"cburn/internal/model"
|
||||||
|
"cburn/internal/source"
|
||||||
|
)
|
||||||
|
|
||||||
|
// LoadResult holds the output of the full data loading pipeline.
type LoadResult struct {
	// Sessions holds per-file session stats, kept only for files with
	// activity (APICalls > 0 or UserMessages > 0).
	Sessions []model.SessionStats
	// TotalFiles is the number of files selected for processing
	// (after optional subagent filtering).
	TotalFiles int
	// ParsedFiles is the number of files parsed without a file-level error.
	ParsedFiles int
	// ParseErrors is the sum of per-file ParseErrors reported by
	// source.ParseFile across successfully parsed files.
	ParseErrors int
	// FileErrors is the number of files whose parse failed entirely.
	FileErrors int
	// ProjectCount is the distinct-project count over all discovered files
	// (computed before subagent filtering).
	ProjectCount int
}
|
||||||
|
|
||||||
|
// ProgressFunc is called during loading to report progress.
// current is the number of files processed so far, total is the total count.
// It is invoked from parser worker goroutines, so implementations must be
// safe for concurrent use.
type ProgressFunc func(current, total int)
|
||||||
|
|
||||||
|
// Load discovers and parses all session files from the Claude data directory.
|
||||||
|
// It uses a bounded worker pool for parallel parsing.
|
||||||
|
func Load(claudeDir string, includeSubagents bool, progressFn ProgressFunc) (*LoadResult, error) {
|
||||||
|
// Discover files
|
||||||
|
files, err := source.ScanDir(claudeDir)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("scanning %s: %w", claudeDir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(files) == 0 {
|
||||||
|
return &LoadResult{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter subagents if requested
|
||||||
|
var toProcess []source.DiscoveredFile
|
||||||
|
if includeSubagents {
|
||||||
|
toProcess = files
|
||||||
|
} else {
|
||||||
|
for _, f := range files {
|
||||||
|
if !f.IsSubagent {
|
||||||
|
toProcess = append(toProcess, f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
result := &LoadResult{
|
||||||
|
TotalFiles: len(toProcess),
|
||||||
|
ProjectCount: source.CountProjects(files),
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(toProcess) == 0 {
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parallel parsing with bounded worker pool
|
||||||
|
numWorkers := runtime.GOMAXPROCS(0)
|
||||||
|
if numWorkers < 1 {
|
||||||
|
numWorkers = 4
|
||||||
|
}
|
||||||
|
if numWorkers > len(toProcess) {
|
||||||
|
numWorkers = len(toProcess)
|
||||||
|
}
|
||||||
|
|
||||||
|
type indexedResult struct {
|
||||||
|
idx int
|
||||||
|
result source.ParseResult
|
||||||
|
}
|
||||||
|
|
||||||
|
work := make(chan int, len(toProcess))
|
||||||
|
results := make([]source.ParseResult, len(toProcess))
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
var processed atomic.Int64
|
||||||
|
|
||||||
|
// Feed work
|
||||||
|
for i := range toProcess {
|
||||||
|
work <- i
|
||||||
|
}
|
||||||
|
close(work)
|
||||||
|
|
||||||
|
// Spawn workers
|
||||||
|
wg.Add(numWorkers)
|
||||||
|
for w := 0; w < numWorkers; w++ {
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
for idx := range work {
|
||||||
|
results[idx] = source.ParseFile(toProcess[idx])
|
||||||
|
n := processed.Add(1)
|
||||||
|
if progressFn != nil {
|
||||||
|
progressFn(int(n), len(toProcess))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
// Collect results
|
||||||
|
for _, pr := range results {
|
||||||
|
if pr.Err != nil {
|
||||||
|
result.FileErrors++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result.ParsedFiles++
|
||||||
|
result.ParseErrors += pr.ParseErrors
|
||||||
|
if pr.Stats.APICalls > 0 || pr.Stats.UserMessages > 0 {
|
||||||
|
result.Sessions = append(result.Sessions, pr.Stats)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user