From 6cf5e92957a3c3317853618f0e8c38a647e2a28f Mon Sep 17 00:00:00 2001 From: vikingowl Date: Sun, 5 Apr 2026 22:08:08 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20QualityTracker=20=E2=80=94=20EMA=20rout?= =?UTF-8?q?er=20feedback=20from=20elf=20outcomes,=20ResultFilePaths=20trac?= =?UTF-8?q?king?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/elf/elf.go | 15 +++---- internal/elf/manager.go | 11 +++--- internal/router/feedback.go | 67 ++++++++++++++++++++++++++++++++ internal/router/feedback_test.go | 58 +++++++++++++++++++++++++++ internal/router/router.go | 33 +++++++++++----- internal/router/router_test.go | 4 +- internal/router/selector.go | 27 +++++++------ internal/tool/agent/agent.go | 15 ++++++- internal/tool/agent/batch.go | 17 +++++++- 9 files changed, 208 insertions(+), 39 deletions(-) create mode 100644 internal/router/feedback.go create mode 100644 internal/router/feedback_test.go diff --git a/internal/elf/elf.go b/internal/elf/elf.go index c29754e..d8b22fa 100644 --- a/internal/elf/elf.go +++ b/internal/elf/elf.go @@ -42,13 +42,14 @@ func (s Status) String() string { // Result is the output of a completed elf. type Result struct { - ID string - Status Status - Messages []message.Message - Usage message.Usage - Output string // final text output - Error error - Duration time.Duration + ID string + Status Status + Messages []message.Message + Usage message.Usage + Output string // final text output + Error error + Duration time.Duration + ResultFilePaths []string // paths to /tmp results produced by this elf's tools } // Elf is a sub-agent with its own engine and conversation history. 
diff --git a/internal/elf/manager.go b/internal/elf/manager.go index e5a47ce..190b9ec 100644 --- a/internal/elf/manager.go +++ b/internal/elf/manager.go @@ -135,11 +135,12 @@ func (m *Manager) ReportResult(result Result) { meta.decision.Commit(int(result.Usage.TotalTokens())) m.router.ReportOutcome(router.Outcome{ - ArmID: meta.armID, - TaskType: meta.taskType, - Success: result.Status == StatusCompleted, - Tokens: int(result.Usage.TotalTokens()), - Duration: result.Duration, + ArmID: meta.armID, + TaskType: meta.taskType, + Success: result.Status == StatusCompleted, + Tokens: int(result.Usage.TotalTokens()), + Duration: result.Duration, + ResultFilePaths: result.ResultFilePaths, }) } diff --git a/internal/router/feedback.go b/internal/router/feedback.go new file mode 100644 index 0000000..a8ae058 --- /dev/null +++ b/internal/router/feedback.go @@ -0,0 +1,67 @@ +package router + +import "sync" + +const ( + qualityAlpha = 0.3 // EMA smoothing factor (~3-sample memory) + minObservations = 3 // min samples before observed score overrides heuristic +) + +// EMAScore tracks an exponential moving average quality score. +type EMAScore struct { + Value float64 + Count int +} + +// QualityTracker records per-arm, per-task-type EMA quality scores from elf outcomes. +type QualityTracker struct { + mu sync.RWMutex + scores map[ArmID]map[TaskType]*EMAScore +} + +// NewQualityTracker returns an empty QualityTracker. +func NewQualityTracker() *QualityTracker { + return &QualityTracker{ + scores: make(map[ArmID]map[TaskType]*EMAScore), + } +} + +// Record updates the EMA score for the given arm and task type. 
+func (qt *QualityTracker) Record(armID ArmID, taskType TaskType, success bool) { + observation := 0.0 + if success { + observation = 1.0 + } + qt.mu.Lock() + defer qt.mu.Unlock() + if qt.scores[armID] == nil { + qt.scores[armID] = make(map[TaskType]*EMAScore) + } + s := qt.scores[armID][taskType] + if s == nil { + s = &EMAScore{} + qt.scores[armID][taskType] = s + } + if s.Count == 0 { + s.Value = observation + } else { + s.Value = qualityAlpha*observation + (1-qualityAlpha)*s.Value + } + s.Count++ +} + +// Quality returns the observed EMA score for an arm+task combination. +// Returns (0, false) when fewer than minObservations have been recorded. +func (qt *QualityTracker) Quality(armID ArmID, taskType TaskType) (score float64, hasData bool) { + qt.mu.RLock() + defer qt.mu.RUnlock() + m, ok := qt.scores[armID] + if !ok { + return 0, false + } + s, ok := m[taskType] + if !ok || s.Count < minObservations { + return 0, false + } + return s.Value, true +} diff --git a/internal/router/feedback_test.go b/internal/router/feedback_test.go new file mode 100644 index 0000000..bab4802 --- /dev/null +++ b/internal/router/feedback_test.go @@ -0,0 +1,58 @@ +package router_test + +import ( + "testing" + + "somegit.dev/Owlibou/gnoma/internal/router" +) + +func TestQualityTracker_NoDataReturnsHeuristic(t *testing.T) { + qt := router.NewQualityTracker() + _, hasData := qt.Quality("arm:model", router.TaskGeneration) + if hasData { + t.Error("expected no data for unobserved arm") + } +} + +func TestQualityTracker_RecordUpdatesEMA(t *testing.T) { + qt := router.NewQualityTracker() + for i := 0; i < 3; i++ { + qt.Record("arm:model", router.TaskGeneration, true) + } + score, hasData := qt.Quality("arm:model", router.TaskGeneration) + if !hasData { + t.Fatal("expected data after 3 observations") + } + if score <= 0 || score > 1 { + t.Errorf("score out of range [0,1]: %f", score) + } +} + +func TestQualityTracker_AllFailuresLowScore(t *testing.T) { + qt := router.NewQualityTracker() + 
for i := 0; i < 5; i++ { + qt.Record("arm:model", router.TaskDebug, false) + } + score, _ := qt.Quality("arm:model", router.TaskDebug) + if score > 0.3 { + t.Errorf("expected low score after all failures, got %f", score) + } +} + +func TestQualityTracker_ConcurrentSafe(t *testing.T) { + qt := router.NewQualityTracker() + done := make(chan struct{}) + for i := 0; i < 10; i++ { + go func(success bool) { + qt.Record("arm:model", router.TaskReview, success) + done <- struct{}{} + }(i%2 == 0) + } + for i := 0; i < 10; i++ { + <-done + } + score, _ := qt.Quality("arm:model", router.TaskReview) + if score < 0 || score > 1 { + t.Errorf("invalid score after concurrent writes: %f", score) + } +} diff --git a/internal/router/router.go b/internal/router/router.go index 7b226ff..bf7076d 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -22,6 +22,8 @@ type Router struct { forcedArm ArmID // When true, only local arms are considered (incognito mode) localOnly bool + + quality *QualityTracker } type Config struct { @@ -34,8 +36,9 @@ func New(cfg Config) *Router { logger = slog.Default() } return &Router{ - arms: make(map[ArmID]*Arm), - logger: logger, + arms: make(map[ArmID]*Arm), + logger: logger, + quality: NewQualityTracker(), } } @@ -89,7 +92,7 @@ func (r *Router) Select(task Task) RoutingDecision { } // Select best - best := selectBest(feasible, task) + best := selectBest(r.quality, feasible, task) if best == nil { return RoutingDecision{Error: fmt.Errorf("selection failed")} } @@ -140,25 +143,35 @@ func (r *Router) RemoveArm(id ArmID) { // Outcome records the result of a task execution for quality feedback. type Outcome struct { - ArmID ArmID - TaskType TaskType - Success bool - Tokens int - Duration time.Duration + ArmID ArmID + TaskType TaskType + Success bool + Tokens int + Duration time.Duration + ResultFilePaths []string // paths to /tmp tool result files (for M9 analysis) } // ReportOutcome records a task execution result for quality tracking. 
-// M4: logs only. M9 will use this for bandit learning. func (r *Router) ReportOutcome(o Outcome) { - r.logger.Debug("outcome reported", + r.quality.Record(o.ArmID, o.TaskType, o.Success) + r.logger.Debug("outcome recorded", "arm", o.ArmID, "task", o.TaskType, "success", o.Success, "tokens", o.Tokens, "duration", o.Duration, + "result_files", len(o.ResultFilePaths), ) } +// LookupArm returns the arm with the given ID, if registered. +func (r *Router) LookupArm(id ArmID) (*Arm, bool) { + r.mu.RLock() + defer r.mu.RUnlock() + arm, ok := r.arms[id] + return arm, ok +} + // Arms returns all registered arms. func (r *Router) Arms() []*Arm { r.mu.RLock() diff --git a/internal/router/router_test.go b/internal/router/router_test.go index e9c7d90..d055f1c 100644 --- a/internal/router/router_test.go +++ b/internal/router/router_test.go @@ -221,7 +221,7 @@ func TestSelectBest_PrefersToolSupport(t *testing.T) { } task := Task{Type: TaskGeneration, RequiresTools: true, Priority: PriorityNormal} - best := selectBest([]*Arm{withoutTools, withTools}, task) + best := selectBest(nil, []*Arm{withoutTools, withTools}, task) if best.ID != "a/with-tools" { t.Errorf("should prefer arm with tool support, got %s", best.ID) @@ -241,7 +241,7 @@ func TestSelectBest_PrefersThinkingForPlanning(t *testing.T) { } task := Task{Type: TaskPlanning, RequiresTools: true, Priority: PriorityNormal, EstimatedTokens: 5000} - best := selectBest([]*Arm{noThinking, thinking}, task) + best := selectBest(nil, []*Arm{noThinking, thinking}, task) if best.ID != "a/thinking" { t.Errorf("should prefer thinking model for planning, got %s", best.ID) diff --git a/internal/router/selector.go b/internal/router/selector.go index 20dedda..f0f5f10 100644 --- a/internal/router/selector.go +++ b/internal/router/selector.go @@ -36,10 +36,9 @@ func (d RoutingDecision) Rollback() { } } -// selectBest picks the highest-scoring feasible arm using heuristic scoring. -// No bandit learning — that's M9. 
Just smart defaults based on model size, -// locality, task type, cost, and pool scarcity. -func selectBest(arms []*Arm, task Task) *Arm { +// selectBest picks the highest-scoring feasible arm, blending heuristic and +// observed EMA quality when enough data is available. +func selectBest(qt *QualityTracker, arms []*Arm, task Task) *Arm { if len(arms) == 0 { return nil } @@ -48,7 +47,7 @@ bestScore := math.Inf(-1) for _, arm := range arms { - score := scoreArm(arm, task) + score := scoreArm(qt, arm, task) if score > bestScore { bestScore = score best = arm @@ -58,17 +57,25 @@ return best } -// scoreArm computes a heuristic quality/cost score for an arm. +// scoreArm computes a quality/cost score for an arm. +// When the quality tracker has sufficient observations, blends observed EMA +// (70%) with heuristic (30%). Falls back to pure heuristic otherwise. // Score = (quality × value) / effective_cost -func scoreArm(arm *Arm, task Task) float64 { - quality := heuristicQuality(arm, task) +func scoreArm(qt *QualityTracker, arm *Arm, task Task) float64 { + hq := heuristicQuality(arm, task) + quality := hq + if qt != nil { + if observed, hasData := qt.Quality(arm.ID, task.Type); hasData { + quality = 0.7*observed + 0.3*hq + } + } value := task.ValueScore() cost := effectiveCost(arm, task) if cost <= 0 { cost = 0.001 // prevent division by zero for free local models } return (quality * value) / cost } diff --git a/internal/tool/agent/agent.go b/internal/tool/agent/agent.go index cc60f5b..d157a39 100644 --- a/internal/tool/agent/agent.go +++ b/internal/tool/agent/agent.go @@ -86,7 +86,6 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result, if t.store != nil { preSave, _ = t.store.List("") } - _ = preSave // used in Task 4 for ResultFilePaths diff e, err := t.manager.Spawn(ctx, taskType, a.Prompt, systemPrompt, maxTurns) if err != 
nil { @@ -174,7 +173,19 @@ func (t *Tool) Execute(ctx context.Context, args json.RawMessage) (tool.Result, return tool.Result{Output: "Elf timed out after 5 minutes"}, nil } - // Report outcome to router for quality feedback + // Attribute /tmp result files produced during this elf's run + if t.store != nil { + postSave, _ := t.store.List("") + preSet := make(map[string]bool, len(preSave)) + for _, f := range preSave { + preSet[f.Path] = true + } + for _, f := range postSave { + if !preSet[f.Path] { + result.ResultFilePaths = append(result.ResultFilePaths, f.Path) + } + } + } t.manager.ReportResult(result) // Send done signal — stays in tree until turn completes diff --git a/internal/tool/agent/batch.go b/internal/tool/agent/batch.go index 3917441..87f846f 100644 --- a/internal/tool/agent/batch.go +++ b/internal/tool/agent/batch.go @@ -97,7 +97,6 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res if t.store != nil { preSave, _ = t.store.List("") } - _ = preSave // used in Task 4 // Spawn all elfs with slight stagger to avoid rate limit bursts type elfEntry struct { @@ -178,7 +177,21 @@ func (t *BatchTool) Execute(ctx context.Context, args json.RawMessage) (tool.Res } } - // Report outcome to router + // For batch elfs, attribute all new /tmp files produced during the batch + if t.store != nil { + postSave, _ := t.store.List("") + preSet := make(map[string]bool, len(preSave)) + for _, f := range preSave { + preSet[f.Path] = true + } + var newPaths []string + for _, f := range postSave { + if !preSet[f.Path] { + newPaths = append(newPaths, f.Path) + } + } + results[idx].ResultFilePaths = newPaths + } t.manager.ReportResult(results[idx]) // Send done progress