Files
gnoma/internal/router/quality_json_test.go
T
vikingowl 58beb7ce3c feat(router): classifier-source telemetry + router stats command
Phase 4 routing decisions depend on knowing whether the SLM classifier
is actually firing or whether the heuristic is silently doing all the
work. Adds the instrumentation to make that observable.

router.ClassifierSource enum (heuristic / slm / slm_fallback) is set
on Task by every classifier:
- HeuristicClassifier → ClassifierHeuristic
- slm.Classifier → ClassifierSLM on success, ClassifierSLMFallback when
  the SLM call fails or returns unparseable output

The source is plumbed through router.Outcome to QualityTracker, which
now maintains per-source counters alongside the existing per-arm × task
EMA scores. QualitySnapshot serializes both (classifier_counts is
omitempty for back-compat with pre-feature quality.json files).

lazyClassifier logs at INFO the first time it falls back to heuristic
because the SLM hasn't booted yet — distinguishes operational fallback
from an unconfigured-SLM run.

slm.Manager.Start() now records elapsed-to-healthy and the main.go
goroutine logs it as part of the "SLM ready" event. Confirms whether
short-lived runs are racing the boot cycle.

New `gnoma router stats` subcommand prints both tables (arm × task
quality, classifier source breakdown) from quality.json with a Phase 4
trust hint when the data is too sparse or the SLM share is low.

6 new tests cover ClassifierSource string/enum, heuristic + SLM source
propagation, QualityTracker counter round-trip, and back-compat
restore from a legacy quality.json without classifier_counts.
2026-05-19 18:18:22 +02:00

139 lines
4.3 KiB
Go

package router_test
import (
"encoding/json"
"testing"
"somegit.dev/Owlibou/gnoma/internal/router"
)
// TestQualityTracker_SnapshotRestore_RoundTrip records a handful of outcomes,
// pushes the resulting snapshot through a JSON marshal/unmarshal cycle (as a
// stand-in for disk persistence), restores it into a fresh tracker, and checks
// that the fresh tracker reports a positive quality score for the arm that
// received three observations.
func TestQualityTracker_SnapshotRestore_RoundTrip(t *testing.T) {
	const sonnet = "anthropic/claude-3-5-sonnet"

	// Seed the source tracker: three outcomes for one arm, one for another.
	source := router.NewQualityTracker()
	source.Record(sonnet, router.TaskGeneration, true)
	source.Record(sonnet, router.TaskGeneration, true)
	source.Record(sonnet, router.TaskGeneration, false)
	source.Record("ollama/gemma3", router.TaskBoilerplate, true)

	// The snapshot must carry the recorded data.
	snapshot := source.Snapshot()
	if len(snapshot.Scores) == 0 {
		t.Fatal("snapshot scores should not be empty")
	}

	// Encode and decode to mimic a write to and read from disk.
	encoded, err := json.Marshal(snapshot)
	if err != nil {
		t.Fatal(err)
	}
	var decoded router.QualitySnapshot
	if err = json.Unmarshal(encoded, &decoded); err != nil {
		t.Fatal(err)
	}

	// Load the decoded snapshot into a brand-new tracker.
	target := router.NewQualityTracker()
	target.Restore(decoded)

	// Quality() should now return data (Count >= minObservations=3).
	score, hasData := target.Quality(sonnet, router.TaskGeneration)
	if !hasData {
		t.Error("expected quality data after restore")
	}
	if score <= 0 {
		t.Errorf("expected positive score, got %f", score)
	}
}
// TestQualityTracker_Snapshot_Empty checks that a tracker with nothing
// recorded produces a snapshot whose Scores map is non-nil and has no entries.
func TestQualityTracker_Snapshot_Empty(t *testing.T) {
	qt := router.NewQualityTracker()
	scores := qt.Snapshot().Scores

	if scores == nil {
		t.Error("scores map should be initialized (not nil)")
	}
	if n := len(scores); n != 0 {
		t.Errorf("expected empty scores, got %d", n)
	}
}
// TestQualityTracker_ClassifierCounts_RecordAndSnapshot records one heuristic,
// two SLM, one SLM-fallback and one unknown classification, then checks the
// per-source counters (the unknown source must not be counted) both directly
// and after a snapshot -> JSON -> restore round trip.
func TestQualityTracker_ClassifierCounts_RecordAndSnapshot(t *testing.T) {
	tracker := router.NewQualityTracker()
	tracker.RecordClassifier(router.ClassifierHeuristic)
	tracker.RecordClassifier(router.ClassifierSLM)
	tracker.RecordClassifier(router.ClassifierSLM)
	tracker.RecordClassifier(router.ClassifierSLMFallback)
	tracker.RecordClassifier(router.ClassifierUnknown) // must be ignored

	// Direct counter checks.
	counts := tracker.ClassifierCounts()
	if got := counts[router.ClassifierHeuristic]; got != 1 {
		t.Errorf("heuristic count = %d, want 1", got)
	}
	if got := counts[router.ClassifierSLM]; got != 2 {
		t.Errorf("slm count = %d, want 2", got)
	}
	if got := counts[router.ClassifierSLMFallback]; got != 1 {
		t.Errorf("slm_fallback count = %d, want 1", got)
	}
	if got := counts[router.ClassifierUnknown]; got != 0 {
		t.Errorf("unknown count = %d, want 0 (must be ignored)", got)
	}

	// The snapshot exposes the same counters keyed by string.
	snapshot := tracker.Snapshot()
	if got := snapshot.ClassifierCounts["slm"]; got != 2 {
		t.Errorf("snapshot slm count = %d, want 2", got)
	}

	// Round-trip through JSON and restore into a fresh tracker.
	encoded, err := json.Marshal(snapshot)
	if err != nil {
		t.Fatal(err)
	}
	var decoded router.QualitySnapshot
	if err = json.Unmarshal(encoded, &decoded); err != nil {
		t.Fatal(err)
	}

	fresh := router.NewQualityTracker()
	fresh.Restore(decoded)
	if got := fresh.ClassifierCounts()[router.ClassifierSLM]; got != 2 {
		t.Errorf("restored slm count = %d, want 2", got)
	}
}
// TestQualityTracker_Restore_BackCompat_NoClassifierCounts restores a tracker
// from a quality.json payload that predates the classifier-count feature (it
// has no classifier_counts field) and checks that ClassifierCounts() is a
// non-nil empty map while the legacy scores are still loaded.
func TestQualityTracker_Restore_BackCompat_NoClassifierCounts(t *testing.T) {
	const legacyJSON = `{"scores":{"foo":{"generation":{"Value":1,"Count":3}}}}`

	var snapshot router.QualitySnapshot
	if err := json.Unmarshal([]byte(legacyJSON), &snapshot); err != nil {
		t.Fatal(err)
	}

	tracker := router.NewQualityTracker()
	tracker.Restore(snapshot)

	// A missing classifier_counts field must leave an empty, non-nil map.
	counts := tracker.ClassifierCounts()
	if counts == nil {
		t.Error("ClassifierCounts() must return a non-nil map after restoring old snapshot")
	}
	if n := len(counts); n != 0 {
		t.Errorf("expected empty counts, got %d entries", n)
	}

	// Scores should still load.
	_, ok := tracker.Quality("foo", router.TaskGeneration)
	if !ok {
		t.Error("legacy scores should still load")
	}
}
// TestQualityTracker_Restore_Replaces checks that Restore replaces the
// tracker's existing scores rather than merging into them: after restoring a
// snapshot with an empty Scores map, a previously recorded arm/task pair must
// no longer report quality data.
func TestQualityTracker_Restore_Replaces(t *testing.T) {
	qt := router.NewQualityTracker()
	qt.Record("arm-a", router.TaskDebug, true)
	qt.Record("arm-a", router.TaskDebug, true)
	qt.Record("arm-a", router.TaskDebug, true)

	// Guard against a vacuous pass: if the three observations above did not
	// produce quality data in the first place, the post-restore assertion
	// below would succeed without Restore having replaced anything.
	if _, hasData := qt.Quality("arm-a", router.TaskDebug); !hasData {
		t.Fatal("precondition failed: expected quality data for arm-a before restore")
	}

	// Restore with different data — old data should be gone.
	empty := router.QualitySnapshot{Scores: make(map[string]map[string]*router.EMAScore)}
	qt.Restore(empty)

	if _, hasData := qt.Quality("arm-a", router.TaskDebug); hasData {
		t.Error("old data should be gone after restore with empty snapshot")
	}
}