58beb7ce3c
Phase 4 routing decisions depend on knowing whether the SLM classifier is actually firing or whether the heuristic is silently doing all the work. Adds the instrumentation to make that observable. router.ClassifierSource enum (heuristic / slm / slm_fallback) is set on Task by every classifier: - HeuristicClassifier → ClassifierHeuristic - slm.Classifier → ClassifierSLM on success, ClassifierSLMFallback when the SLM call fails or returns unparseable output The source is plumbed through router.Outcome to QualityTracker, which now maintains per-source counters alongside the existing per-arm × task EMA scores. QualitySnapshot serializes both (classifier_counts is omitempty for back-compat with pre-feature quality.json files). lazyClassifier logs at INFO the first time it falls back to heuristic because the SLM hasn't booted yet — distinguishes operational fallback from an unconfigured-SLM run. slm.Manager.Start() now records elapsed-to-healthy and the main.go goroutine logs it as part of the "SLM ready" event. Confirms whether short-lived runs are racing the boot cycle. New `gnoma router stats` subcommand prints both tables (arm × task quality, classifier source breakdown) from quality.json with a Phase 4 trust hint when the data is too sparse or the SLM share is low. 6 new tests cover ClassifierSource string/enum, heuristic + SLM source propagation, QualityTracker counter round-trip, and back-compat restore from a legacy quality.json without classifier_counts.
139 lines
4.3 KiB
Go
139 lines
4.3 KiB
Go
package router_test
|
|
|
|
import (
|
|
"encoding/json"
|
|
"testing"
|
|
|
|
"somegit.dev/Owlibou/gnoma/internal/router"
|
|
)
|
|
|
|
func TestQualityTracker_SnapshotRestore_RoundTrip(t *testing.T) {
|
|
qt := router.NewQualityTracker()
|
|
// Record some outcomes
|
|
qt.Record("anthropic/claude-3-5-sonnet", router.TaskGeneration, true)
|
|
qt.Record("anthropic/claude-3-5-sonnet", router.TaskGeneration, true)
|
|
qt.Record("anthropic/claude-3-5-sonnet", router.TaskGeneration, false)
|
|
qt.Record("ollama/gemma3", router.TaskBoilerplate, true)
|
|
|
|
snap := qt.Snapshot()
|
|
|
|
// Verify snapshot has the data
|
|
if len(snap.Scores) == 0 {
|
|
t.Fatal("snapshot scores should not be empty")
|
|
}
|
|
|
|
// Marshal and unmarshal to simulate disk persistence
|
|
data, err := json.Marshal(snap)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
var restored router.QualitySnapshot
|
|
if err := json.Unmarshal(data, &restored); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Restore into a fresh tracker
|
|
qt2 := router.NewQualityTracker()
|
|
qt2.Restore(restored)
|
|
|
|
// After restore, Quality() should return data (Count >= minObservations=3)
|
|
score, hasData := qt2.Quality("anthropic/claude-3-5-sonnet", router.TaskGeneration)
|
|
if !hasData {
|
|
t.Error("expected quality data after restore")
|
|
}
|
|
if score <= 0 {
|
|
t.Errorf("expected positive score, got %f", score)
|
|
}
|
|
}
|
|
|
|
func TestQualityTracker_Snapshot_Empty(t *testing.T) {
|
|
qt := router.NewQualityTracker()
|
|
snap := qt.Snapshot()
|
|
if snap.Scores == nil {
|
|
t.Error("scores map should be initialized (not nil)")
|
|
}
|
|
if len(snap.Scores) != 0 {
|
|
t.Errorf("expected empty scores, got %d", len(snap.Scores))
|
|
}
|
|
}
|
|
|
|
func TestQualityTracker_ClassifierCounts_RecordAndSnapshot(t *testing.T) {
|
|
qt := router.NewQualityTracker()
|
|
qt.RecordClassifier(router.ClassifierHeuristic)
|
|
qt.RecordClassifier(router.ClassifierSLM)
|
|
qt.RecordClassifier(router.ClassifierSLM)
|
|
qt.RecordClassifier(router.ClassifierSLMFallback)
|
|
qt.RecordClassifier(router.ClassifierUnknown) // must be ignored
|
|
|
|
counts := qt.ClassifierCounts()
|
|
if counts[router.ClassifierHeuristic] != 1 {
|
|
t.Errorf("heuristic count = %d, want 1", counts[router.ClassifierHeuristic])
|
|
}
|
|
if counts[router.ClassifierSLM] != 2 {
|
|
t.Errorf("slm count = %d, want 2", counts[router.ClassifierSLM])
|
|
}
|
|
if counts[router.ClassifierSLMFallback] != 1 {
|
|
t.Errorf("slm_fallback count = %d, want 1", counts[router.ClassifierSLMFallback])
|
|
}
|
|
if counts[router.ClassifierUnknown] != 0 {
|
|
t.Errorf("unknown count = %d, want 0 (must be ignored)", counts[router.ClassifierUnknown])
|
|
}
|
|
|
|
// Snapshot round-trip.
|
|
snap := qt.Snapshot()
|
|
if snap.ClassifierCounts["slm"] != 2 {
|
|
t.Errorf("snapshot slm count = %d, want 2", snap.ClassifierCounts["slm"])
|
|
}
|
|
data, err := json.Marshal(snap)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
var restored router.QualitySnapshot
|
|
if err := json.Unmarshal(data, &restored); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
qt2 := router.NewQualityTracker()
|
|
qt2.Restore(restored)
|
|
if qt2.ClassifierCounts()[router.ClassifierSLM] != 2 {
|
|
t.Errorf("restored slm count = %d, want 2", qt2.ClassifierCounts()[router.ClassifierSLM])
|
|
}
|
|
}
|
|
|
|
// Verifies that loading a quality.json predating this feature (no
|
|
// classifier_counts field) doesn't break.
|
|
func TestQualityTracker_Restore_BackCompat_NoClassifierCounts(t *testing.T) {
|
|
legacy := []byte(`{"scores":{"foo":{"generation":{"Value":1,"Count":3}}}}`)
|
|
var snap router.QualitySnapshot
|
|
if err := json.Unmarshal(legacy, &snap); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
qt := router.NewQualityTracker()
|
|
qt.Restore(snap)
|
|
if qt.ClassifierCounts() == nil {
|
|
t.Error("ClassifierCounts() must return a non-nil map after restoring old snapshot")
|
|
}
|
|
if len(qt.ClassifierCounts()) != 0 {
|
|
t.Errorf("expected empty counts, got %d entries", len(qt.ClassifierCounts()))
|
|
}
|
|
// Scores should still load.
|
|
if _, ok := qt.Quality("foo", router.TaskGeneration); !ok {
|
|
t.Error("legacy scores should still load")
|
|
}
|
|
}
|
|
|
|
func TestQualityTracker_Restore_Replaces(t *testing.T) {
|
|
qt := router.NewQualityTracker()
|
|
qt.Record("arm-a", router.TaskDebug, true)
|
|
qt.Record("arm-a", router.TaskDebug, true)
|
|
qt.Record("arm-a", router.TaskDebug, true)
|
|
|
|
// Restore with different data — old data should be gone
|
|
empty := router.QualitySnapshot{Scores: make(map[string]map[string]*router.EMAScore)}
|
|
qt.Restore(empty)
|
|
|
|
_, hasData := qt.Quality("arm-a", router.TaskDebug)
|
|
if hasData {
|
|
t.Error("old data should be gone after restore with empty snapshot")
|
|
}
|
|
}
|