0aabd19906
Plan D from docs/superpowers/plans/2026-05-19-post-slm-unlock.md (static portion; dynamic bandit-driven promotion deferred to D-2).

Routing previously let tier ordering (CLI > local > API) dominate selection — Opus, in tier 3, would lose to a tier-1 CLI agent for SecurityReview even though Opus is empirically stronger at that task. This change introduces explicit per-arm overrides:

    [[arms]]
    id = "anthropic/claude-opus-4-7"
    strengths = ["security_review", "planning"]
    cost_weight = 0.3

Strengths gate cross-tier promotion: arms matching task.Type bypass the tier loop and compete with each other directly. Promotion is a preference, not a pin — if no strength-tagged arm is feasible (backoff, pool capacity, tool support), selection falls through to the default tier order.

CostWeight linearly dampens the cost penalty in scoreArm via

    effectiveCost = 1 + CostWeight * (cost - 1)

CostWeight=1.0 (or unset) preserves current behavior; lower values trade cheapness for quality. The earlier draft used cost^CostWeight, which inverts direction for sub-1 local-arm costs (raising a fraction <1 to a fractional power makes it bigger, not smaller); a monotonicity regression test prevents that drift.

- internal/router/arm.go: Strengths []TaskType, CostWeight float64, HasStrength(), ResolvedCostWeight() (zero → 1.0).
- internal/router/selector.go: scoreArm strength bonus const (strengthScoreBonus = 0.15) + linear cost dampening; selectBest cross-tier promotion before tier loop.
- internal/router/router.go: ArmOverride type + ApplyArmOverrides() returns unknown IDs; unknown strength names skipped with per-name warning via slog.
- internal/router/task.go: ParseTaskTypeStrict() returns ok bool; ParseTaskType now delegates so the two switches stay in sync.
- internal/config/config.go: ArmConfig + [[arms]] TOML wiring.
- cmd/gnoma/main.go: applies overrides after all initial arms register; logs a warning when an [[arms]] id has no matching registered arm.
Tests cover: predicate helpers, scoring direction across two arms, linear-formula monotonicity on both sides of cost=1, cross-tier promotion, empty-Strengths preserves tier order, promoted arm in backoff falls through via full Router.Select path, observed-quality tiebreak between two strength-tagged arms, ApplyArmOverrides happy path + unknown-ID reporting + unknown-strength skipping.
387 lines
12 KiB
Go
387 lines
12 KiB
Go
package router
|
||
|
||
import (
|
||
"fmt"
|
||
"strings"
|
||
|
||
"somegit.dev/Owlibou/gnoma/internal/provider"
|
||
)
|
||
|
||
// TaskType classifies a task for routing purposes.
//
// The zero value is TaskBoilerplate. Values outside the declared constants
// are still representable; String renders them as "unknown(N)".
type TaskType int

const (
	TaskBoilerplate    TaskType = iota // simple scaffolding, templates
	TaskGeneration                     // new code creation
	TaskRefactor                       // restructuring existing code
	TaskReview                         // code review, analysis
	TaskUnitTest                       // writing tests
	TaskPlanning                       // architecture, design
	TaskOrchestration                  // multi-step coordination
	TaskSecurityReview                 // security-focused analysis
	TaskDebug                          // finding and fixing bugs
	TaskExplain                        // explaining code or concepts
)

// String returns the lowercase snake_case name of the task type
// (for example "unit_test" or "security_review"). A value that is not one
// of the declared constants is formatted as "unknown(N)", where N is its
// integer value.
func (t TaskType) String() string {
	switch t {
	case TaskBoilerplate:
		return "boilerplate"
	case TaskGeneration:
		return "generation"
	case TaskRefactor:
		return "refactor"
	case TaskReview:
		return "review"
	case TaskUnitTest:
		return "unit_test"
	case TaskPlanning:
		return "planning"
	case TaskOrchestration:
		return "orchestration"
	case TaskSecurityReview:
		return "security_review"
	case TaskDebug:
		return "debug"
	case TaskExplain:
		return "explain"
	default:
		return fmt.Sprintf("unknown(%d)", t)
	}
}
|
||
|
||
// Priority indicates task importance for routing decisions.
//
// Constants are declared in ascending order of importance, so the zero
// value is PriorityLow.
type Priority int

const (
	PriorityLow Priority = iota
	PriorityNormal
	PriorityHigh
	PriorityCritical
)
|
||
|
||
// ClassifierSource identifies which classifier produced a Task.
// Phase 4 routing decisions depend on knowing whether the SLM is actually
// firing or whether the heuristic is silently doing all the work.
//
// The zero value is ClassifierUnknown.
type ClassifierSource int

const (
	ClassifierUnknown     ClassifierSource = iota // unset / pre-classification
	ClassifierHeuristic                           // router.HeuristicClassifier
	ClassifierSLM                                 // slm.Classifier (SLM call succeeded)
	ClassifierSLMFallback                         // slm.Classifier fell back internally (timeout, parse error)
)

// String returns a short lowercase label for the source: "heuristic",
// "slm" or "slm_fallback". ClassifierUnknown and any value outside the
// declared constants are both rendered as "unknown".
func (s ClassifierSource) String() string {
	switch s {
	case ClassifierHeuristic:
		return "heuristic"
	case ClassifierSLM:
		return "slm"
	case ClassifierSLMFallback:
		return "slm_fallback"
	default:
		return "unknown"
	}
}
|
||
|
||
// Task represents a classified unit of work for routing.
|
||
type Task struct {
|
||
Type TaskType
|
||
Priority Priority
|
||
EstimatedTokens int
|
||
RequiresTools bool
|
||
ComplexityScore float64 // 0-1
|
||
RequiredEffort provider.EffortLevel // EffortAuto = no constraint on thinking
|
||
ExcludedArms []ArmID // Arms to avoid (e.g. due to recent 429 errors)
|
||
ClassifierSource ClassifierSource // which classifier produced this Task
|
||
}
|
||
|
||
// ValueScore computes a routing value based on priority and type.
|
||
func (t Task) ValueScore() float64 {
|
||
base := map[Priority]float64{
|
||
PriorityLow: 0.5,
|
||
PriorityNormal: 1.0,
|
||
PriorityHigh: 2.0,
|
||
PriorityCritical: 5.0,
|
||
}[t.Priority]
|
||
|
||
return base * taskTypeMultiplier[t.Type]
|
||
}
|
||
|
||
var taskTypeMultiplier = map[TaskType]float64{
|
||
TaskBoilerplate: 0.6,
|
||
TaskGeneration: 1.0,
|
||
TaskRefactor: 0.9,
|
||
TaskReview: 1.1,
|
||
TaskUnitTest: 0.8,
|
||
TaskPlanning: 1.4,
|
||
TaskOrchestration: 1.5,
|
||
TaskSecurityReview: 2.0,
|
||
TaskDebug: 1.2,
|
||
TaskExplain: 0.7,
|
||
}
|
||
|
||
// QualityThreshold defines minimum acceptable quality for a task type.
// The three levels are listed from least to most demanding.
type QualityThreshold struct {
	Minimum    float64 // below → output is harmful, never accept
	Acceptable float64 // good enough
	Target     float64 // ideal
}
|
||
|
||
// DefaultThresholds are calibrated for M4 heuristic scores (range ~0–0.85).
|
||
// M9 will replace these with bandit-derived values once quality data accumulates.
|
||
var DefaultThresholds = map[TaskType]QualityThreshold{
|
||
TaskBoilerplate: {0.40, 0.55, 0.70}, // any capable arm works
|
||
TaskGeneration: {0.45, 0.60, 0.75},
|
||
TaskRefactor: {0.50, 0.65, 0.78},
|
||
TaskReview: {0.55, 0.68, 0.80},
|
||
TaskUnitTest: {0.45, 0.60, 0.75},
|
||
TaskPlanning: {0.60, 0.72, 0.82},
|
||
TaskOrchestration: {0.65, 0.75, 0.83},
|
||
TaskSecurityReview: {0.70, 0.78, 0.84}, // requires thinking or large context window
|
||
TaskDebug: {0.50, 0.65, 0.78},
|
||
TaskExplain: {0.40, 0.55, 0.72},
|
||
}
|
||
|
||
// inferEffort derives the minimum required reasoning effort from task type and complexity.
|
||
func inferEffort(task Task) provider.EffortLevel {
|
||
switch task.Type {
|
||
case TaskSecurityReview:
|
||
return provider.EffortHigh
|
||
case TaskOrchestration:
|
||
if task.ComplexityScore >= 0.5 {
|
||
return provider.EffortHigh
|
||
}
|
||
return provider.EffortMedium
|
||
case TaskPlanning:
|
||
if task.ComplexityScore >= 0.7 {
|
||
return provider.EffortHigh
|
||
}
|
||
return provider.EffortMedium
|
||
case TaskDebug, TaskRefactor, TaskReview:
|
||
if task.ComplexityScore >= 0.7 {
|
||
return provider.EffortMedium
|
||
}
|
||
if task.ComplexityScore >= 0.4 {
|
||
return provider.EffortLow
|
||
}
|
||
return provider.EffortAuto
|
||
case TaskGeneration:
|
||
if task.ComplexityScore >= 0.8 {
|
||
return provider.EffortMedium
|
||
}
|
||
return provider.EffortAuto
|
||
default:
|
||
return provider.EffortAuto
|
||
}
|
||
}
|
||
|
||
// ClassifyTask infers a TaskType from the user's prompt using keyword heuristics.
|
||
func ClassifyTask(prompt string) Task {
|
||
lower := strings.ToLower(prompt)
|
||
|
||
task := Task{
|
||
Priority: PriorityNormal,
|
||
RequiresTools: true, // assume tools needed by default
|
||
}
|
||
|
||
// Check for task type keywords (order matters — more specific/common first).
|
||
// Orchestration is placed late: its keywords ("dispatch", "pipeline", "orchestrat")
|
||
// appear as nouns in non-orchestration prompts (e.g. "refactor the pipeline dispatch",
|
||
// "review the orchestration layer"). Operational task types must gate first.
|
||
switch {
|
||
case containsAny(lower, "security", "vulnerability", "cve", "owasp", "xss", "injection", "audit security"):
|
||
task.Type = TaskSecurityReview
|
||
task.Priority = PriorityHigh
|
||
case containsAny(lower, "debug", "fix", "troubleshoot", "not working", "error", "crash", "failing", "bug"):
|
||
task.Type = TaskDebug
|
||
case containsAny(lower, "review", "check", "analyze", "audit", "inspect"):
|
||
task.Type = TaskReview
|
||
case containsAny(lower, "refactor", "restructure", "reorganize", "clean up", "simplify"):
|
||
task.Type = TaskRefactor
|
||
case containsAny(lower, "test", "spec", "coverage", "assert"):
|
||
task.Type = TaskUnitTest
|
||
case containsAny(lower, "explain", "what is", "how does", "describe", "tell me about"):
|
||
task.Type = TaskExplain
|
||
task.RequiresTools = false
|
||
case containsAny(lower, "plan", "architect", "design", "strategy", "roadmap"):
|
||
task.Type = TaskPlanning
|
||
case containsAny(lower, "orchestrat", "coordinate", "dispatch", "pipeline",
|
||
"fan out", "subtask", "delegate to", "spawn elf"):
|
||
task.Type = TaskOrchestration
|
||
task.Priority = PriorityHigh
|
||
case containsAny(lower, "create", "implement", "build", "add", "write", "generate", "make"):
|
||
task.Type = TaskGeneration
|
||
case containsAny(lower, "scaffold", "boilerplate", "template", "stub", "skeleton"):
|
||
task.Type = TaskBoilerplate
|
||
default:
|
||
task.Type = TaskGeneration // default
|
||
}
|
||
|
||
// Estimate complexity from prompt length and keywords
|
||
task.ComplexityScore = estimateComplexity(lower)
|
||
|
||
// Per-task-type complexity floor. A short "refactor X" prompt looks
|
||
// trivial by word count but the task itself implies existing code and
|
||
// non-trivial reasoning — clamping the floor up keeps such tasks out
|
||
// of the SLM arm's MaxComplexity ceiling.
|
||
if floor := MinComplexityForType(task.Type); task.ComplexityScore < floor {
|
||
task.ComplexityScore = floor
|
||
}
|
||
|
||
// Trivial-prompt override: short, knowledge-only prompts whose task
|
||
// type doesn't imply existing code to read or modify can run without
|
||
// tools — making the SLM arm (ToolUse=false) feasible for genuinely
|
||
// tiny questions like "what is 2+2?" or "explain a closure".
|
||
if isTrivialPrompt(lower, task.Type, task.ComplexityScore) {
|
||
task.RequiresTools = false
|
||
}
|
||
|
||
task.RequiredEffort = inferEffort(task)
|
||
|
||
return task
|
||
}
|
||
|
||
// MinComplexityForType returns the inherent complexity floor for a task
|
||
// type. Tasks that imply existing code or multi-step reasoning get a
|
||
// non-zero floor so short prompts don't slip past the SLM arm's
|
||
// MaxComplexity ceiling.
|
||
func MinComplexityForType(t TaskType) float64 {
|
||
switch t {
|
||
case TaskSecurityReview, TaskOrchestration:
|
||
return 0.6
|
||
case TaskRefactor, TaskPlanning, TaskDebug:
|
||
return 0.4
|
||
case TaskUnitTest, TaskReview:
|
||
return 0.35
|
||
default:
|
||
return 0
|
||
}
|
||
}
|
||
|
||
// trivialEligibleTypes are the task types where a "no tools needed" verdict
|
||
// is plausible from a short prompt alone. Debug / Refactor / Review / Test /
|
||
// SecurityReview / Orchestration all imply existing code or processes to
|
||
// touch — keep RequiresTools=true even if the wording is brief.
|
||
var trivialEligibleTypes = map[TaskType]bool{
|
||
TaskExplain: true,
|
||
TaskGeneration: true,
|
||
TaskBoilerplate: true,
|
||
}
|
||
|
||
// toolNeedingTokens name actions/objects that always require tool execution.
// Matched as whole words (string-fields), not substrings — avoids treating
// "tester" as "test".
var toolNeedingTokens = map[string]bool{
	"read": true, "write": true, "edit": true, "create": true,
	"delete": true, "remove": true, "list": true, "find": true,
	"search": true, "grep": true, "open": true, "save": true,
	"run": true, "execute": true, "compile": true, "build": true,
	"test": true, "tests": true, "install": true, "commit": true,
	"push": true, "pull": true, "diff": true, "file": true, "files": true,
}
|
||
|
||
// isTrivialPrompt is true when the prompt is short, low-complexity, of a
|
||
// type compatible with knowledge-only answers, and contains no token that
|
||
// implies a file/shell action.
|
||
func isTrivialPrompt(lower string, taskType TaskType, complexity float64) bool {
|
||
if !trivialEligibleTypes[taskType] {
|
||
return false
|
||
}
|
||
if complexity > 0.15 {
|
||
return false
|
||
}
|
||
fields := strings.Fields(lower)
|
||
if len(fields) > 12 {
|
||
return false
|
||
}
|
||
for _, w := range fields {
|
||
w = strings.Trim(w, ".,!?;:")
|
||
if toolNeedingTokens[w] {
|
||
return false
|
||
}
|
||
}
|
||
return true
|
||
}
|
||
|
||
// containsAny reports whether s contains at least one of keywords as a
// substring. The comparison is case-sensitive. With no keywords it returns
// false; an empty keyword matches every s.
func containsAny(s string, keywords ...string) bool {
	for _, kw := range keywords {
		if strings.Contains(s, kw) {
			return true
		}
	}
	return false
}
|
||
|
||
// estimateComplexity scores a prompt's complexity in the range [0, 1].
//
// The score starts at wordCount/200 (so 200 words alone reach 1.0), gains
// 0.15 for each distinct complexity keyword present and loses 0.15 for each
// distinct simplicity keyword present, and is finally clamped to [0, 1].
// Keywords are matched as case-sensitive substrings against lowercase
// literals, so prompt should already be lowercased.
func estimateComplexity(prompt string) float64 {
	score := 0.0

	// Length contributes to complexity
	words := len(strings.Fields(prompt))
	score += float64(words) / 200.0 // normalize: 200 words = 1.0

	// Complexity keywords
	complexKeywords := []string{"implement", "design", "architect", "system", "integration", "migrate", "optimize"}
	for _, kw := range complexKeywords {
		if strings.Contains(prompt, kw) {
			score += 0.15
		}
	}

	// Simple keywords reduce complexity
	simpleKeywords := []string{"rename", "format", "add field", "change name", "typo", "simple"}
	for _, kw := range simpleKeywords {
		if strings.Contains(prompt, kw) {
			score -= 0.15
		}
	}

	// Clamp to [0, 1]
	if score < 0 {
		score = 0
	}
	if score > 1 {
		score = 1
	}
	return score
}
|
||
|
||
// ParseTaskTypeStrict is like ParseTaskType but reports whether the input
|
||
// matched a known type. Used by config wiring to surface typos in
|
||
// user-supplied task-type names instead of silently falling back to
|
||
// TaskGeneration.
|
||
func ParseTaskTypeStrict(s string) (TaskType, bool) {
|
||
switch strings.ToLower(strings.ReplaceAll(s, "_", "")) {
|
||
case "debug":
|
||
return TaskDebug, true
|
||
case "explain":
|
||
return TaskExplain, true
|
||
case "generation":
|
||
return TaskGeneration, true
|
||
case "refactor":
|
||
return TaskRefactor, true
|
||
case "unittest":
|
||
return TaskUnitTest, true
|
||
case "boilerplate":
|
||
return TaskBoilerplate, true
|
||
case "planning":
|
||
return TaskPlanning, true
|
||
case "orchestration":
|
||
return TaskOrchestration, true
|
||
case "securityreview":
|
||
return TaskSecurityReview, true
|
||
case "review":
|
||
return TaskReview, true
|
||
}
|
||
return TaskGeneration, false
|
||
}
|
||
|
||
// ParseTaskType converts a string from an SLM JSON response to a TaskType.
|
||
// Matching is case-insensitive. Unknown strings fall back to TaskGeneration.
|
||
// Use ParseTaskTypeStrict when you need to detect typos.
|
||
func ParseTaskType(s string) TaskType {
|
||
t, _ := ParseTaskTypeStrict(s)
|
||
return t
|
||
}
|