Files
gnoma/internal/router/task.go
T
vikingowl 0aabd19906 feat(router): per-arm strengths + cost weight (Phase D)
Plan D from docs/superpowers/plans/2026-05-19-post-slm-unlock.md
(static portion; dynamic bandit-driven promotion deferred to D-2).

Routing previously let tier ordering (CLI > local > API) dominate
selection — Opus, in tier 3, would lose to a tier-1 CLI agent for
SecurityReview even though Opus is empirically stronger at that task.
This change introduces explicit per-arm overrides:

  [[arms]]
  id = "anthropic/claude-opus-4-7"
  strengths = ["security_review", "planning"]
  cost_weight = 0.3

Strengths gate cross-tier promotion: arms matching task.Type bypass
the tier loop and compete with each other directly. Promotion is a
preference, not a pin — if no strength-tagged arm is feasible
(backoff, pool capacity, tool support), selection falls through to
the default tier order.

CostWeight linearly dampens the cost penalty in scoreArm via
  effectiveCost = 1 + CostWeight * (cost - 1)
CostWeight=1.0 (or unset) preserves current behavior; lower values
trade cheapness for quality. The earlier draft used cost^CostWeight
which inverts direction for sub-1 local-arm costs (raising a
fraction <1 to a fractional power makes it bigger, not smaller); a
monotonicity regression test prevents that drift.

- internal/router/arm.go: Strengths []TaskType, CostWeight float64,
  HasStrength(), ResolvedCostWeight() (zero → 1.0).
- internal/router/selector.go: scoreArm strength bonus const
  (strengthScoreBonus = 0.15) + linear cost dampening; selectBest
  cross-tier promotion before tier loop.
- internal/router/router.go: ArmOverride type + ApplyArmOverrides()
  returns unknown IDs; unknown strength names skipped with per-name
  warning via slog.
- internal/router/task.go: ParseTaskTypeStrict() returns ok bool;
  ParseTaskType now delegates so the two switches stay in sync.
- internal/config/config.go: ArmConfig + [[arms]] TOML wiring.
- cmd/gnoma/main.go: applies overrides after all initial arms
  register; logs a warning when an [[arms]] id has no matching
  registered arm.

Tests cover: predicate helpers, scoring direction across two arms,
linear-formula monotonicity on both sides of cost=1, cross-tier
promotion, empty-Strengths preserves tier order, promoted arm in
backoff falls through via full Router.Select path, observed-quality
tiebreak between two strength-tagged arms, ApplyArmOverrides happy
path + unknown-ID reporting + unknown-strength skipping.
2026-05-19 21:14:45 +02:00

387 lines
12 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package router
import (
"fmt"
"strings"
"somegit.dev/Owlibou/gnoma/internal/provider"
)
// TaskType classifies a task for routing purposes.
//
// The constants are numbered sequentially from 0 via iota, so the zero value
// of TaskType is TaskBoilerplate. The order of the constants therefore
// determines their numeric values.
type TaskType int
const (
TaskBoilerplate TaskType = iota // simple scaffolding, templates
TaskGeneration // new code creation
TaskRefactor // restructuring existing code
TaskReview // code review, analysis
TaskUnitTest // writing tests
TaskPlanning // architecture, design
TaskOrchestration // multi-step coordination
TaskSecurityReview // security-focused analysis
TaskDebug // finding and fixing bugs
TaskExplain // explaining code or concepts
)
// String returns the snake_case name of the task type (for example
// "security_review"). Values outside the declared constants are rendered
// as "unknown(N)", where N is the numeric value.
func (t TaskType) String() string {
	// Indexed by the constant itself so each name sits next to its type.
	names := [...]string{
		TaskBoilerplate:    "boilerplate",
		TaskGeneration:     "generation",
		TaskRefactor:       "refactor",
		TaskReview:         "review",
		TaskUnitTest:       "unit_test",
		TaskPlanning:       "planning",
		TaskOrchestration:  "orchestration",
		TaskSecurityReview: "security_review",
		TaskDebug:          "debug",
		TaskExplain:        "explain",
	}
	if t < 0 || int(t) >= len(names) {
		return fmt.Sprintf("unknown(%d)", t)
	}
	return names[t]
}
// Priority indicates task importance for routing decisions.
//
// Levels are declared in ascending order via iota; the zero value is
// PriorityLow.
type Priority int
const (
PriorityLow Priority = iota
PriorityNormal
PriorityHigh
PriorityCritical
)
// ClassifierSource identifies which classifier produced a Task.
// Phase 4 routing decisions depend on knowing whether the SLM is actually
// firing or whether the heuristic is silently doing all the work.
//
// The zero value is ClassifierUnknown, so a Task whose source was never set
// reports "unknown".
type ClassifierSource int
const (
ClassifierUnknown ClassifierSource = iota // unset / pre-classification
ClassifierHeuristic // router.HeuristicClassifier
ClassifierSLM // slm.Classifier (SLM call succeeded)
ClassifierSLMFallback // slm.Classifier fell back internally (timeout, parse error)
)
// String returns a short lowercase label for the classifier source.
// ClassifierUnknown and any value outside the declared constants both
// yield "unknown".
func (s ClassifierSource) String() string {
	label := "unknown"
	switch s {
	case ClassifierHeuristic:
		label = "heuristic"
	case ClassifierSLM:
		label = "slm"
	case ClassifierSLMFallback:
		label = "slm_fallback"
	}
	return label
}
// Task represents a classified unit of work for routing.
//
// ClassifyTask fills in Type, Priority, RequiresTools, ComplexityScore and
// RequiredEffort from a prompt; the remaining fields are left at their zero
// values by that function.
type Task struct {
Type TaskType
Priority Priority
EstimatedTokens int
RequiresTools bool
ComplexityScore float64 // 0-1
RequiredEffort provider.EffortLevel // EffortAuto = no constraint on thinking
ExcludedArms []ArmID // Arms to avoid (e.g. due to recent 429 errors)
ClassifierSource ClassifierSource // which classifier produced this Task
}
// ValueScore computes a routing value for the task: a weight chosen by
// Priority (0.5, 1.0, 2.0 or 5.0 from low to critical) multiplied by the
// task type's entry in taskTypeMultiplier.
//
// A Priority outside the declared constants, or a Type with no multiplier
// entry, contributes a factor of 0 and therefore yields a score of 0.
func (t Task) ValueScore() float64 {
	var weight float64 // stays 0 for an unrecognized priority
	switch t.Priority {
	case PriorityLow:
		weight = 0.5
	case PriorityNormal:
		weight = 1.0
	case PriorityHigh:
		weight = 2.0
	case PriorityCritical:
		weight = 5.0
	}
	return weight * taskTypeMultiplier[t.Type]
}
// taskTypeMultiplier holds the per-task-type factor that Task.ValueScore
// multiplies with the priority-derived base value. A task type missing from
// this map reads as 0, which makes ValueScore return 0 for that type.
var taskTypeMultiplier = map[TaskType]float64{
TaskBoilerplate: 0.6,
TaskGeneration: 1.0,
TaskRefactor: 0.9,
TaskReview: 1.1,
TaskUnitTest: 0.8,
TaskPlanning: 1.4,
TaskOrchestration: 1.5,
TaskSecurityReview: 2.0,
TaskDebug: 1.2,
TaskExplain: 0.7,
}
// QualityThreshold defines minimum acceptable quality for a task type.
// The three levels are listed from least to most demanding.
type QualityThreshold struct {
Minimum float64 // below → output is harmful, never accept
Acceptable float64 // good enough
Target float64 // ideal
}
// DefaultThresholds are calibrated for M4 heuristic scores (range roughly 0 to 0.85).
// M9 will replace these with bandit-derived values once quality data accumulates.
//
// Each entry is written positionally as {Minimum, Acceptable, Target}.
var DefaultThresholds = map[TaskType]QualityThreshold{
TaskBoilerplate: {0.40, 0.55, 0.70}, // any capable arm works
TaskGeneration: {0.45, 0.60, 0.75},
TaskRefactor: {0.50, 0.65, 0.78},
TaskReview: {0.55, 0.68, 0.80},
TaskUnitTest: {0.45, 0.60, 0.75},
TaskPlanning: {0.60, 0.72, 0.82},
TaskOrchestration: {0.65, 0.75, 0.83},
TaskSecurityReview: {0.70, 0.78, 0.84}, // requires thinking or large context window
TaskDebug: {0.50, 0.65, 0.78},
TaskExplain: {0.40, 0.55, 0.72},
}
// inferEffort derives the minimum required reasoning effort from the task's
// type and complexity score.
//
//   - Security review: always high.
//   - Orchestration: high at complexity >= 0.5, otherwise medium.
//   - Planning: high at complexity >= 0.7, otherwise medium.
//   - Debug / refactor / review: medium at >= 0.7, low at >= 0.4, else auto.
//   - Generation: medium at >= 0.8, else auto.
//   - Every other type: auto.
func inferEffort(task Task) provider.EffortLevel {
	complexity := task.ComplexityScore

	switch task.Type {
	case TaskSecurityReview:
		return provider.EffortHigh

	case TaskOrchestration, TaskPlanning:
		// Both types bottom out at medium; only the cut-over to high differs.
		highAt := 0.7
		if task.Type == TaskOrchestration {
			highAt = 0.5
		}
		if complexity >= highAt {
			return provider.EffortHigh
		}
		return provider.EffortMedium

	case TaskDebug, TaskRefactor, TaskReview:
		switch {
		case complexity >= 0.7:
			return provider.EffortMedium
		case complexity >= 0.4:
			return provider.EffortLow
		}

	case TaskGeneration:
		if complexity >= 0.8 {
			return provider.EffortMedium
		}
	}

	return provider.EffortAuto
}
// ClassifyTask infers a Task from the user's prompt using keyword heuristics.
//
// The prompt is lower-cased and matched by substring against an ordered list
// of keyword rules; the first rule that matches decides the task type. When
// no rule matches the task is treated as TaskGeneration. The result starts
// with PriorityNormal and RequiresTools=true, then has its complexity score,
// tool requirement and required effort filled in.
func ClassifyTask(prompt string) Task {
	lower := strings.ToLower(prompt)

	// rule is one row of the keyword table.
	type rule struct {
		taskType TaskType
		high     bool // raise Priority to PriorityHigh
		noTools  bool // clear RequiresTools
		keywords []string
	}

	// Order matters — more specific/common rules come first.
	// Orchestration is placed late: its keywords ("dispatch", "pipeline",
	// "orchestrat") appear as nouns in non-orchestration prompts (e.g.
	// "refactor the pipeline dispatch", "review the orchestration layer").
	// Operational task types must gate first.
	//
	// NOTE(review): the boilerplate keywords are tested after the generation
	// keywords, so a prompt such as "create a template" classifies as
	// generation, not boilerplate.
	rules := []rule{
		{taskType: TaskSecurityReview, high: true,
			keywords: []string{"security", "vulnerability", "cve", "owasp", "xss", "injection", "audit security"}},
		{taskType: TaskDebug,
			keywords: []string{"debug", "fix", "troubleshoot", "not working", "error", "crash", "failing", "bug"}},
		{taskType: TaskReview,
			keywords: []string{"review", "check", "analyze", "audit", "inspect"}},
		{taskType: TaskRefactor,
			keywords: []string{"refactor", "restructure", "reorganize", "clean up", "simplify"}},
		{taskType: TaskUnitTest,
			keywords: []string{"test", "spec", "coverage", "assert"}},
		{taskType: TaskExplain, noTools: true,
			keywords: []string{"explain", "what is", "how does", "describe", "tell me about"}},
		{taskType: TaskPlanning,
			keywords: []string{"plan", "architect", "design", "strategy", "roadmap"}},
		{taskType: TaskOrchestration, high: true,
			keywords: []string{"orchestrat", "coordinate", "dispatch", "pipeline",
				"fan out", "subtask", "delegate to", "spawn elf"}},
		{taskType: TaskGeneration,
			keywords: []string{"create", "implement", "build", "add", "write", "generate", "make"}},
		{taskType: TaskBoilerplate,
			keywords: []string{"scaffold", "boilerplate", "template", "stub", "skeleton"}},
	}

	// Defaults: generation when nothing matches, tools assumed necessary.
	task := Task{
		Type:          TaskGeneration,
		Priority:      PriorityNormal,
		RequiresTools: true,
	}
	for _, r := range rules {
		if !containsAny(lower, r.keywords...) {
			continue
		}
		task.Type = r.taskType
		if r.high {
			task.Priority = PriorityHigh
		}
		if r.noTools {
			task.RequiresTools = false
		}
		break
	}

	// Estimate complexity from prompt length and keywords, then apply the
	// per-task-type floor. A short "refactor X" prompt looks trivial by word
	// count but the task itself implies existing code and non-trivial
	// reasoning — clamping the floor up keeps such tasks out of the SLM
	// arm's MaxComplexity ceiling.
	score := estimateComplexity(lower)
	if floor := MinComplexityForType(task.Type); score < floor {
		score = floor
	}
	task.ComplexityScore = score

	// Trivial-prompt override: short, knowledge-only prompts whose task type
	// doesn't imply existing code to read or modify can run without tools —
	// making the SLM arm (ToolUse=false) feasible for genuinely tiny
	// questions like "what is 2+2?" or "explain a closure".
	if isTrivialPrompt(lower, task.Type, task.ComplexityScore) {
		task.RequiresTools = false
	}

	task.RequiredEffort = inferEffort(task)
	return task
}
// MinComplexityForType returns the inherent complexity floor for a task
// type. Tasks that imply existing code or multi-step reasoning get a
// non-zero floor so short prompts don't slip past the SLM arm's
// MaxComplexity ceiling.
//
// Floors: 0.6 for security review and orchestration; 0.4 for refactor,
// planning and debug; 0.35 for unit tests and review; 0 for everything else.
func MinComplexityForType(t TaskType) float64 {
	if t == TaskSecurityReview || t == TaskOrchestration {
		return 0.6
	}
	if t == TaskRefactor || t == TaskPlanning || t == TaskDebug {
		return 0.4
	}
	if t == TaskUnitTest || t == TaskReview {
		return 0.35
	}
	return 0
}
// trivialEligibleTypes are the task types where a "no tools needed" verdict
// is plausible from a short prompt alone. Debug / Refactor / Review / Test /
// SecurityReview / Orchestration all imply existing code or processes to
// touch — keep RequiresTools=true even if the wording is brief.
//
// isTrivialPrompt consults this map first; any type that is absent reads as
// false and is never considered trivial.
var trivialEligibleTypes = map[TaskType]bool{
TaskExplain: true,
TaskGeneration: true,
TaskBoilerplate: true,
}
// toolNeedingTokens names actions/objects that always require tool execution.
// Matched as whole words (whitespace-separated fields of the lower-cased
// prompt), not substrings — avoids treating "tester" as "test".
// isTrivialPrompt rejects a prompt as soon as one of its words is a key here.
var toolNeedingTokens = map[string]bool{
"read": true, "write": true, "edit": true, "create": true,
"delete": true, "remove": true, "list": true, "find": true,
"search": true, "grep": true, "open": true, "save": true,
"run": true, "execute": true, "compile": true, "build": true,
"test": true, "tests": true, "install": true, "commit": true,
"push": true, "pull": true, "diff": true, "file": true, "files": true,
}
// isTrivialPrompt is true when the prompt is short, low-complexity, of a
// type compatible with knowledge-only answers, and contains no token that
// implies a file/shell action.
//
// lower must already be lower-cased. The prompt qualifies only when all of
// the following hold:
//   - taskType is listed in trivialEligibleTypes,
//   - complexity is at most 0.15,
//   - the prompt has at most 12 whitespace-separated words,
//   - no word, after stripping surrounding punctuation, is a key of
//     toolNeedingTokens.
func isTrivialPrompt(lower string, taskType TaskType, complexity float64) bool {
	if !trivialEligibleTypes[taskType] || complexity > 0.15 {
		return false
	}

	words := strings.Fields(lower)
	if len(words) > 12 {
		return false
	}

	// Characters stripped from both ends of each word before the lookup.
	// Besides sentence punctuation this covers quotes, backticks, brackets
	// and markdown emphasis, so that a wrapped tool word such as `file`,
	// "tests" or (run) is still recognized instead of slipping through as
	// an unknown token.
	const wrapping = ".,!?;:\"'`()[]{}<>*"
	for _, w := range words {
		if toolNeedingTokens[strings.Trim(w, wrapping)] {
			return false
		}
	}
	return true
}
// containsAny reports whether s contains at least one of keywords as a
// substring. With no keywords it reports false.
func containsAny(s string, keywords ...string) bool {
	found := false
	for i := 0; i < len(keywords) && !found; i++ {
		found = strings.Contains(s, keywords[i])
	}
	return found
}
// estimateComplexity scores a prompt in [0, 1] from its length and a few
// keyword hints. The word count contributes words/200 (200 words = 1.0),
// each "complex" keyword found adds 0.15, each "simple" keyword found
// subtracts 0.15, and the total is clamped to [0, 1]. Keywords are matched
// by substring against prompt exactly as given (no case folding here).
func estimateComplexity(prompt string) float64 {
	// Length contributes to complexity.
	score := float64(len(strings.Fields(prompt))) / 200.0

	adjustments := []struct {
		delta    float64
		keywords []string
	}{
		// Complexity keywords raise the score.
		{0.15, []string{"implement", "design", "architect", "system", "integration", "migrate", "optimize"}},
		// Simple keywords reduce it.
		{-0.15, []string{"rename", "format", "add field", "change name", "typo", "simple"}},
	}
	for _, adj := range adjustments {
		for _, kw := range adj.keywords {
			if strings.Contains(prompt, kw) {
				score += adj.delta
			}
		}
	}

	// Clamp to [0, 1].
	switch {
	case score < 0:
		return 0
	case score > 1:
		return 1
	}
	return score
}
// ParseTaskTypeStrict is like ParseTaskType but reports whether the input
// matched a known type. Used by config wiring to surface typos in
// user-supplied task-type names instead of silently falling back to
// TaskGeneration.
//
// Matching is case-insensitive. Surrounding whitespace is ignored, and
// underscores, hyphens and spaces between words are treated as optional
// separators, so "security_review", "Security-Review", "security review"
// and " SecurityReview\n" all resolve to TaskSecurityReview.
//
// When nothing matches it returns (TaskGeneration, false).
func ParseTaskTypeStrict(s string) (TaskType, bool) {
	// Collapse the input to a separator-free lowercase key. Without the trim
	// and the extra separators, near-miss spellings that are common in
	// hand-written config and model output were rejected as unknown.
	key := strings.Map(func(r rune) rune {
		switch r {
		case '_', '-', ' ':
			return -1 // drop separator
		}
		return r
	}, strings.ToLower(strings.TrimSpace(s)))

	switch key {
	case "debug":
		return TaskDebug, true
	case "explain":
		return TaskExplain, true
	case "generation":
		return TaskGeneration, true
	case "refactor":
		return TaskRefactor, true
	case "unittest":
		return TaskUnitTest, true
	case "boilerplate":
		return TaskBoilerplate, true
	case "planning":
		return TaskPlanning, true
	case "orchestration":
		return TaskOrchestration, true
	case "securityreview":
		return TaskSecurityReview, true
	case "review":
		return TaskReview, true
	}
	return TaskGeneration, false
}
// ParseTaskType converts a string from an SLM JSON response to a TaskType.
// Matching is case-insensitive. Unknown strings fall back to TaskGeneration.
// Use ParseTaskTypeStrict when you need to detect typos.
func ParseTaskType(s string) TaskType {
	// The match flag is dropped on purpose: this lenient variant always
	// returns a usable type, relying on the strict parser's fallback value.
	parsed, _ := ParseTaskTypeStrict(s)
	return parsed
}