fix(m8): replace_default map, error UX, benchmarks, and launch prep

- Fix replace_default positional bug: []string → map[string]string for explicit MCP tool → built-in name mapping
- Improve error messages for missing API keys (3 actionable options) and unknown providers (early validation with available list)
- Remove python3 dependency from MCP tests (grep/sed/cut parsing in the bash test server)
- Add router benchmark scaffold (6 benchmarks in bench_test.go + docs)
- Add .goreleaser.yml for cross-platform binary releases with ldflags
- Add launch-ready README with quickstart, extensibility docs, GIF placeholder
- Add CONTRIBUTING.md and Gitea issue templates (bug report, feature request)
2026-04-12 03:34:58 +02:00
parent d2d79d65da
commit d7b524664d
15 changed files with 530 additions and 34 deletions
--- a/.gitea/issue_template/bug_report.yaml
+++ b/.gitea/issue_template/bug_report.yaml
@@ -0,0 +1,58 @@
+name: Bug Report
+about: Report something that isn't working correctly
+labels:
+  - bug
+body:
+  - type: textarea
+    id: description
+    attributes:
+      label: Description
+      description: What happened? What did you expect?
+    validations:
+      required: true
+  - type: textarea
+    id: reproduction
+    attributes:
+      label: Steps to reproduce
+      description: Minimal steps to trigger the issue
+      placeholder: |
+        1. Run `gnoma --provider anthropic`
+        2. Type "..."
+        3. See error
+    validations:
+      required: true
+  - type: input
+    id: version
+    attributes:
+      label: gnoma version
+      description: Output of `gnoma --version`
+      placeholder: "gnoma 0.1.0 (abc1234, 2026-04-12)"
+    validations:
+      required: true
+  - type: input
+    id: os
+    attributes:
+      label: OS / Architecture
+      placeholder: "Linux x86_64 / macOS arm64 / Windows amd64"
+    validations:
+      required: true
+  - type: dropdown
+    id: provider
+    attributes:
+      label: Provider
+      options:
+        - mistral
+        - anthropic
+        - openai
+        - google
+        - ollama
+        - llamacpp
+        - N/A
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant logs
+      description: Run with `--verbose` for debug output
+      render: shell
--- a/.gitea/issue_template/feature_request.yaml
+++ b/.gitea/issue_template/feature_request.yaml
@@ -0,0 +1,42 @@
+name: Feature Request
+about: Suggest an improvement or new capability
+labels:
+  - enhancement
+body:
+  - type: textarea
+    id: problem
+    attributes:
+      label: Problem
+      description: What are you trying to do that gnoma doesn't support well?
+    validations:
+      required: true
+  - type: textarea
+    id: solution
+    attributes:
+      label: Proposed solution
+      description: How would you like this to work?
+    validations:
+      required: true
+  - type: textarea
+    id: alternatives
+    attributes:
+      label: Alternatives considered
+      description: Other approaches you've thought about
+    validations:
+      required: false
+  - type: dropdown
+    id: area
+    attributes:
+      label: Area
+      options:
+        - providers
+        - tools
+        - router
+        - TUI
+        - MCP / plugins
+        - elfs (sub-agents)
+        - security
+        - config
+        - other
+    validations:
+      required: false
--- a/.goreleaser.yml
+++ b/.goreleaser.yml
@@ -0,0 +1,47 @@
+version: 2
+
+before:
+  hooks:
+    - go mod tidy
+
+builds:
+  - main: ./cmd/gnoma
+    binary: gnoma
+    env:
+      - CGO_ENABLED=0
+    goos:
+      - linux
+      - darwin
+      - windows
+    goarch:
+      - amd64
+      - arm64
+    ldflags:
+      - -s -w
+      - -X main.buildVersion={{.Version}}
+      - -X main.buildCommit={{.ShortCommit}}
+      - -X main.buildDate={{.Date}}
+
+archives:
+  - formats: [tar.gz]
+    format_overrides:
+      - goos: windows
+        formats: [zip]
+    name_template: >-
+      {{ .ProjectName }}_{{ .Version }}_{{ .Os }}_{{ .Arch }}
+
+checksum:
+  name_template: checksums.txt
+
+changelog:
+  sort: asc
+  filters:
+    exclude:
+      - "^docs:"
+      - "^test:"
+      - "^chore:"
+
+release:
+  gitea:
+    owner: Owlibou
+    name: gnoma
+
+# GoReleaser has no default Gitea instance: the API endpoint must be set
+# explicitly or the release step cannot resolve the server. The download URL
+# is derived from it when left unset.
+# NOTE(review): assumes somegit.dev serves the standard Gitea /api/v1 path — confirm.
+gitea_urls:
+  api: https://somegit.dev/api/v1
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,53 @@
+# Contributing to gnoma
+
+## Setup
+
+```sh
+git clone https://somegit.dev/Owlibou/gnoma && cd gnoma
+make build   # requires Go 1.26+
+make test
+make lint    # requires golangci-lint
+```
+
+## Development workflow
+
+1. Create a branch from `main`
+2. Write tests first (TDD) — table-driven, `t.TempDir()` for filesystem tests
+3. `make check` (fmt + vet + lint + test) must pass
+4. Commit with conventional messages: `feat:`, `fix:`, `refactor:`, `test:`, `docs:`
+
+## Code style
+
+- Go 1.26 idioms (`new(expr)`, `errors.AsType[E]`)
+- Structured logging with `log/slog`
+- `json.RawMessage` for tool schemas (zero-cost passthrough)
+- Functional options for complex configuration
+- Short, lowercase package names — no underscores
+
+## Testing
+
+- Unit tests: `make test`
+- Integration tests (require API keys): `make test-integration`
+- Coverage: `make cover`
+- Benchmarks: `go test -bench=. ./internal/router/`
+
+Integration tests use `//go:build integration` and are skipped by default.
+
+## Architecture
+
+Read `docs/essentials/INDEX.md` before making architectural changes. Key packages:
+
+| Package | Purpose |
+|---------|---------|
+| `internal/engine` | Agentic loop (stream → tool → re-query) |
+| `internal/router` | Multi-armed bandit arm selection |
+| `internal/provider` | LLM provider adapters |
+| `internal/tool` | Tool interface + registry |
+| `internal/mcp` | MCP client (JSON-RPC over stdio) |
+| `internal/plugin` | Plugin manifest, loader, manager |
+| `internal/elf` | Sub-agent (elf) system |
+| `internal/tui` | Bubble Tea terminal UI |
+
+## Issues
+
+Use the issue templates when filing bugs or requesting features. Include reproduction steps, expected behavior, and gnoma version (`gnoma --version`).
--- a/README.md
+++ b/README.md
@@ -1,7 +1,28 @@
 # gnoma

-Provider-agnostic agentic coding assistant in Go.
-Named after the northern pygmy-owl (*Glaucidium gnoma*). Agents are called **elfs** (elf owl).
+**A provider-agnostic agentic coding assistant built in Go.** gnoma routes tasks to the best available LLM — cloud or local — through a multi-armed bandit router, while tools, hooks, skills, MCP servers, and plugins keep it extensible. Named after the northern pygmy-owl (*Glaucidium gnoma*); agents are called **elfs** (elf owl).
+
+<!-- TODO: replace with actual demo recording -->
+<!-- ![demo](docs/assets/demo.gif) -->
+
+## Quickstart
+
+```sh
+# Install
+go install somegit.dev/Owlibou/gnoma/cmd/gnoma@latest
+
+# Or build from source
+git clone https://somegit.dev/Owlibou/gnoma && cd gnoma
+make build    # binary at ./bin/gnoma
+
+# Set at least one provider key
+export ANTHROPIC_API_KEY=sk-ant-...   # or OPENAI_API_KEY, MISTRAL_API_KEY, GEMINI_API_KEY
+
+# Run
+gnoma                                 # interactive TUI
+echo "list files" | gnoma             # pipe mode
+gnoma --provider ollama               # use a local model
+```

 ## Build

@@ -104,6 +125,61 @@ llamacpp = "http://localhost:9090/v1"

 ---

+## Extensibility (M8)
+
+gnoma supports hooks, skills, MCP servers, and plugins.
+
+### MCP Servers
+
+Connect any [MCP](https://modelcontextprotocol.io)-compatible tool server:
+
+```toml
+[[mcp_servers]]
+name    = "git"
+command = "mcp-server-git"
+args    = ["--repo", "."]
+timeout = "30s"
+
+# Replace a built-in tool with an MCP tool
+[mcp_servers.replace_default]
+exec = "bash"   # MCP tool "exec" replaces gnoma's built-in "bash"
+```
+
+MCP tools appear as `mcp__{server}__{tool}` (e.g., `mcp__git__status`), or under the built-in name when using `replace_default`.
+
+### Skills
+
+Drop markdown files into `.gnoma/skills/` or `~/.config/gnoma/skills/`:
+
+```
+/skillname          # invoke a skill
+/skills             # list available skills
+```
+
+### Hooks
+
+Run shell commands on tool events:
+
+```toml
+[[hooks]]
+name         = "block-rm-rf"
+event        = "pre_tool_use"
+type         = "command"
+exec         = "bash-safety-check.sh"
+tool_pattern = "bash*"
+```
+
+### Plugins
+
+Bundle skills, hooks, and MCP configs into installable plugins:
+
+```sh
+gnoma plugin install ./my-plugin    # install from directory
+gnoma plugin list                   # list installed plugins
+```
+
+---
+
 ## Session Persistence

 Conversations are auto-saved to `.gnoma/sessions/` after each completed turn. On a crash you lose at most the current in-flight turn; all previously completed turns are safe.
--- a/cmd/gnoma/main.go
+++ b/cmd/gnoma/main.go
@@ -46,6 +46,13 @@ import (
 	"somegit.dev/Owlibou/gnoma/internal/tool/sysinfo"
 )

+// Build metadata printed by --version. The defaults identify a build made
+// without linker overrides; release builds overwrite them through the
+// -X main.buildVersion, -X main.buildCommit and -X main.buildDate ldflags
+// configured in .goreleaser.yml.
+var (
+	buildVersion = "dev"
+	buildCommit  = "none"
+	buildDate    = "unknown"
+)
+
 func main() {
 	var resumeFlag string
 	var (
@@ -64,7 +71,7 @@ func main() {
 	flag.Parse()

 	if *version {
-		fmt.Println("gnoma v0.1.0-dev")
+		fmt.Printf("gnoma %s (%s, %s)\n", buildVersion, buildCommit, buildDate)
 		os.Exit(0)
 	}

@@ -123,7 +130,17 @@ func main() {
 	}

 	// Resolve API key: CLI flag → config → env vars
+	knownProviders := map[string]bool{
+		"mistral": true, "anthropic": true, "openai": true,
+		"google": true, "ollama": true, "llamacpp": true,
+	}
 	localProviders := map[string]bool{"ollama": true, "llamacpp": true}
+
+	if !knownProviders[*providerName] {
+		fmt.Fprintf(os.Stderr, "error: unknown provider %q\n  available: mistral, anthropic, openai, google, ollama, llamacpp\n  usage:     gnoma --provider <name>\n", *providerName)
+		os.Exit(1)
+	}
+
 	key := *apiKey
 	if key == "" {
 		if cfgKey, ok := cfg.Provider.APIKeys[*providerName]; ok && cfgKey != "" {
@@ -134,8 +151,14 @@ func main() {
 		key = resolveAPIKey(*providerName)
 	}
 	if key == "" && !localProviders[*providerName] {
-		fmt.Fprintf(os.Stderr, "error: no API key for provider %q\nSet %s environment variable or use --api-key\n",
-			*providerName, envKeyFor(*providerName))
+		envVar := envKeyFor(*providerName)
+		fmt.Fprintf(os.Stderr, "error: no API key for provider %q\n\n", *providerName)
+		fmt.Fprintf(os.Stderr, "  Option 1: export %s=<your-key>\n", envVar)
+		fmt.Fprintf(os.Stderr, "  Option 2: gnoma --api-key <your-key>\n")
+		fmt.Fprintf(os.Stderr, "  Option 3: add to .gnoma/config.toml:\n")
+		fmt.Fprintf(os.Stderr, "            [provider.api_keys]\n")
+		fmt.Fprintf(os.Stderr, "            %s = \"<your-key>\"\n\n", *providerName)
+		fmt.Fprintf(os.Stderr, "For local models (no API key needed): gnoma --provider ollama\n")
 		os.Exit(1)
 	}

@@ -735,7 +758,7 @@ func createProvider(name, apiKey, model, baseURL string) (provider.Provider, err
 	case "llamacpp":
 		return openaicompat.NewLlamaCpp(cfg)
 	default:
-		return nil, fmt.Errorf("unknown provider %q (supports: mistral, anthropic, openai, google, ollama, llamacpp)", name)
+		return nil, fmt.Errorf("unknown provider %q\n  available: mistral, anthropic, openai, google, ollama, llamacpp\n  usage:     gnoma --provider <name>", name)
 	}
 }

--- a/docs/benchmarks/README.md
+++ b/docs/benchmarks/README.md
@@ -0,0 +1,43 @@
+# Router Benchmarks
+
+Tracking how gnoma's multi-armed bandit router (M4 heuristic, M9 bandit) performs across providers, task types, and cost envelopes.
+
+## Methodology
+
+Each benchmark run:
+
+1. Registers a set of arms (provider/model pairs) with known cost profiles
+2. Generates synthetic tasks across all 10 task types with varying complexity
+3. Runs N routing decisions and records: arm selected, latency, quality score, cost
+4. Reports convergence metrics after simulated quality feedback
+
+## Metrics
+
+| Metric | Description |
+|--------|-------------|
+| **Selection accuracy** | % of tasks routed to the optimal arm (vs. oracle with perfect knowledge) |
+| **Cost efficiency** | Total cost relative to always-cheapest and always-best-quality baselines |
+| **Convergence speed** | Observations needed before bandit matches heuristic on quality (M9) |
+| **Pool utilization** | % of rate limit budget consumed before exhaustion |
+| **Latency overhead** | Time spent in Select() excluding provider round-trip |
+
+## Running
+
+```sh
+# Go benchmarks (in-process, no real API calls)
+go test -bench=. -benchmem ./internal/router/
+
+# Synthetic routing simulation (when available)
+go run ./cmd/gnoma-bench/ --arms=5 --tasks=1000 --seed=42
+```
+
+## Results
+
+No benchmark results yet. This scaffold will be populated as M9 (Router Advanced) lands.
+
+### Planned comparisons
+
+- Heuristic-only (M4) vs. bandit (M9) after 50, 200, 1000 observations
+- 2-arm (local + cloud) vs. 5-arm (mixed providers) scenarios
+- Cost-capped routing: $5/day budget with mixed task load
+- Quality degradation under rate limit pressure (pool scarcity)
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -25,14 +25,14 @@ type Config struct {
 //	args = ["--repo", "."]
 //	env = { GIT_DIR = ".git" }
 //	timeout = "30s"
-//	replace_default = ["bash"]
+//	replace_default = { exec = "bash" }  # MCP tool "exec" replaces built-in "bash"
 type MCPServerConfig struct {
 	Name           string            `toml:"name"`
 	Command        string            `toml:"command"`
 	Args           []string          `toml:"args"`
 	Env            map[string]string `toml:"env"`
 	Timeout        string            `toml:"timeout"`
-	ReplaceDefault []string          `toml:"replace_default"`
+	ReplaceDefault map[string]string `toml:"replace_default"` // MCP tool name → built-in name
 }

 // PluginsSection controls plugin loading.
--- a/internal/mcp/client_test.go
+++ b/internal/mcp/client_test.go
@@ -28,14 +28,14 @@ func writeMCPServer(t *testing.T, tools []MCPTool, callResult string) string {
 	os.WriteFile(filepath.Join(dir, "tools.json"), toolsJSON, 0o644)
 	os.WriteFile(filepath.Join(dir, "call.json"), []byte(callResult), 0o644)

-	// The script uses jq-free JSON construction: reads response payload from
-	// file and wraps it in a JSON-RPC envelope using python (widely available).
+	// The script parses requests with grep/sed/cut only — no python3 or jq dependency.
+	// "method" and "id" are extracted by pattern, which relies on the client sending
+	// compact single-line JSON (no whitespace after ':') with a numeric id.
 	script := filepath.Join(dir, "mcp-server.sh")
 	content := `#!/bin/bash
 DIR="` + dir + `"
 while IFS= read -r line; do
-  method=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('method',''))" 2>/dev/null)
-  id=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',0))" 2>/dev/null)
+  method=$(echo "$line" | grep -o '"method":"[^"]*"' | head -1 | sed 's/"method":"//;s/"//')
+  id=$(echo "$line" | grep -o '"id":[0-9]*' | head -1 | cut -d: -f2)

  case "$method" in
    initialize)
--- a/internal/mcp/config.go
+++ b/internal/mcp/config.go
@@ -16,7 +16,7 @@ type ServerConfig struct {
 	Args           []string
 	Env            map[string]string
 	Timeout        time.Duration
-	ReplaceDefault []string
+	ReplaceDefault map[string]string // MCP tool name → built-in name to replace
 }

 // ParseServerConfigs validates and converts raw config entries.
--- a/internal/mcp/config_test.go
+++ b/internal/mcp/config_test.go
@@ -15,7 +15,7 @@ func TestParseServerConfigs_Valid(t *testing.T) {
 			Args:           []string{"--repo", "."},
 			Env:            map[string]string{"GIT_DIR": ".git"},
 			Timeout:        "10s",
-			ReplaceDefault: []string{"bash"},
+			ReplaceDefault: map[string]string{"exec": "bash"},
 		},
 		{
 			Name:    "docker",
@@ -37,8 +37,8 @@ func TestParseServerConfigs_Valid(t *testing.T) {
 	if got[0].Timeout != 10*time.Second {
 		t.Errorf("config[0].Timeout = %v, want %v", got[0].Timeout, 10*time.Second)
 	}
-	if len(got[0].ReplaceDefault) != 1 || got[0].ReplaceDefault[0] != "bash" {
-		t.Errorf("config[0].ReplaceDefault = %v, want [bash]", got[0].ReplaceDefault)
+	if got[0].ReplaceDefault["exec"] != "bash" {
+		t.Errorf("config[0].ReplaceDefault = %v, want map[exec:bash]", got[0].ReplaceDefault)
 	}

 	// Second config should get default timeout.
--- a/internal/mcp/manager.go
+++ b/internal/mcp/manager.go
@@ -84,23 +84,13 @@ func (m *Manager) startServer(ctx context.Context, srv ServerConfig) (*Client, e
 }

 func (m *Manager) registerTools(srv ServerConfig, tools []MCPTool, client *Client, registry *tool.Registry) {
-	replaceSet := make(map[string]bool, len(srv.ReplaceDefault))
-	for _, name := range srv.ReplaceDefault {
-		replaceSet[name] = true
-	}
-
 	for _, mt := range tools {
 		adapter := NewAdapter(srv.Name, mt, client)

-		// Check if any replace_default entry matches this MCP tool.
-		// Match by checking if the MCP tool name appears in a replace target,
-		// or assign replacements in order.
-		for _, replaceName := range srv.ReplaceDefault {
-			if replaceSet[replaceName] {
-				adapter.SetOverrideName(replaceName)
-				delete(replaceSet, replaceName)
-				break
-			}
+		// Explicit mapping: if this MCP tool name has a replace_default entry,
+		// register it under the built-in's name instead of mcp__{server}__{tool}.
+		if builtinName, ok := srv.ReplaceDefault[mt.Name]; ok {
+			adapter.SetOverrideName(builtinName)
 		}

 		registry.Register(adapter)
--- a/internal/mcp/manager_test.go
+++ b/internal/mcp/manager_test.go
@@ -71,7 +71,7 @@ func TestManager_StartAll_ReplaceDefault(t *testing.T) {
 			Command:        "bash",
 			Args:           []string{script},
 			Timeout:        5 * time.Second,
-			ReplaceDefault: []string{"bash"},
+			ReplaceDefault: map[string]string{"exec": "bash"},
 		},
 	}, reg)
 	if err != nil {
@@ -170,7 +170,7 @@ func TestManager_StartAll_ReplaceDefault_PicksMatchingTool(t *testing.T) {
 			Command:        "bash",
 			Args:           []string{script},
 			Timeout:        5 * time.Second,
-			ReplaceDefault: []string{"fs.read", "fs.write"},
+			ReplaceDefault: map[string]string{"read": "fs.read", "write": "fs.write"},
 		},
 	}, reg)
 	if err != nil {
--- a/internal/mcp/tool_test.go
+++ b/internal/mcp/tool_test.go
@@ -165,8 +165,8 @@ func TestAdapter_Execute_RPCError(t *testing.T) {
 	content := `#!/bin/bash
 DIR="` + dir + `"
 while IFS= read -r line; do
-  method=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('method',''))" 2>/dev/null)
-  id=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',0))" 2>/dev/null)
+  method=$(echo "$line" | grep -o '"method":"[^"]*"' | head -1 | sed 's/"method":"//;s/"//')
+  id=$(echo "$line" | grep -o '"id":[0-9]*' | head -1 | cut -d: -f2)

  case "$method" in
    initialize)
--- a/internal/router/bench_test.go
+++ b/internal/router/bench_test.go
@@ -0,0 +1,164 @@
+package router
+
+import (
+	"testing"
+
+	"somegit.dev/Owlibou/gnoma/internal/provider"
+)
+
+// benchArms builds the fixed five-arm pool shared by the router benchmarks:
+// four priced arms (one of them thinking-capable) and one zero-cost arm
+// flagged as local. Every arm advertises tool use. A new slice with new Arm
+// values is returned on each call.
+func benchArms() []*Arm {
+	sonnet := &Arm{
+		ID:              "anthropic/claude-sonnet",
+		ModelName:       "claude-sonnet",
+		Capabilities:    provider.Capabilities{ToolUse: true, ContextWindow: 200000},
+		CostPer1kInput:  0.003,
+		CostPer1kOutput: 0.015,
+	}
+	opus := &Arm{
+		ID:              "anthropic/claude-opus",
+		ModelName:       "claude-opus",
+		Capabilities:    provider.Capabilities{ToolUse: true, ContextWindow: 200000, Thinking: true},
+		CostPer1kInput:  0.015,
+		CostPer1kOutput: 0.075,
+	}
+	gpt4o := &Arm{
+		ID:              "openai/gpt-4o",
+		ModelName:       "gpt-4o",
+		Capabilities:    provider.Capabilities{ToolUse: true, ContextWindow: 128000},
+		CostPer1kInput:  0.005,
+		CostPer1kOutput: 0.015,
+	}
+	// The only local arm; its costs are spelled out as zero on purpose.
+	qwen := &Arm{
+		ID:              "ollama/qwen3:8b",
+		ModelName:       "qwen3:8b",
+		IsLocal:         true,
+		Capabilities:    provider.Capabilities{ToolUse: true, ContextWindow: 32000},
+		CostPer1kInput:  0,
+		CostPer1kOutput: 0,
+	}
+	mistralLarge := &Arm{
+		ID:              "mistral/mistral-large",
+		ModelName:       "mistral-large",
+		Capabilities:    provider.Capabilities{ToolUse: true, ContextWindow: 128000},
+		CostPer1kInput:  0.002,
+		CostPer1kOutput: 0.006,
+	}
+
+	return []*Arm{sonnet, opus, gpt4o, qwen, mistralLarge}
+}
+
+// benchTasks returns the ten-task workload used by the router benchmarks.
+// Each entry has a distinct Type; priority, estimated token count, tool
+// requirement and complexity score vary across the set. RequiresTools is
+// left at its zero value (false) for the tasks that do not need tools.
+func benchTasks() []Task {
+	workload := []Task{
+		{Type: TaskBoilerplate, ComplexityScore: 0.1, EstimatedTokens: 500, Priority: PriorityLow, RequiresTools: true},
+		{Type: TaskGeneration, ComplexityScore: 0.5, EstimatedTokens: 2000, Priority: PriorityNormal, RequiresTools: true},
+		{Type: TaskRefactor, ComplexityScore: 0.6, EstimatedTokens: 3000, Priority: PriorityNormal, RequiresTools: true},
+		{Type: TaskReview, ComplexityScore: 0.5, EstimatedTokens: 4000, Priority: PriorityHigh},
+		{Type: TaskUnitTest, ComplexityScore: 0.4, EstimatedTokens: 1500, Priority: PriorityNormal, RequiresTools: true},
+		{Type: TaskPlanning, ComplexityScore: 0.8, EstimatedTokens: 5000, Priority: PriorityHigh},
+		{Type: TaskOrchestration, ComplexityScore: 0.9, EstimatedTokens: 8000, Priority: PriorityCritical, RequiresTools: true},
+		{Type: TaskSecurityReview, ComplexityScore: 0.85, EstimatedTokens: 6000, Priority: PriorityCritical, RequiresTools: true},
+		{Type: TaskDebug, ComplexityScore: 0.6, EstimatedTokens: 3000, Priority: PriorityNormal, RequiresTools: true},
+		{Type: TaskExplain, ComplexityScore: 0.2, EstimatedTokens: 1000, Priority: PriorityLow},
+	}
+	return workload
+}
+
+// BenchmarkSelectBest times selectBest over the full benchTasks workload
+// against the benchArms pool, using a freshly constructed quality tracker.
+// One benchmark iteration covers every task once. Setup runs before b.Loop,
+// which resets the timer on its first call, so no b.ResetTimer is needed.
+func BenchmarkSelectBest(b *testing.B) {
+	arms := benchArms()
+	tasks := benchTasks()
+	qt := NewQualityTracker()
+
+	for b.Loop() {
+		for _, task := range tasks {
+			selectBest(qt, arms, task)
+		}
+	}
+}
+
+// BenchmarkFilterFeasible times filterFeasible over the full benchTasks
+// workload against the benchArms pool. One benchmark iteration covers every
+// task once. Setup runs before b.Loop, which resets the timer on its first
+// call, so no b.ResetTimer is needed.
+func BenchmarkFilterFeasible(b *testing.B) {
+	arms := benchArms()
+	tasks := benchTasks()
+
+	for b.Loop() {
+		for _, task := range tasks {
+			filterFeasible(arms, task)
+		}
+	}
+}
+
+// BenchmarkRouterSelect times the public routing path on a router built from
+// a zero-value Config with the benchArms pool registered. Each benchmark
+// iteration calls Select for every benchTasks entry and, when the decision
+// carries no error, commits the task's estimated token count. Decisions that
+// carry an error are skipped without failing the benchmark. Setup runs before
+// b.Loop, which resets the timer on its first call, so no b.ResetTimer is
+// needed.
+func BenchmarkRouterSelect(b *testing.B) {
+	r := New(Config{})
+	for _, arm := range benchArms() {
+		r.RegisterArm(arm)
+	}
+	tasks := benchTasks()
+
+	for b.Loop() {
+		for _, task := range tasks {
+			d := r.Select(task)
+			if d.Error == nil {
+				d.Commit(task.EstimatedTokens)
+			}
+		}
+	}
+}
+
+// BenchmarkScoreArm times scoreArm for a single fixed generation task. Each
+// benchmark iteration scores every arm in the benchArms pool once against a
+// freshly constructed quality tracker. Setup runs before b.Loop, which resets
+// the timer on its first call, so no b.ResetTimer is needed.
+func BenchmarkScoreArm(b *testing.B) {
+	arms := benchArms()
+	qt := NewQualityTracker()
+	task := Task{Type: TaskGeneration, Priority: PriorityNormal, EstimatedTokens: 2000, RequiresTools: true, ComplexityScore: 0.5}
+
+	for b.Loop() {
+		for _, arm := range arms {
+			scoreArm(qt, arm, task)
+		}
+	}
+}
+
+// BenchmarkClassifyTask times ClassifyTask over ten fixed natural-language
+// prompts. Each benchmark iteration classifies every prompt once and discards
+// the result. Setup runs before b.Loop, which resets the timer on its first
+// call, so no b.ResetTimer is needed.
+func BenchmarkClassifyTask(b *testing.B) {
+	prompts := []string{
+		"fix the null pointer in handleRequest",
+		"explain how the router selects arms",
+		"refactor the authentication middleware to use the new session store",
+		"add a new endpoint for user profile updates",
+		"review the security of the payment processing flow for OWASP vulnerabilities",
+		"write unit tests for the pool tracker",
+		"plan the architecture for the plugin system",
+		"scaffold a new provider adapter for Cohere",
+		"orchestrate a multi-step migration: backup, schema change, data backfill, verify",
+		"debug why the TUI freezes when streaming large responses",
+	}
+
+	for b.Loop() {
+		for _, p := range prompts {
+			ClassifyTask(p)
+		}
+	}
+}
+
+// BenchmarkRouterSelectWithQuality times the same Select/Commit path as
+// BenchmarkRouterSelect, but against a router whose quality tracker already
+// holds history for every arm/task-type pair. Registration and seeding run
+// before b.Loop, which resets the timer on its first call, so neither is
+// measured and no b.ResetTimer is needed.
+func BenchmarkRouterSelectWithQuality(b *testing.B) {
+	r := New(Config{})
+	arms := benchArms()
+	for _, arm := range arms {
+		r.RegisterArm(arm)
+	}
+	tasks := benchTasks()
+
+	// Seed 23 observations per arm/task-type pair: 20 recorded as true
+	// followed by 3 recorded as false, so the history is not uniformly
+	// positive.
+	for _, arm := range arms {
+		for _, task := range tasks {
+			for range 20 {
+				r.quality.Record(arm.ID, task.Type, true)
+			}
+			for range 3 {
+				r.quality.Record(arm.ID, task.Type, false)
+			}
+		}
+	}
+
+	for b.Loop() {
+		for _, task := range tasks {
+			d := r.Select(task)
+			// Decisions that carry an error are skipped, not treated as a
+			// benchmark failure.
+			if d.Error == nil {
+				d.Commit(task.EstimatedTokens)
+			}
+		}
+	}
+}