From d7b524664d67a07c56b61426187bc9f4947b23a1 Mon Sep 17 00:00:00 2001 From: vikingowl Date: Sun, 12 Apr 2026 03:34:58 +0200 Subject: [PATCH] fix(m8): replace_default map, error UX, benchmarks, and launch prep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix replace_default positional bug: []string → map[string]string for explicit MCP tool → built-in name mapping - Improve error messages for missing API keys (3 actionable options) and unknown providers (early validation with available list) - Remove python3 dependency from MCP tests (pure bash grep/sed parsing) - Add router benchmark scaffold (6 benchmarks in bench_test.go + docs) - Add .goreleaser.yml for cross-platform binary releases with ldflags - Add launch-ready README with quickstart, extensibility docs, GIF placeholder - Add CONTRIBUTING.md and Gitea issue templates (bug report, feature request) --- .gitea/issue_template/bug_report.yaml | 58 ++++++++ .gitea/issue_template/feature_request.yaml | 42 ++++++ .goreleaser.yml | 47 ++++++ CONTRIBUTING.md | 53 +++++++ README.md | 80 +++++++++- cmd/gnoma/main.go | 31 +++- docs/benchmarks/README.md | 43 ++++++ internal/config/config.go | 4 +- internal/mcp/client_test.go | 8 +- internal/mcp/config.go | 2 +- internal/mcp/config_test.go | 6 +- internal/mcp/manager.go | 18 +-- internal/mcp/manager_test.go | 4 +- internal/mcp/tool_test.go | 4 +- internal/router/bench_test.go | 164 +++++++++++++++++++++ 15 files changed, 530 insertions(+), 34 deletions(-) create mode 100644 .gitea/issue_template/bug_report.yaml create mode 100644 .gitea/issue_template/feature_request.yaml create mode 100644 .goreleaser.yml create mode 100644 CONTRIBUTING.md create mode 100644 docs/benchmarks/README.md create mode 100644 internal/router/bench_test.go diff --git a/.gitea/issue_template/bug_report.yaml b/.gitea/issue_template/bug_report.yaml new file mode 100644 index 0000000..21a3d17 --- /dev/null +++ b/.gitea/issue_template/bug_report.yaml @@ 
-0,0 +1,58 @@ +name: Bug Report +about: Report something that isn't working correctly +labels: + - bug +body: + - type: textarea + id: description + attributes: + label: Description + description: What happened? What did you expect? + validations: + required: true + - type: textarea + id: reproduction + attributes: + label: Steps to reproduce + description: Minimal steps to trigger the issue + placeholder: | + 1. Run `gnoma --provider anthropic` + 2. Type "..." + 3. See error + validations: + required: true + - type: input + id: version + attributes: + label: gnoma version + description: Output of `gnoma --version` + placeholder: "gnoma 0.1.0 (abc1234, 2026-04-12)" + validations: + required: true + - type: input + id: os + attributes: + label: OS / Architecture + placeholder: "Linux x86_64 / macOS arm64 / Windows amd64" + validations: + required: true + - type: dropdown + id: provider + attributes: + label: Provider + options: + - mistral + - anthropic + - openai + - google + - ollama + - llamacpp + - N/A + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant logs + description: Run with `--verbose` for debug output + render: shell diff --git a/.gitea/issue_template/feature_request.yaml b/.gitea/issue_template/feature_request.yaml new file mode 100644 index 0000000..a1849bd --- /dev/null +++ b/.gitea/issue_template/feature_request.yaml @@ -0,0 +1,42 @@ +name: Feature Request +about: Suggest an improvement or new capability +labels: + - enhancement +body: + - type: textarea + id: problem + attributes: + label: Problem + description: What are you trying to do that gnoma doesn't support well? + validations: + required: true + - type: textarea + id: solution + attributes: + label: Proposed solution + description: How would you like this to work? 
+ validations: + required: true + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + description: Other approaches you've thought about + validations: + required: false + - type: dropdown + id: area + attributes: + label: Area + options: + - providers + - tools + - router + - TUI + - MCP / plugins + - elfs (sub-agents) + - security + - config + - other + validations: + required: false diff --git a/.goreleaser.yml b/.goreleaser.yml new file mode 100644 index 0000000..3130385 --- /dev/null +++ b/.goreleaser.yml @@ -0,0 +1,47 @@ +version: 2 + +before: + hooks: + - go mod tidy + +builds: + - main: ./cmd/gnoma + binary: gnoma + env: + - CGO_ENABLED=0 + goos: + - linux + - darwin + - windows + goarch: + - amd64 + - arm64 + ldflags: + - -s -w + - -X main.buildVersion={{.Version}} + - -X main.buildCommit={{.ShortCommit}} + - -X main.buildDate={{.Date}} + +archives: + - formats: [tar.gz] + format_overrides: + - goos: windows + formats: [zip] + name_template: >- + {{ .ProjectName }}_{{ .Version }}_{{ .Os }}_{{ .Arch }} + +checksum: + name_template: checksums.txt + +changelog: + sort: asc + filters: + exclude: + - "^docs:" + - "^test:" + - "^chore:" + +release: + gitea: + owner: Owlibou + name: gnoma diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..2f76f1e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,53 @@ +# Contributing to gnoma + +## Setup + +```sh +git clone https://somegit.dev/Owlibou/gnoma && cd gnoma +make build # requires Go 1.26+ +make test +make lint # requires golangci-lint +``` + +## Development workflow + +1. Create a branch from `main` +2. Write tests first (TDD) — table-driven, `t.TempDir()` for filesystem tests +3. `make check` (fmt + vet + lint + test) must pass +4. 
Commit with conventional messages: `feat:`, `fix:`, `refactor:`, `test:`, `docs:` + +## Code style + +- Go 1.26 idioms (`new(expr)`, `errors.AsType[E]`) +- Structured logging with `log/slog` +- `json.RawMessage` for tool schemas (zero-cost passthrough) +- Functional options for complex configuration +- Short, lowercase package names — no underscores + +## Testing + +- Unit tests: `make test` +- Integration tests (require API keys): `make test-integration` +- Coverage: `make cover` +- Benchmarks: `go test -bench=. ./internal/router/` + +Integration tests use `//go:build integration` and are skipped by default. + +## Architecture + +Read `docs/essentials/INDEX.md` before making architectural changes. Key packages: + +| Package | Purpose | +|---------|---------| +| `internal/engine` | Agentic loop (stream → tool → re-query) | +| `internal/router` | Multi-armed bandit arm selection | +| `internal/provider` | LLM provider adapters | +| `internal/tool` | Tool interface + registry | +| `internal/mcp` | MCP client (JSON-RPC over stdio) | +| `internal/plugin` | Plugin manifest, loader, manager | +| `internal/elf` | Sub-agent (elf) system | +| `internal/tui` | Bubble Tea terminal UI | + +## Issues + +Use the issue templates when filing bugs or requesting features. Include reproduction steps, expected behavior, and gnoma version (`gnoma --version`). diff --git a/README.md b/README.md index 0b0f85e..1ce643a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,28 @@ # gnoma -Provider-agnostic agentic coding assistant in Go. -Named after the northern pygmy-owl (*Glaucidium gnoma*). Agents are called **elfs** (elf owl). +**A provider-agnostic agentic coding assistant built in Go.** gnoma routes tasks to the best available LLM — cloud or local — through a multi-armed bandit router, while tools, hooks, skills, MCP servers, and plugins keep it extensible. Named after the northern pygmy-owl (*Glaucidium gnoma*); agents are called **elfs** (elf owl). 
+ + + + +## Quickstart + +```sh +# Install +go install somegit.dev/Owlibou/gnoma/cmd/gnoma@latest + +# Or build from source +git clone https://somegit.dev/Owlibou/gnoma && cd gnoma +make build # binary at ./bin/gnoma + +# Set at least one provider key +export ANTHROPIC_API_KEY=sk-ant-... # or OPENAI_API_KEY, MISTRAL_API_KEY, GEMINI_API_KEY + +# Run +gnoma # interactive TUI +echo "list files" | gnoma # pipe mode +gnoma --provider ollama # use a local model +``` ## Build @@ -104,6 +125,61 @@ llamacpp = "http://localhost:9090/v1" --- +## Extensibility (M8) + +gnoma supports hooks, skills, MCP servers, and plugins. + +### MCP Servers + +Connect any [MCP](https://modelcontextprotocol.io)-compatible tool server: + +```toml +[[mcp_servers]] +name = "git" +command = "mcp-server-git" +args = ["--repo", "."] +timeout = "30s" + +# Replace a built-in tool with an MCP tool +[mcp_servers.replace_default] +exec = "bash" # MCP tool "exec" replaces gnoma's built-in "bash" +``` + +MCP tools appear as `mcp__{server}__{tool}` (e.g., `mcp__git__status`), or under the built-in name when using `replace_default`. + +### Skills + +Drop markdown files into `.gnoma/skills/` or `~/.config/gnoma/skills/`: + +``` +/skillname # invoke a skill +/skills # list available skills +``` + +### Hooks + +Run shell commands on tool events: + +```toml +[[hooks]] +name = "block-rm-rf" +event = "pre_tool_use" +type = "command" +exec = "bash-safety-check.sh" +tool_pattern = "bash*" +``` + +### Plugins + +Bundle skills, hooks, and MCP configs into installable plugins: + +```sh +gnoma plugin install ./my-plugin # install from directory +gnoma plugin list # list installed plugins +``` + +--- + ## Session Persistence Conversations are auto-saved to `.gnoma/sessions/` after each completed turn. On a crash you lose at most the current in-flight turn; all previously completed turns are safe. 
diff --git a/cmd/gnoma/main.go b/cmd/gnoma/main.go index 96e315b..42e0f10 100644 --- a/cmd/gnoma/main.go +++ b/cmd/gnoma/main.go @@ -46,6 +46,13 @@ import ( "somegit.dev/Owlibou/gnoma/internal/tool/sysinfo" ) +// Set by goreleaser ldflags. +var ( + buildVersion = "dev" + buildCommit = "none" + buildDate = "unknown" +) + func main() { var resumeFlag string var ( @@ -64,7 +71,7 @@ func main() { flag.Parse() if *version { - fmt.Println("gnoma v0.1.0-dev") + fmt.Printf("gnoma %s (%s, %s)\n", buildVersion, buildCommit, buildDate) os.Exit(0) } @@ -123,7 +130,17 @@ func main() { } // Resolve API key: CLI flag → config → env vars + knownProviders := map[string]bool{ + "mistral": true, "anthropic": true, "openai": true, + "google": true, "ollama": true, "llamacpp": true, + } localProviders := map[string]bool{"ollama": true, "llamacpp": true} + + if !knownProviders[*providerName] { + fmt.Fprintf(os.Stderr, "error: unknown provider %q\n available: mistral, anthropic, openai, google, ollama, llamacpp\n usage: gnoma --provider <name>\n", *providerName) + os.Exit(1) + } + key := *apiKey if key == "" { if cfgKey, ok := cfg.Provider.APIKeys[*providerName]; ok && cfgKey != "" { @@ -134,8 +151,14 @@ func main() { key = resolveAPIKey(*providerName) } if key == "" && !localProviders[*providerName] { - fmt.Fprintf(os.Stderr, "error: no API key for provider %q\nSet %s environment variable or use --api-key\n", - *providerName, envKeyFor(*providerName)) + envVar := envKeyFor(*providerName) + fmt.Fprintf(os.Stderr, "error: no API key for provider %q\n\n", *providerName) + fmt.Fprintf(os.Stderr, " Option 1: export %s=<your-key>\n", envVar) + fmt.Fprintf(os.Stderr, " Option 2: gnoma --api-key <your-key>\n") + fmt.Fprintf(os.Stderr, " Option 3: add to .gnoma/config.toml:\n") + fmt.Fprintf(os.Stderr, " [provider.api_keys]\n") + fmt.Fprintf(os.Stderr, " %s = \"<your-key>\"\n\n", *providerName) + fmt.Fprintf(os.Stderr, "For local models (no API key needed): gnoma --provider ollama\n") os.Exit(1) } @@ -735,7 +758,7 @@ func 
createProvider(name, apiKey, model, baseURL string) (provider.Provider, err case "llamacpp": return openaicompat.NewLlamaCpp(cfg) default: - return nil, fmt.Errorf("unknown provider %q (supports: mistral, anthropic, openai, google, ollama, llamacpp)", name) + return nil, fmt.Errorf("unknown provider %q\n available: mistral, anthropic, openai, google, ollama, llamacpp\n usage: gnoma --provider <name>", name) } } diff --git a/docs/benchmarks/README.md b/docs/benchmarks/README.md new file mode 100644 index 0000000..74d3c01 --- /dev/null +++ b/docs/benchmarks/README.md @@ -0,0 +1,43 @@ +# Router Benchmarks + +Tracking how gnoma's multi-armed bandit router (M4 heuristic, M9 bandit) performs across providers, task types, and cost envelopes. + +## Methodology + +Each benchmark run: + +1. Registers a set of arms (provider/model pairs) with known cost profiles +2. Generates synthetic tasks across all 10 task types with varying complexity +3. Runs N routing decisions and records: arm selected, latency, quality score, cost +4. Reports convergence metrics after simulated quality feedback + +## Metrics + +| Metric | Description | +|--------|-------------| +| **Selection accuracy** | % of tasks routed to the optimal arm (vs. oracle with perfect knowledge) | +| **Cost efficiency** | Total cost relative to always-cheapest and always-best-quality baselines | +| **Convergence speed** | Observations needed before bandit matches heuristic on quality (M9) | +| **Pool utilization** | % of rate limit budget consumed before exhaustion | +| **Latency overhead** | Time spent in Select() excluding provider round-trip | + +## Running + +```sh +# Go benchmarks (in-process, no real API calls) +go test -bench=. -benchmem ./internal/router/ + +# Synthetic routing simulation (when available) +go run ./cmd/gnoma-bench/ --arms=5 --tasks=1000 --seed=42 +``` + +## Results + +No benchmark results yet. This scaffold will be populated as M9 (Router Advanced) lands. 
+ +### Planned comparisons + +- Heuristic-only (M4) vs. bandit (M9) after 50, 200, 1000 observations +- 2-arm (local + cloud) vs. 5-arm (mixed providers) scenarios +- Cost-capped routing: $5/day budget with mixed task load +- Quality degradation under rate limit pressure (pool scarcity) diff --git a/internal/config/config.go b/internal/config/config.go index 4ceaaa5..a44e695 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -25,14 +25,14 @@ type Config struct { // args = ["--repo", "."] // env = { GIT_DIR = ".git" } // timeout = "30s" -// replace_default = ["bash"] +// replace_default = { exec = "bash" } # MCP tool "exec" replaces built-in "bash" type MCPServerConfig struct { Name string `toml:"name"` Command string `toml:"command"` Args []string `toml:"args"` Env map[string]string `toml:"env"` Timeout string `toml:"timeout"` - ReplaceDefault []string `toml:"replace_default"` + ReplaceDefault map[string]string `toml:"replace_default"` // MCP tool name → built-in name } // PluginsSection controls plugin loading. diff --git a/internal/mcp/client_test.go b/internal/mcp/client_test.go index 823acb4..aa0aba9 100644 --- a/internal/mcp/client_test.go +++ b/internal/mcp/client_test.go @@ -28,14 +28,14 @@ func writeMCPServer(t *testing.T, tools []MCPTool, callResult string) string { os.WriteFile(filepath.Join(dir, "tools.json"), toolsJSON, 0o644) os.WriteFile(filepath.Join(dir, "call.json"), []byte(callResult), 0o644) - // The script uses jq-free JSON construction: reads response payload from - // file and wraps it in a JSON-RPC envelope using python (widely available). + // The script uses pure bash for JSON parsing — no python3 or jq dependency. + // We extract "method" and "id" with grep since the JSON-RPC format is predictable. 
script := filepath.Join(dir, "mcp-server.sh") content := `#!/bin/bash DIR="` + dir + `" while IFS= read -r line; do - method=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('method',''))" 2>/dev/null) - id=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',0))" 2>/dev/null) + method=$(echo "$line" | grep -o '"method":"[^"]*"' | head -1 | sed 's/"method":"//;s/"//') + id=$(echo "$line" | grep -o '"id":[0-9]*' | head -1 | cut -d: -f2) case "$method" in initialize) diff --git a/internal/mcp/config.go b/internal/mcp/config.go index f55303a..7b7735f 100644 --- a/internal/mcp/config.go +++ b/internal/mcp/config.go @@ -16,7 +16,7 @@ type ServerConfig struct { Args []string Env map[string]string Timeout time.Duration - ReplaceDefault []string + ReplaceDefault map[string]string // MCP tool name → built-in name to replace } // ParseServerConfigs validates and converts raw config entries. diff --git a/internal/mcp/config_test.go b/internal/mcp/config_test.go index 7e75c64..d87fc19 100644 --- a/internal/mcp/config_test.go +++ b/internal/mcp/config_test.go @@ -15,7 +15,7 @@ func TestParseServerConfigs_Valid(t *testing.T) { Args: []string{"--repo", "."}, Env: map[string]string{"GIT_DIR": ".git"}, Timeout: "10s", - ReplaceDefault: []string{"bash"}, + ReplaceDefault: map[string]string{"exec": "bash"}, }, { Name: "docker", @@ -37,8 +37,8 @@ func TestParseServerConfigs_Valid(t *testing.T) { if got[0].Timeout != 10*time.Second { t.Errorf("config[0].Timeout = %v, want %v", got[0].Timeout, 10*time.Second) } - if len(got[0].ReplaceDefault) != 1 || got[0].ReplaceDefault[0] != "bash" { - t.Errorf("config[0].ReplaceDefault = %v, want [bash]", got[0].ReplaceDefault) + if got[0].ReplaceDefault["exec"] != "bash" { + t.Errorf("config[0].ReplaceDefault = %v, want map[exec:bash]", got[0].ReplaceDefault) } // Second config should get default timeout. 
diff --git a/internal/mcp/manager.go b/internal/mcp/manager.go index 1409b4e..18b746a 100644 --- a/internal/mcp/manager.go +++ b/internal/mcp/manager.go @@ -84,23 +84,13 @@ func (m *Manager) startServer(ctx context.Context, srv ServerConfig) (*Client, e } func (m *Manager) registerTools(srv ServerConfig, tools []MCPTool, client *Client, registry *tool.Registry) { - replaceSet := make(map[string]bool, len(srv.ReplaceDefault)) - for _, name := range srv.ReplaceDefault { - replaceSet[name] = true - } - for _, mt := range tools { adapter := NewAdapter(srv.Name, mt, client) - // Check if any replace_default entry matches this MCP tool. - // Match by checking if the MCP tool name appears in a replace target, - // or assign replacements in order. - for _, replaceName := range srv.ReplaceDefault { - if replaceSet[replaceName] { - adapter.SetOverrideName(replaceName) - delete(replaceSet, replaceName) - break - } + // Explicit mapping: if this MCP tool name has a replace_default entry, + // register it under the built-in's name instead of mcp__{server}__{tool}. 
+ if builtinName, ok := srv.ReplaceDefault[mt.Name]; ok { + adapter.SetOverrideName(builtinName) } registry.Register(adapter) diff --git a/internal/mcp/manager_test.go b/internal/mcp/manager_test.go index 4fa69da..f380f22 100644 --- a/internal/mcp/manager_test.go +++ b/internal/mcp/manager_test.go @@ -71,7 +71,7 @@ func TestManager_StartAll_ReplaceDefault(t *testing.T) { Command: "bash", Args: []string{script}, Timeout: 5 * time.Second, - ReplaceDefault: []string{"bash"}, + ReplaceDefault: map[string]string{"exec": "bash"}, }, }, reg) if err != nil { @@ -170,7 +170,7 @@ func TestManager_StartAll_ReplaceDefault_PicksMatchingTool(t *testing.T) { Command: "bash", Args: []string{script}, Timeout: 5 * time.Second, - ReplaceDefault: []string{"fs.read", "fs.write"}, + ReplaceDefault: map[string]string{"read": "fs.read", "write": "fs.write"}, }, }, reg) if err != nil { diff --git a/internal/mcp/tool_test.go b/internal/mcp/tool_test.go index 3702a17..0de2e8b 100644 --- a/internal/mcp/tool_test.go +++ b/internal/mcp/tool_test.go @@ -165,8 +165,8 @@ func TestAdapter_Execute_RPCError(t *testing.T) { content := `#!/bin/bash DIR="` + dir + `" while IFS= read -r line; do - method=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('method',''))" 2>/dev/null) - id=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',0))" 2>/dev/null) + method=$(echo "$line" | grep -o '"method":"[^"]*"' | head -1 | sed 's/"method":"//;s/"//') + id=$(echo "$line" | grep -o '"id":[0-9]*' | head -1 | cut -d: -f2) case "$method" in initialize) diff --git a/internal/router/bench_test.go b/internal/router/bench_test.go new file mode 100644 index 0000000..2772084 --- /dev/null +++ b/internal/router/bench_test.go @@ -0,0 +1,164 @@ +package router + +import ( + "testing" + + "somegit.dev/Owlibou/gnoma/internal/provider" +) + +// benchArms creates a set of arms with diverse cost/capability profiles. 
+func benchArms() []*Arm {
+	// Two Anthropic arms that differ in the Thinking capability and in price.
+	sonnet := &Arm{
+		ID:              "anthropic/claude-sonnet",
+		ModelName:       "claude-sonnet",
+		Capabilities:    provider.Capabilities{ToolUse: true, ContextWindow: 200000, Thinking: false},
+		CostPer1kInput:  0.003,
+		CostPer1kOutput: 0.015,
+	}
+	opus := &Arm{
+		ID:              "anthropic/claude-opus",
+		ModelName:       "claude-opus",
+		Capabilities:    provider.Capabilities{ToolUse: true, ContextWindow: 200000, Thinking: true},
+		CostPer1kInput:  0.015,
+		CostPer1kOutput: 0.075,
+	}
+	gpt4o := &Arm{
+		ID:              "openai/gpt-4o",
+		ModelName:       "gpt-4o",
+		Capabilities:    provider.Capabilities{ToolUse: true, ContextWindow: 128000},
+		CostPer1kInput:  0.005,
+		CostPer1kOutput: 0.015,
+	}
+	// The only arm flagged IsLocal; both of its cost fields are zero.
+	qwen := &Arm{
+		ID:              "ollama/qwen3:8b",
+		ModelName:       "qwen3:8b",
+		IsLocal:         true,
+		Capabilities:    provider.Capabilities{ToolUse: true, ContextWindow: 32000},
+		CostPer1kInput:  0,
+		CostPer1kOutput: 0,
+	}
+	mistralLarge := &Arm{
+		ID:              "mistral/mistral-large",
+		ModelName:       "mistral-large",
+		Capabilities:    provider.Capabilities{ToolUse: true, ContextWindow: 128000},
+		CostPer1kInput:  0.002,
+		CostPer1kOutput: 0.006,
+	}
+
+	// Slice order is unchanged from the original literal.
+	return []*Arm{sonnet, opus, gpt4o, qwen, mistralLarge}
+}
+
+// benchTasks returns one task per TaskType at varying complexity.
+func benchTasks() []Task {
+	return []Task{
+		{Type: TaskBoilerplate, Priority: PriorityLow, EstimatedTokens: 500, RequiresTools: true, ComplexityScore: 0.1},
+		{Type: TaskGeneration, Priority: PriorityNormal, EstimatedTokens: 2000, RequiresTools: true, ComplexityScore: 0.5},
+		{Type: TaskRefactor, Priority: PriorityNormal, EstimatedTokens: 3000, RequiresTools: true, ComplexityScore: 0.6},
+		{Type: TaskReview, Priority: PriorityHigh, EstimatedTokens: 4000, RequiresTools: false, ComplexityScore: 0.5},
+		{Type: TaskUnitTest, Priority: PriorityNormal, EstimatedTokens: 1500, RequiresTools: true, ComplexityScore: 0.4},
+		{Type: TaskPlanning, Priority: PriorityHigh, EstimatedTokens: 5000, RequiresTools: false, ComplexityScore: 0.8},
+		{Type: TaskOrchestration, Priority: PriorityCritical, EstimatedTokens: 8000, RequiresTools: true, ComplexityScore: 0.9},
+		{Type: TaskSecurityReview, Priority: PriorityCritical, EstimatedTokens: 6000, RequiresTools: true, ComplexityScore: 0.85},
+		{Type: TaskDebug, Priority: PriorityNormal, EstimatedTokens: 3000, RequiresTools: true, ComplexityScore: 0.6},
+		{Type: TaskExplain, Priority: PriorityLow, EstimatedTokens: 1000, RequiresTools: false, ComplexityScore: 0.2},
+	}
+}
+
+// BenchmarkSelectBest measures selectBest for every task in benchTasks against
+// the arms from benchArms, using a freshly constructed quality tracker. One
+// benchmark iteration covers the whole task list, so the reported time is per
+// pass over len(tasks) calls, not per single call.
+func BenchmarkSelectBest(b *testing.B) {
+	arms := benchArms()
+	tasks := benchTasks()
+	qt := NewQualityTracker()
+
+	// No explicit b.ResetTimer: b.Loop resets the timer on its first call, so
+	// the setup above is already excluded from the measurement.
+	for b.Loop() {
+		for _, task := range tasks {
+			selectBest(qt, arms, task)
+		}
+	}
+}
+
+// BenchmarkFilterFeasible measures filterFeasible for every task in benchTasks
+// against the arms from benchArms. One iteration covers the whole task list.
+func BenchmarkFilterFeasible(b *testing.B) {
+	arms := benchArms()
+	tasks := benchTasks()
+
+	for b.Loop() {
+		for _, task := range tasks {
+			filterFeasible(arms, task)
+		}
+	}
+}
+
+// BenchmarkRouterSelect measures Select on a router built from a zero Config
+// with every benchArms arm registered. When the returned decision carries no
+// error, the task's EstimatedTokens are committed as well, so the measured
+// work is Select plus Commit for each task in benchTasks.
+//
+// NOTE(review): nothing in this benchmark undoes what Commit records. If
+// commits accumulate against a finite budget, later iterations may exercise
+// the error path instead of a successful selection; confirm against the
+// router's Commit implementation.
+func BenchmarkRouterSelect(b *testing.B) {
+	r := New(Config{})
+	for _, arm := range benchArms() {
+		r.RegisterArm(arm)
+	}
+	tasks := benchTasks()
+
+	for b.Loop() {
+		for _, task := range tasks {
+			d := r.Select(task)
+			if d.Error == nil {
+				d.Commit(task.EstimatedTokens)
+			}
+		}
+	}
+}
+
+// BenchmarkScoreArm measures scoreArm for every arm from benchArms on one
+// fixed TaskGeneration task, using a freshly constructed quality tracker.
+func BenchmarkScoreArm(b *testing.B) {
+	arms := benchArms()
+	qt := NewQualityTracker()
+	task := Task{Type: TaskGeneration, Priority: PriorityNormal, EstimatedTokens: 2000, RequiresTools: true, ComplexityScore: 0.5}
+
+	for b.Loop() {
+		for _, arm := range arms {
+			scoreArm(qt, arm, task)
+		}
+	}
+}
+
+// BenchmarkClassifyTask measures ClassifyTask over a fixed list of ten prompts.
+// One iteration classifies all of them.
+func BenchmarkClassifyTask(b *testing.B) {
+	prompts := []string{
+		"fix the null pointer in handleRequest",
+		"explain how the router selects arms",
+		"refactor the authentication middleware to use the new session store",
+		"add a new endpoint for user profile updates",
+		"review the security of the payment processing flow for OWASP vulnerabilities",
+		"write unit tests for the pool tracker",
+		"plan the architecture for the plugin system",
+		"scaffold a new provider adapter for Cohere",
+		"orchestrate a multi-step migration: backup, schema change, data backfill, verify",
+		"debug why the TUI freezes when streaming large responses",
+	}
+
+	for b.Loop() {
+		for _, p := range prompts {
+			ClassifyTask(p)
+		}
+	}
+}
+
+// BenchmarkRouterSelectWithQuality is BenchmarkRouterSelect with a pre-seeded
+// quality tracker: before timing starts, every arm/task-type pair receives 20
+// Record calls with true and 3 with false.
+func BenchmarkRouterSelectWithQuality(b *testing.B) {
+	r := New(Config{})
+	for _, arm := range benchArms() {
+		r.RegisterArm(arm)
+	}
+	tasks := benchTasks()
+
+	// Seed quality tracker with 20 observations per arm/task combo
+	for _, arm := range benchArms() {
+		for _, task := range tasks {
+			for range 20 {
+				r.quality.Record(arm.ID, task.Type, true)
+			}
+			// Mix in some failures for realism
+			for range 3 {
+				r.quality.Record(arm.ID, task.Type, false)
+			}
+		}
+	}
+
+	// Seeding above is excluded from the measurement: b.Loop resets the timer
+	// on its first call.
+	for b.Loop() {
+		for _, task := range tasks {
+			d := r.Select(task)
+			if d.Error == nil {
+				d.Commit(task.EstimatedTokens)
+			}
+		}
+	}
+}