fix(m8): replace_default map, error UX, benchmarks, and launch prep
- Fix replace_default positional bug: []string → map[string]string for explicit MCP tool → built-in name mapping - Improve error messages for missing API keys (3 actionable options) and unknown providers (early validation with available list) - Remove python3 dependency from MCP tests (pure bash grep/sed parsing) - Add router benchmark scaffold (6 benchmarks in bench_test.go + docs) - Add .goreleaser.yml for cross-platform binary releases with ldflags - Add launch-ready README with quickstart, extensibility docs, GIF placeholder - Add CONTRIBUTING.md and Gitea issue templates (bug report, feature request)
This commit is contained in:
58
.gitea/issue_template/bug_report.yaml
Normal file
58
.gitea/issue_template/bug_report.yaml
Normal file
@@ -0,0 +1,58 @@
|
||||
name: Bug Report
|
||||
about: Report something that isn't working correctly
|
||||
labels:
|
||||
- bug
|
||||
body:
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: Description
|
||||
description: What happened? What did you expect?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: reproduction
|
||||
attributes:
|
||||
label: Steps to reproduce
|
||||
description: Minimal steps to trigger the issue
|
||||
placeholder: |
|
||||
1. Run `gnoma --provider anthropic`
|
||||
2. Type "..."
|
||||
3. See error
|
||||
validations:
|
||||
required: true
|
||||
- type: input
|
||||
id: version
|
||||
attributes:
|
||||
label: gnoma version
|
||||
description: Output of `gnoma --version`
|
||||
placeholder: "gnoma 0.1.0 (abc1234, 2026-04-12)"
|
||||
validations:
|
||||
required: true
|
||||
- type: input
|
||||
id: os
|
||||
attributes:
|
||||
label: OS / Architecture
|
||||
placeholder: "Linux x86_64 / macOS arm64 / Windows amd64"
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: provider
|
||||
attributes:
|
||||
label: Provider
|
||||
options:
|
||||
- mistral
|
||||
- anthropic
|
||||
- openai
|
||||
- google
|
||||
- ollama
|
||||
- llamacpp
|
||||
- N/A
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant logs
|
||||
description: Run with `--verbose` for debug output
|
||||
render: shell
|
||||
42
.gitea/issue_template/feature_request.yaml
Normal file
42
.gitea/issue_template/feature_request.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
name: Feature Request
|
||||
about: Suggest an improvement or new capability
|
||||
labels:
|
||||
- enhancement
|
||||
body:
|
||||
- type: textarea
|
||||
id: problem
|
||||
attributes:
|
||||
label: Problem
|
||||
description: What are you trying to do that gnoma doesn't support well?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: solution
|
||||
attributes:
|
||||
label: Proposed solution
|
||||
description: How would you like this to work?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: alternatives
|
||||
attributes:
|
||||
label: Alternatives considered
|
||||
description: Other approaches you've thought about
|
||||
validations:
|
||||
required: false
|
||||
- type: dropdown
|
||||
id: area
|
||||
attributes:
|
||||
label: Area
|
||||
options:
|
||||
- providers
|
||||
- tools
|
||||
- router
|
||||
- TUI
|
||||
- MCP / plugins
|
||||
- elfs (sub-agents)
|
||||
- security
|
||||
- config
|
||||
- other
|
||||
validations:
|
||||
required: false
|
||||
47
.goreleaser.yml
Normal file
47
.goreleaser.yml
Normal file
@@ -0,0 +1,47 @@
|
||||
version: 2
|
||||
|
||||
before:
|
||||
hooks:
|
||||
- go mod tidy
|
||||
|
||||
builds:
|
||||
- main: ./cmd/gnoma
|
||||
binary: gnoma
|
||||
env:
|
||||
- CGO_ENABLED=0
|
||||
goos:
|
||||
- linux
|
||||
- darwin
|
||||
- windows
|
||||
goarch:
|
||||
- amd64
|
||||
- arm64
|
||||
ldflags:
|
||||
- -s -w
|
||||
- -X main.buildVersion={{.Version}}
|
||||
- -X main.buildCommit={{.ShortCommit}}
|
||||
- -X main.buildDate={{.Date}}
|
||||
|
||||
archives:
|
||||
- formats: [tar.gz]
|
||||
format_overrides:
|
||||
- goos: windows
|
||||
formats: [zip]
|
||||
name_template: >-
|
||||
{{ .ProjectName }}_{{ .Version }}_{{ .Os }}_{{ .Arch }}
|
||||
|
||||
checksum:
|
||||
name_template: checksums.txt
|
||||
|
||||
changelog:
|
||||
sort: asc
|
||||
filters:
|
||||
exclude:
|
||||
- "^docs:"
|
||||
- "^test:"
|
||||
- "^chore:"
|
||||
|
||||
release:
|
||||
gitea:
|
||||
owner: Owlibou
|
||||
name: gnoma
|
||||
53
CONTRIBUTING.md
Normal file
53
CONTRIBUTING.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# Contributing to gnoma
|
||||
|
||||
## Setup
|
||||
|
||||
```sh
|
||||
git clone https://somegit.dev/Owlibou/gnoma && cd gnoma
|
||||
make build # requires Go 1.26+
|
||||
make test
|
||||
make lint # requires golangci-lint
|
||||
```
|
||||
|
||||
## Development workflow
|
||||
|
||||
1. Create a branch from `main`
|
||||
2. Write tests first (TDD) — table-driven, `t.TempDir()` for filesystem tests
|
||||
3. `make check` (fmt + vet + lint + test) must pass
|
||||
4. Commit with conventional messages: `feat:`, `fix:`, `refactor:`, `test:`, `docs:`
|
||||
|
||||
## Code style
|
||||
|
||||
- Go 1.26 idioms (`new(expr)`, `errors.AsType[E]`)
|
||||
- Structured logging with `log/slog`
|
||||
- `json.RawMessage` for tool schemas (zero-cost passthrough)
|
||||
- Functional options for complex configuration
|
||||
- Short, lowercase package names — no underscores
|
||||
|
||||
## Testing
|
||||
|
||||
- Unit tests: `make test`
|
||||
- Integration tests (require API keys): `make test-integration`
|
||||
- Coverage: `make cover`
|
||||
- Benchmarks: `go test -bench=. ./internal/router/`
|
||||
|
||||
Integration tests use `//go:build integration` and are skipped by default.
|
||||
|
||||
## Architecture
|
||||
|
||||
Read `docs/essentials/INDEX.md` before making architectural changes. Key packages:
|
||||
|
||||
| Package | Purpose |
|
||||
|---------|---------|
|
||||
| `internal/engine` | Agentic loop (stream → tool → re-query) |
|
||||
| `internal/router` | Multi-armed bandit arm selection |
|
||||
| `internal/provider` | LLM provider adapters |
|
||||
| `internal/tool` | Tool interface + registry |
|
||||
| `internal/mcp` | MCP client (JSON-RPC over stdio) |
|
||||
| `internal/plugin` | Plugin manifest, loader, manager |
|
||||
| `internal/elf` | Sub-agent (elf) system |
|
||||
| `internal/tui` | Bubble Tea terminal UI |
|
||||
|
||||
## Issues
|
||||
|
||||
Use the issue templates when filing bugs or requesting features. Include reproduction steps, expected behavior, and gnoma version (`gnoma --version`).
|
||||
80
README.md
80
README.md
@@ -1,7 +1,28 @@
|
||||
# gnoma
|
||||
|
||||
Provider-agnostic agentic coding assistant in Go.
|
||||
Named after the northern pygmy-owl (*Glaucidium gnoma*). Agents are called **elfs** (elf owl).
|
||||
**A provider-agnostic agentic coding assistant built in Go.** gnoma routes tasks to the best available LLM — cloud or local — through a multi-armed bandit router, while tools, hooks, skills, MCP servers, and plugins keep it extensible. Named after the northern pygmy-owl (*Glaucidium gnoma*); agents are called **elfs** (elf owl).
|
||||
|
||||
<!-- TODO: replace with actual demo recording -->
|
||||
<!--  -->
|
||||
|
||||
## Quickstart
|
||||
|
||||
```sh
|
||||
# Install
|
||||
go install somegit.dev/Owlibou/gnoma/cmd/gnoma@latest
|
||||
|
||||
# Or build from source
|
||||
git clone https://somegit.dev/Owlibou/gnoma && cd gnoma
|
||||
make build # binary at ./bin/gnoma
|
||||
|
||||
# Set at least one provider key
|
||||
export ANTHROPIC_API_KEY=sk-ant-... # or OPENAI_API_KEY, MISTRAL_API_KEY, GEMINI_API_KEY
|
||||
|
||||
# Run
|
||||
gnoma # interactive TUI
|
||||
echo "list files" | gnoma # pipe mode
|
||||
gnoma --provider ollama # use a local model
|
||||
```
|
||||
|
||||
## Build
|
||||
|
||||
@@ -104,6 +125,61 @@ llamacpp = "http://localhost:9090/v1"
|
||||
|
||||
---
|
||||
|
||||
## Extensibility (M8)
|
||||
|
||||
gnoma supports hooks, skills, MCP servers, and plugins.
|
||||
|
||||
### MCP Servers
|
||||
|
||||
Connect any [MCP](https://modelcontextprotocol.io)-compatible tool server:
|
||||
|
||||
```toml
|
||||
[[mcp_servers]]
|
||||
name = "git"
|
||||
command = "mcp-server-git"
|
||||
args = ["--repo", "."]
|
||||
timeout = "30s"
|
||||
|
||||
# Replace a built-in tool with an MCP tool
|
||||
[mcp_servers.replace_default]
|
||||
exec = "bash" # MCP tool "exec" replaces gnoma's built-in "bash"
|
||||
```
|
||||
|
||||
MCP tools appear as `mcp__{server}__{tool}` (e.g., `mcp__git__status`), or under the built-in name when using `replace_default`.
|
||||
|
||||
### Skills
|
||||
|
||||
Drop markdown files into `.gnoma/skills/` or `~/.config/gnoma/skills/`:
|
||||
|
||||
```
|
||||
/skillname # invoke a skill
|
||||
/skills # list available skills
|
||||
```
|
||||
|
||||
### Hooks
|
||||
|
||||
Run shell commands on tool events:
|
||||
|
||||
```toml
|
||||
[[hooks]]
|
||||
name = "block-rm-rf"
|
||||
event = "pre_tool_use"
|
||||
type = "command"
|
||||
exec = "bash-safety-check.sh"
|
||||
tool_pattern = "bash*"
|
||||
```
|
||||
|
||||
### Plugins
|
||||
|
||||
Bundle skills, hooks, and MCP configs into installable plugins:
|
||||
|
||||
```sh
|
||||
gnoma plugin install ./my-plugin # install from directory
|
||||
gnoma plugin list # list installed plugins
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Session Persistence
|
||||
|
||||
Conversations are auto-saved to `.gnoma/sessions/` after each completed turn. On a crash you lose at most the current in-flight turn; all previously completed turns are safe.
|
||||
|
||||
@@ -46,6 +46,13 @@ import (
|
||||
"somegit.dev/Owlibou/gnoma/internal/tool/sysinfo"
|
||||
)
|
||||
|
||||
// Set by goreleaser ldflags.
|
||||
var (
|
||||
buildVersion = "dev"
|
||||
buildCommit = "none"
|
||||
buildDate = "unknown"
|
||||
)
|
||||
|
||||
func main() {
|
||||
var resumeFlag string
|
||||
var (
|
||||
@@ -64,7 +71,7 @@ func main() {
|
||||
flag.Parse()
|
||||
|
||||
if *version {
|
||||
fmt.Println("gnoma v0.1.0-dev")
|
||||
fmt.Printf("gnoma %s (%s, %s)\n", buildVersion, buildCommit, buildDate)
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
@@ -123,7 +130,17 @@ func main() {
|
||||
}
|
||||
|
||||
// Resolve API key: CLI flag → config → env vars
|
||||
knownProviders := map[string]bool{
|
||||
"mistral": true, "anthropic": true, "openai": true,
|
||||
"google": true, "ollama": true, "llamacpp": true,
|
||||
}
|
||||
localProviders := map[string]bool{"ollama": true, "llamacpp": true}
|
||||
|
||||
if !knownProviders[*providerName] {
|
||||
fmt.Fprintf(os.Stderr, "error: unknown provider %q\n available: mistral, anthropic, openai, google, ollama, llamacpp\n usage: gnoma --provider <name>\n", *providerName)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
key := *apiKey
|
||||
if key == "" {
|
||||
if cfgKey, ok := cfg.Provider.APIKeys[*providerName]; ok && cfgKey != "" {
|
||||
@@ -134,8 +151,14 @@ func main() {
|
||||
key = resolveAPIKey(*providerName)
|
||||
}
|
||||
if key == "" && !localProviders[*providerName] {
|
||||
fmt.Fprintf(os.Stderr, "error: no API key for provider %q\nSet %s environment variable or use --api-key\n",
|
||||
*providerName, envKeyFor(*providerName))
|
||||
envVar := envKeyFor(*providerName)
|
||||
fmt.Fprintf(os.Stderr, "error: no API key for provider %q\n\n", *providerName)
|
||||
fmt.Fprintf(os.Stderr, " Option 1: export %s=<your-key>\n", envVar)
|
||||
fmt.Fprintf(os.Stderr, " Option 2: gnoma --api-key <your-key>\n")
|
||||
fmt.Fprintf(os.Stderr, " Option 3: add to .gnoma/config.toml:\n")
|
||||
fmt.Fprintf(os.Stderr, " [provider.api_keys]\n")
|
||||
fmt.Fprintf(os.Stderr, " %s = \"<your-key>\"\n\n", *providerName)
|
||||
fmt.Fprintf(os.Stderr, "For local models (no API key needed): gnoma --provider ollama\n")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
@@ -735,7 +758,7 @@ func createProvider(name, apiKey, model, baseURL string) (provider.Provider, err
|
||||
case "llamacpp":
|
||||
return openaicompat.NewLlamaCpp(cfg)
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown provider %q (supports: mistral, anthropic, openai, google, ollama, llamacpp)", name)
|
||||
return nil, fmt.Errorf("unknown provider %q\n available: mistral, anthropic, openai, google, ollama, llamacpp\n usage: gnoma --provider <name>", name)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
43
docs/benchmarks/README.md
Normal file
43
docs/benchmarks/README.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# Router Benchmarks
|
||||
|
||||
This document tracks how gnoma's multi-armed bandit router (M4 heuristic, M9 bandit) performs across providers, task types, and cost envelopes.
|
||||
|
||||
## Methodology
|
||||
|
||||
Each benchmark run:
|
||||
|
||||
1. Registers a set of arms (provider/model pairs) with known cost profiles
|
||||
2. Generates synthetic tasks across all 10 task types with varying complexity
|
||||
3. Runs N routing decisions and records: arm selected, latency, quality score, cost
|
||||
4. Reports convergence metrics after simulated quality feedback
|
||||
|
||||
## Metrics
|
||||
|
||||
| Metric | Description |
|
||||
|--------|-------------|
|
||||
| **Selection accuracy** | % of tasks routed to the optimal arm (vs. oracle with perfect knowledge) |
|
||||
| **Cost efficiency** | Total cost relative to always-cheapest and always-best-quality baselines |
|
||||
| **Convergence speed** | Observations needed before bandit matches heuristic on quality (M9) |
|
||||
| **Pool utilization** | % of rate limit budget consumed before exhaustion |
|
||||
| **Latency overhead** | Time spent in Select() excluding provider round-trip |
|
||||
|
||||
## Running
|
||||
|
||||
```sh
|
||||
# Go benchmarks (in-process, no real API calls)
|
||||
go test -bench=. -benchmem ./internal/router/
|
||||
|
||||
# Synthetic routing simulation (when available)
|
||||
go run ./cmd/gnoma-bench/ --arms=5 --tasks=1000 --seed=42
|
||||
```
|
||||
|
||||
## Results
|
||||
|
||||
No benchmark results yet. This scaffold will be populated as M9 (Router Advanced) lands.
|
||||
|
||||
### Planned comparisons
|
||||
|
||||
- Heuristic-only (M4) vs. bandit (M9) after 50, 200, 1000 observations
|
||||
- 2-arm (local + cloud) vs. 5-arm (mixed providers) scenarios
|
||||
- Cost-capped routing: $5/day budget with mixed task load
|
||||
- Quality degradation under rate limit pressure (pool scarcity)
|
||||
@@ -25,14 +25,14 @@ type Config struct {
|
||||
// args = ["--repo", "."]
|
||||
// env = { GIT_DIR = ".git" }
|
||||
// timeout = "30s"
|
||||
// replace_default = ["bash"]
|
||||
// replace_default = { exec = "bash" } # MCP tool "exec" replaces built-in "bash"
|
||||
type MCPServerConfig struct {
|
||||
Name string `toml:"name"`
|
||||
Command string `toml:"command"`
|
||||
Args []string `toml:"args"`
|
||||
Env map[string]string `toml:"env"`
|
||||
Timeout string `toml:"timeout"`
|
||||
ReplaceDefault []string `toml:"replace_default"`
|
||||
ReplaceDefault map[string]string `toml:"replace_default"` // MCP tool name → built-in name
|
||||
}
|
||||
|
||||
// PluginsSection controls plugin loading.
|
||||
|
||||
@@ -28,14 +28,14 @@ func writeMCPServer(t *testing.T, tools []MCPTool, callResult string) string {
|
||||
os.WriteFile(filepath.Join(dir, "tools.json"), toolsJSON, 0o644)
|
||||
os.WriteFile(filepath.Join(dir, "call.json"), []byte(callResult), 0o644)
|
||||
|
||||
// The script uses jq-free JSON construction: reads response payload from
|
||||
// file and wraps it in a JSON-RPC envelope using python (widely available).
|
||||
// The script parses JSON with standard shell text tools (grep, sed, head, cut) — no python3 or jq dependency.
|
||||
// We extract "method" and "id" with grep since the JSON-RPC format is predictable.
|
||||
script := filepath.Join(dir, "mcp-server.sh")
|
||||
content := `#!/bin/bash
|
||||
DIR="` + dir + `"
|
||||
while IFS= read -r line; do
|
||||
method=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('method',''))" 2>/dev/null)
|
||||
id=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',0))" 2>/dev/null)
|
||||
method=$(echo "$line" | grep -o '"method":"[^"]*"' | head -1 | sed 's/"method":"//;s/"//')
|
||||
id=$(echo "$line" | grep -o '"id":[0-9]*' | head -1 | cut -d: -f2)
|
||||
|
||||
case "$method" in
|
||||
initialize)
|
||||
|
||||
@@ -16,7 +16,7 @@ type ServerConfig struct {
|
||||
Args []string
|
||||
Env map[string]string
|
||||
Timeout time.Duration
|
||||
ReplaceDefault []string
|
||||
ReplaceDefault map[string]string // MCP tool name → built-in name to replace
|
||||
}
|
||||
|
||||
// ParseServerConfigs validates and converts raw config entries.
|
||||
|
||||
@@ -15,7 +15,7 @@ func TestParseServerConfigs_Valid(t *testing.T) {
|
||||
Args: []string{"--repo", "."},
|
||||
Env: map[string]string{"GIT_DIR": ".git"},
|
||||
Timeout: "10s",
|
||||
ReplaceDefault: []string{"bash"},
|
||||
ReplaceDefault: map[string]string{"exec": "bash"},
|
||||
},
|
||||
{
|
||||
Name: "docker",
|
||||
@@ -37,8 +37,8 @@ func TestParseServerConfigs_Valid(t *testing.T) {
|
||||
if got[0].Timeout != 10*time.Second {
|
||||
t.Errorf("config[0].Timeout = %v, want %v", got[0].Timeout, 10*time.Second)
|
||||
}
|
||||
if len(got[0].ReplaceDefault) != 1 || got[0].ReplaceDefault[0] != "bash" {
|
||||
t.Errorf("config[0].ReplaceDefault = %v, want [bash]", got[0].ReplaceDefault)
|
||||
if got[0].ReplaceDefault["exec"] != "bash" {
|
||||
t.Errorf("config[0].ReplaceDefault = %v, want map[exec:bash]", got[0].ReplaceDefault)
|
||||
}
|
||||
|
||||
// Second config should get default timeout.
|
||||
|
||||
@@ -84,23 +84,13 @@ func (m *Manager) startServer(ctx context.Context, srv ServerConfig) (*Client, e
|
||||
}
|
||||
|
||||
func (m *Manager) registerTools(srv ServerConfig, tools []MCPTool, client *Client, registry *tool.Registry) {
|
||||
replaceSet := make(map[string]bool, len(srv.ReplaceDefault))
|
||||
for _, name := range srv.ReplaceDefault {
|
||||
replaceSet[name] = true
|
||||
}
|
||||
|
||||
for _, mt := range tools {
|
||||
adapter := NewAdapter(srv.Name, mt, client)
|
||||
|
||||
// Check if any replace_default entry matches this MCP tool.
|
||||
// Match by checking if the MCP tool name appears in a replace target,
|
||||
// or assign replacements in order.
|
||||
for _, replaceName := range srv.ReplaceDefault {
|
||||
if replaceSet[replaceName] {
|
||||
adapter.SetOverrideName(replaceName)
|
||||
delete(replaceSet, replaceName)
|
||||
break
|
||||
}
|
||||
// Explicit mapping: if this MCP tool name has a replace_default entry,
|
||||
// register it under the built-in's name instead of mcp__{server}__{tool}.
|
||||
if builtinName, ok := srv.ReplaceDefault[mt.Name]; ok {
|
||||
adapter.SetOverrideName(builtinName)
|
||||
}
|
||||
|
||||
registry.Register(adapter)
|
||||
|
||||
@@ -71,7 +71,7 @@ func TestManager_StartAll_ReplaceDefault(t *testing.T) {
|
||||
Command: "bash",
|
||||
Args: []string{script},
|
||||
Timeout: 5 * time.Second,
|
||||
ReplaceDefault: []string{"bash"},
|
||||
ReplaceDefault: map[string]string{"exec": "bash"},
|
||||
},
|
||||
}, reg)
|
||||
if err != nil {
|
||||
@@ -170,7 +170,7 @@ func TestManager_StartAll_ReplaceDefault_PicksMatchingTool(t *testing.T) {
|
||||
Command: "bash",
|
||||
Args: []string{script},
|
||||
Timeout: 5 * time.Second,
|
||||
ReplaceDefault: []string{"fs.read", "fs.write"},
|
||||
ReplaceDefault: map[string]string{"read": "fs.read", "write": "fs.write"},
|
||||
},
|
||||
}, reg)
|
||||
if err != nil {
|
||||
|
||||
@@ -165,8 +165,8 @@ func TestAdapter_Execute_RPCError(t *testing.T) {
|
||||
content := `#!/bin/bash
|
||||
DIR="` + dir + `"
|
||||
while IFS= read -r line; do
|
||||
method=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('method',''))" 2>/dev/null)
|
||||
id=$(echo "$line" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',0))" 2>/dev/null)
|
||||
method=$(echo "$line" | grep -o '"method":"[^"]*"' | head -1 | sed 's/"method":"//;s/"//')
|
||||
id=$(echo "$line" | grep -o '"id":[0-9]*' | head -1 | cut -d: -f2)
|
||||
|
||||
case "$method" in
|
||||
initialize)
|
||||
|
||||
164
internal/router/bench_test.go
Normal file
164
internal/router/bench_test.go
Normal file
@@ -0,0 +1,164 @@
|
||||
package router
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"somegit.dev/Owlibou/gnoma/internal/provider"
|
||||
)
|
||||
|
||||
// benchArms creates a set of arms with diverse cost/capability profiles.
|
||||
func benchArms() []*Arm {
|
||||
return []*Arm{
|
||||
{
|
||||
ID: "anthropic/claude-sonnet", ModelName: "claude-sonnet",
|
||||
Capabilities: provider.Capabilities{ToolUse: true, ContextWindow: 200000, Thinking: false},
|
||||
CostPer1kInput: 0.003, CostPer1kOutput: 0.015,
|
||||
},
|
||||
{
|
||||
ID: "anthropic/claude-opus", ModelName: "claude-opus",
|
||||
Capabilities: provider.Capabilities{ToolUse: true, ContextWindow: 200000, Thinking: true},
|
||||
CostPer1kInput: 0.015, CostPer1kOutput: 0.075,
|
||||
},
|
||||
{
|
||||
ID: "openai/gpt-4o", ModelName: "gpt-4o",
|
||||
Capabilities: provider.Capabilities{ToolUse: true, ContextWindow: 128000},
|
||||
CostPer1kInput: 0.005, CostPer1kOutput: 0.015,
|
||||
},
|
||||
{
|
||||
ID: "ollama/qwen3:8b", ModelName: "qwen3:8b",
|
||||
IsLocal: true,
|
||||
Capabilities: provider.Capabilities{ToolUse: true, ContextWindow: 32000},
|
||||
CostPer1kInput: 0, CostPer1kOutput: 0,
|
||||
},
|
||||
{
|
||||
ID: "mistral/mistral-large", ModelName: "mistral-large",
|
||||
Capabilities: provider.Capabilities{ToolUse: true, ContextWindow: 128000},
|
||||
CostPer1kInput: 0.002, CostPer1kOutput: 0.006,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// benchTasks returns one task per TaskType at varying complexity.
|
||||
func benchTasks() []Task {
|
||||
return []Task{
|
||||
{Type: TaskBoilerplate, Priority: PriorityLow, EstimatedTokens: 500, RequiresTools: true, ComplexityScore: 0.1},
|
||||
{Type: TaskGeneration, Priority: PriorityNormal, EstimatedTokens: 2000, RequiresTools: true, ComplexityScore: 0.5},
|
||||
{Type: TaskRefactor, Priority: PriorityNormal, EstimatedTokens: 3000, RequiresTools: true, ComplexityScore: 0.6},
|
||||
{Type: TaskReview, Priority: PriorityHigh, EstimatedTokens: 4000, RequiresTools: false, ComplexityScore: 0.5},
|
||||
{Type: TaskUnitTest, Priority: PriorityNormal, EstimatedTokens: 1500, RequiresTools: true, ComplexityScore: 0.4},
|
||||
{Type: TaskPlanning, Priority: PriorityHigh, EstimatedTokens: 5000, RequiresTools: false, ComplexityScore: 0.8},
|
||||
{Type: TaskOrchestration, Priority: PriorityCritical, EstimatedTokens: 8000, RequiresTools: true, ComplexityScore: 0.9},
|
||||
{Type: TaskSecurityReview, Priority: PriorityCritical, EstimatedTokens: 6000, RequiresTools: true, ComplexityScore: 0.85},
|
||||
{Type: TaskDebug, Priority: PriorityNormal, EstimatedTokens: 3000, RequiresTools: true, ComplexityScore: 0.6},
|
||||
{Type: TaskExplain, Priority: PriorityLow, EstimatedTokens: 1000, RequiresTools: false, ComplexityScore: 0.2},
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSelectBest(b *testing.B) {
|
||||
arms := benchArms()
|
||||
tasks := benchTasks()
|
||||
qt := NewQualityTracker()
|
||||
|
||||
b.ResetTimer()
|
||||
for b.Loop() {
|
||||
for _, task := range tasks {
|
||||
selectBest(qt, arms, task)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkFilterFeasible(b *testing.B) {
|
||||
arms := benchArms()
|
||||
tasks := benchTasks()
|
||||
|
||||
b.ResetTimer()
|
||||
for b.Loop() {
|
||||
for _, task := range tasks {
|
||||
filterFeasible(arms, task)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkRouterSelect(b *testing.B) {
|
||||
r := New(Config{})
|
||||
for _, arm := range benchArms() {
|
||||
r.RegisterArm(arm)
|
||||
}
|
||||
tasks := benchTasks()
|
||||
|
||||
b.ResetTimer()
|
||||
for b.Loop() {
|
||||
for _, task := range tasks {
|
||||
d := r.Select(task)
|
||||
if d.Error == nil {
|
||||
d.Commit(task.EstimatedTokens)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkScoreArm(b *testing.B) {
|
||||
arms := benchArms()
|
||||
qt := NewQualityTracker()
|
||||
task := Task{Type: TaskGeneration, Priority: PriorityNormal, EstimatedTokens: 2000, RequiresTools: true, ComplexityScore: 0.5}
|
||||
|
||||
b.ResetTimer()
|
||||
for b.Loop() {
|
||||
for _, arm := range arms {
|
||||
scoreArm(qt, arm, task)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkClassifyTask(b *testing.B) {
|
||||
prompts := []string{
|
||||
"fix the null pointer in handleRequest",
|
||||
"explain how the router selects arms",
|
||||
"refactor the authentication middleware to use the new session store",
|
||||
"add a new endpoint for user profile updates",
|
||||
"review the security of the payment processing flow for OWASP vulnerabilities",
|
||||
"write unit tests for the pool tracker",
|
||||
"plan the architecture for the plugin system",
|
||||
"scaffold a new provider adapter for Cohere",
|
||||
"orchestrate a multi-step migration: backup, schema change, data backfill, verify",
|
||||
"debug why the TUI freezes when streaming large responses",
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
for b.Loop() {
|
||||
for _, p := range prompts {
|
||||
ClassifyTask(p)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkRouterSelectWithQuality(b *testing.B) {
|
||||
r := New(Config{})
|
||||
for _, arm := range benchArms() {
|
||||
r.RegisterArm(arm)
|
||||
}
|
||||
tasks := benchTasks()
|
||||
|
||||
// Seed quality tracker with 20 observations per arm/task combo
|
||||
for _, arm := range benchArms() {
|
||||
for _, task := range tasks {
|
||||
for range 20 {
|
||||
r.quality.Record(arm.ID, task.Type, true)
|
||||
}
|
||||
// Mix in some failures for realism
|
||||
for range 3 {
|
||||
r.quality.Record(arm.ID, task.Type, false)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
for b.Loop() {
|
||||
for _, task := range tasks {
|
||||
d := r.Select(task)
|
||||
if d.Error == nil {
|
||||
d.Commit(task.EstimatedTokens)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user