Replace the Mistral + Ollama AI stack with a single Google Gemini provider backed by google.golang.org/genai. API key moves from env/Helm to the DB (AES-256-GCM, key derived from JWT_SECRET via HKDF) so it can be rotated via the admin UI without a pod restart. New: - pkg/crypto/secretbox — AES-256-GCM encrypt/decrypt for secrets at rest - pkg/ai/gemini — GeminiProvider with grounding, structured output, usage recording, and hot-reload (Reinitialize swaps client under mutex) - pkg/ai/usage — UsageRecorder interface + UsageEvent struct - domain/settings/store — DB-backed settings (model, grounding toggle, key) - domain/settings/usage — UsageRepo implementing UsageRecorder; ai_usage table - migrations 000021 (system_settings) + 000022 (ai_usage) - settings API: GET /ai, POST /ai/key, POST /ai/model, POST /ai/grounding, GET /ai/usage - admin UI: 4-card settings page — provider status, model selector, grounding toggle with quota, usage rollups + recent-calls table Removed: - pkg/ai/ollama, mistral_provider, ratelimiter (+ tests) - Helm AI_API_KEY, AI_PROVIDER, AI_MODEL_COMPLEX, AI_AGENT_DISCOVERY, AI_RATE_LIMIT_RPS env vars Call sites set Grounded+CallType: research (true/"research"), enrich Pass B (true/"enrich_b"), similarity (false/"similarity"). Integration test updated to use a stub ai.Provider instead of a fake Ollama HTTP server.
208 lines
6.1 KiB
Go
208 lines
6.1 KiB
Go
// discovery-eval measures discovery's AI-backed components against labelled
// fixtures. Two modes:
//
//	-mode similarity (default) — grades SimilarityClassifier on
//	    pair-labelled fixtures. Precision/recall/F1/accuracy
//	    + confidence calibration.
//	-mode category — grades LLMEnricher's `category` output on
//	    row-labelled fixtures. Accuracy + per-label confusion.
//
// Usage:
//
//	GEMINI_API_KEY=... \
//	discovery-eval \
//	  -mode similarity \
//	  -fixture backend/cmd/discovery-eval/fixtures/similarity.json \
//	  -cache .eval-cache.json \
//	  -threshold 0.8 \
//	  -report eval-report.json
//
// Each mode has its own cache key so switching modes doesn't churn entries.
// Set GEMINI_MODEL to override the model (default: gemini-2.5-flash-lite).
package main
|
|
|
|
import (
|
|
"context"
|
|
"flag"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"time"
|
|
|
|
"marktvogt.de/backend/internal/domain/discovery/enrich"
|
|
"marktvogt.de/backend/internal/pkg/ai"
|
|
"marktvogt.de/backend/internal/pkg/scrape"
|
|
)
|
|
|
|
// Eval modes accepted by the -mode flag.
const (
	modeSimilarity = "similarity" // grade SimilarityClassifier on pair-labelled fixtures
	modeCategory = "category" // grade LLMEnricher `category` output on row-labelled fixtures
)

// validModes is surfaced in error logs when an unknown -mode is supplied.
var validModes = []string{modeSimilarity, modeCategory}
|
|
|
|
// evalConfig carries the per-run settings shared by both eval modes,
// resolved from flags and environment in realMain.
type evalConfig struct {
	model string // Gemini model id; also used in cache keys and reports
	fixturePath string // labelled fixture JSON (per-mode default applied)
	cachePath string // local verdict cache; best-effort load/save
	reportPath string // optional machine-readable JSON report ("" disables)
	threshold float64 // gating threshold for F1/accuracy; 0 disables gating
}
|
|
|
|
// realMain returns the desired exit code. Kept separate from main() so
|
|
// deferred cleanup runs even on error paths — os.Exit would skip defers.
|
|
func realMain() int {
|
|
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
|
Level: slog.LevelInfo,
|
|
})))
|
|
|
|
var (
|
|
mode = flag.String("mode", modeSimilarity, "eval mode: similarity | category")
|
|
fixturePath = flag.String("fixture", "", "path to labelled fixture JSON (defaults per mode)")
|
|
cachePath = flag.String("cache", ".eval-cache.json", "path to local verdict cache (gitignored)")
|
|
reportPath = flag.String("report", "", "optional path to write machine-readable JSON report")
|
|
threshold = flag.Float64("threshold", 0.0, "fail (exit 1) when F1/accuracy is below this value; 0 disables gating")
|
|
)
|
|
flag.Parse()
|
|
|
|
apiKey := os.Getenv("GEMINI_API_KEY")
|
|
if apiKey == "" {
|
|
slog.Error("GEMINI_API_KEY is required for eval")
|
|
return 2
|
|
}
|
|
model := os.Getenv("GEMINI_MODEL")
|
|
if model == "" {
|
|
model = "gemini-2.5-flash-lite"
|
|
}
|
|
userAgent := os.Getenv("AI_USER_AGENT")
|
|
if userAgent == "" {
|
|
userAgent = "marktvogt-eval/1.0 (+https://marktvogt.de)"
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute)
|
|
defer cancel()
|
|
|
|
client, err := ai.NewGeminiProvider(ctx, apiKey, model, nil)
|
|
if err != nil {
|
|
slog.Error("AI client init failed", "error", err)
|
|
return 2
|
|
}
|
|
|
|
switch *mode {
|
|
case modeSimilarity:
|
|
cfg := evalConfig{
|
|
model: model,
|
|
fixturePath: pathWithDefault(*fixturePath, "backend/cmd/discovery-eval/fixtures/similarity.json"),
|
|
cachePath: *cachePath,
|
|
reportPath: *reportPath,
|
|
threshold: *threshold,
|
|
}
|
|
return runSimilarityMode(ctx, client, cfg)
|
|
case modeCategory:
|
|
scraper := scrape.New(userAgent)
|
|
enricher := enrich.NewLLMEnricher(client, scraper)
|
|
cfg := evalConfig{
|
|
model: model,
|
|
fixturePath: pathWithDefault(*fixturePath, "backend/cmd/discovery-eval/fixtures/category.json"),
|
|
cachePath: *cachePath,
|
|
reportPath: *reportPath,
|
|
threshold: *threshold,
|
|
}
|
|
return runCategoryMode(ctx, enricher, cfg)
|
|
default:
|
|
slog.Error("unknown mode", "mode", *mode, "valid", validModes)
|
|
return 2
|
|
}
|
|
}
|
|
|
|
// pathWithDefault returns p, falling back to dflt when p is empty.
func pathWithDefault(p, dflt string) string {
	if p != "" {
		return p
	}
	return dflt
}
|
|
|
|
func runSimilarityMode(ctx context.Context, client ai.Provider, cfg evalConfig) int {
|
|
fixture, err := loadFixture(cfg.fixturePath)
|
|
if err != nil {
|
|
slog.Error("load fixture", "path", cfg.fixturePath, "error", err)
|
|
return 2
|
|
}
|
|
slog.Info("loaded fixture", "mode", modeSimilarity, "pairs", len(fixture.Pairs), "path", cfg.fixturePath)
|
|
|
|
classifier := enrich.NewSimilarityClassifier(client)
|
|
|
|
cache, err := loadCache(cfg.cachePath)
|
|
if err != nil {
|
|
slog.Warn("cache load failed; starting empty", "path", cfg.cachePath, "error", err)
|
|
}
|
|
|
|
results, err := run(ctx, classifier, cache, fixture, cfg.model)
|
|
if err != nil {
|
|
slog.Error("eval run failed", "error", err)
|
|
return 2
|
|
}
|
|
|
|
if err := saveCache(cfg.cachePath, cache); err != nil {
|
|
slog.Warn("cache save failed; metrics still reported", "path", cfg.cachePath, "error", err)
|
|
}
|
|
|
|
metrics := computeMetrics(results)
|
|
printSummary(os.Stdout, results, metrics, cfg.model)
|
|
|
|
if cfg.reportPath != "" {
|
|
if err := writeReport(cfg.reportPath, results, metrics, cfg.model); err != nil {
|
|
slog.Warn("report write failed", "path", cfg.reportPath, "error", err)
|
|
}
|
|
}
|
|
|
|
if cfg.threshold > 0 && metrics.F1 < cfg.threshold {
|
|
fmt.Fprintf(os.Stderr, "\nFAIL: F1=%.3f < threshold=%.3f\n", metrics.F1, cfg.threshold)
|
|
return 1
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func runCategoryMode(ctx context.Context, enricher enrich.LLMEnricher, cfg evalConfig) int {
|
|
fixture, err := loadCategoryFixture(cfg.fixturePath)
|
|
if err != nil {
|
|
slog.Error("load fixture", "path", cfg.fixturePath, "error", err)
|
|
return 2
|
|
}
|
|
slog.Info("loaded fixture", "mode", modeCategory, "rows", len(fixture.Rows), "path", cfg.fixturePath)
|
|
|
|
cache, err := loadCategoryCache(cfg.cachePath)
|
|
if err != nil {
|
|
slog.Warn("cache load failed; starting empty", "path", cfg.cachePath, "error", err)
|
|
}
|
|
|
|
results, err := runCategory(ctx, enricher, cache, fixture, cfg.model)
|
|
if err != nil {
|
|
slog.Error("eval run failed", "error", err)
|
|
return 2
|
|
}
|
|
|
|
if err := saveCategoryCache(cfg.cachePath, cache); err != nil {
|
|
slog.Warn("cache save failed; metrics still reported", "path", cfg.cachePath, "error", err)
|
|
}
|
|
|
|
metrics := computeCategoryMetrics(results)
|
|
printCategorySummary(os.Stdout, results, metrics, cfg.model)
|
|
|
|
if cfg.reportPath != "" {
|
|
if err := writeCategoryReport(cfg.reportPath, results, metrics, cfg.model); err != nil {
|
|
slog.Warn("report write failed", "path", cfg.reportPath, "error", err)
|
|
}
|
|
}
|
|
|
|
if cfg.threshold > 0 && metrics.Accuracy < cfg.threshold {
|
|
fmt.Fprintf(os.Stderr, "\nFAIL: accuracy=%.3f < threshold=%.3f\n", metrics.Accuracy, cfg.threshold)
|
|
return 1
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// main is a thin shell around realMain: putting all logic (and its defers)
// in realMain means os.Exit here cannot skip deferred cleanup.
func main() {
	os.Exit(realMain())
}
|