Files
marktvogt.de/backend/cmd/discovery-eval/main.go
vikingowl 24e072b63d feat(ai): pluggable provider interface, Ollama + Mistral impls, migrate Pass2 sites
Replaces the Mistral-only ai.Client with an ai.Provider interface backed by
Ollama and Mistral implementations. Migrates enrichment + similarity callers
to ai.Provider.Chat. Research endpoint returns 501 until commit 2 reinstates
it on the new orchestrator.
2026-04-24 16:35:18 +02:00

215 lines
6.3 KiB
Go

// discovery-eval measures discovery's AI-backed components against labelled
// fixtures. Two modes:
//
// -mode similarity (default) — grades the similarity classifier
// (enrich.NewSimilarityClassifier) on pair-labelled fixtures.
// Precision/recall/F1/accuracy + confidence calibration.
// -mode category — grades the `category` output of the LLM enricher
// (enrich.NewLLMEnricher) on row-labelled fixtures. Accuracy +
// per-label confusion.
//
// Usage:
//
// AI_MISTRAL_API_KEY=... AI_MISTRAL_MODEL=mistral-large-latest \
// discovery-eval \
// -mode similarity \
// -fixture backend/cmd/discovery-eval/fixtures/similarity.json \
// -cache .eval-cache.json \
// -threshold 0.8 \
// -report eval-report.json
//
// Each mode has its own cache key so switching modes doesn't churn entries.
// Change AI_MISTRAL_MODEL (legacy fallback: AI_MODEL_COMPLEX) or edit a fixture to force a refresh.
package main
import (
"context"
"flag"
"fmt"
"log/slog"
"os"
"time"
"marktvogt.de/backend/internal/config"
"marktvogt.de/backend/internal/domain/discovery/enrich"
"marktvogt.de/backend/internal/pkg/ai"
"marktvogt.de/backend/internal/pkg/scrape"
)
// Values accepted by the -mode flag.
const (
// modeSimilarity selects the similarity eval (dispatched to runSimilarityMode).
modeSimilarity = "similarity"
// modeCategory selects the category eval (dispatched to runCategoryMode).
modeCategory = "category"
)
// validModes lists every accepted -mode value; it is attached to the
// "unknown mode" error log so the user can see the valid choices.
var validModes = []string{modeSimilarity, modeCategory}
// evalConfig bundles the per-run settings handed to a mode runner.
type evalConfig struct {
	// model is the AI model name; it is passed to the run, summary and report helpers.
	model string

	// fixturePath is the labelled fixture to load, cachePath the local verdict
	// cache to read and write, and reportPath the optional JSON report
	// destination (empty means no report is written).
	fixturePath, cachePath, reportPath string

	// threshold is the minimum acceptable headline metric; gating only applies
	// when it is greater than zero.
	threshold float64
}
// realMain parses flags and environment, builds the AI client and dispatches
// to the selected eval mode. It returns the desired exit code: whatever the
// mode runner returns, or 2 when the mode is unknown or the AI client cannot
// be configured. Kept separate from main() so deferred cleanup runs even on
// error paths — os.Exit would skip defers.
func realMain() int {
	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		Level: slog.LevelInfo,
	})))

	var (
		mode        = flag.String("mode", modeSimilarity, "eval mode: similarity | category")
		fixturePath = flag.String("fixture", "", "path to labelled fixture JSON (defaults per mode)")
		cachePath   = flag.String("cache", ".eval-cache.json", "path to local verdict cache (gitignored)")
		reportPath  = flag.String("report", "", "optional path to write machine-readable JSON report")
		threshold   = flag.Float64("threshold", 0.0, "fail (exit 1) when F1/accuracy is below this value; 0 disables gating")
	)
	flag.Parse()

	// Validate -mode before reading the environment or building the client, so
	// a mistyped mode is reported as such instead of being masked by an
	// unrelated "AI client not configured" error.
	var defaultFixture string
	switch *mode {
	case modeSimilarity:
		defaultFixture = "backend/cmd/discovery-eval/fixtures/similarity.json"
	case modeCategory:
		defaultFixture = "backend/cmd/discovery-eval/fixtures/category.json"
	default:
		slog.Error("unknown mode", "mode", *mode, "valid", validModes)
		return 2
	}

	apiKey := os.Getenv("AI_MISTRAL_API_KEY")
	if apiKey == "" {
		apiKey = os.Getenv("AI_API_KEY") // legacy fallback
	}
	model := os.Getenv("AI_MISTRAL_MODEL")
	if model == "" {
		model = os.Getenv("AI_MODEL_COMPLEX") // legacy fallback
	}
	if model == "" {
		model = "mistral-large-latest"
	}

	client, err := ai.NewFromConfig(config.AIConfig{
		Provider:      "mistral",
		MistralAPIKey: apiKey,
		MistralModel:  model,
		RateLimitRPS:  1.0,
	})
	if err != nil {
		slog.Error("AI client not configured", "error", err)
		return 2
	}

	// Both modes share the same settings; only the default fixture differs.
	cfg := evalConfig{
		model:       model,
		fixturePath: pathWithDefault(*fixturePath, defaultFixture),
		cachePath:   *cachePath,
		reportPath:  *reportPath,
		threshold:   *threshold,
	}

	// Hard upper bound on the whole evaluation run.
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute)
	defer cancel()

	if *mode == modeCategory {
		// Only category mode builds a scraper, so the user agent is resolved here.
		userAgent := os.Getenv("AI_USER_AGENT")
		if userAgent == "" {
			userAgent = "marktvogt-eval/1.0 (+https://marktvogt.de)"
		}
		scraper := scrape.New(userAgent)
		enricher := enrich.NewLLMEnricher(client, scraper)
		return runCategoryMode(ctx, enricher, cfg)
	}
	return runSimilarityMode(ctx, client, cfg)
}
// pathWithDefault returns p unless it is empty, in which case dflt is returned.
func pathWithDefault(p, dflt string) string {
	if p != "" {
		return p
	}
	return dflt
}
// runSimilarityMode evaluates the similarity classifier against the fixture
// at cfg.fixturePath and returns the process exit code:
//
//   - 0 when the run completed and gating passed (or is disabled),
//   - 1 when cfg.threshold > 0 and F1 is below it,
//   - 2 when the fixture cannot be loaded or the eval run fails.
//
// Cache load/save and report write failures are logged as warnings and do not
// change the exit code.
func runSimilarityMode(ctx context.Context, client ai.Provider, cfg evalConfig) int {
	fx, loadErr := loadFixture(cfg.fixturePath)
	if loadErr != nil {
		slog.Error("load fixture", "path", cfg.fixturePath, "error", loadErr)
		return 2
	}
	slog.Info("loaded fixture", "mode", modeSimilarity, "pairs", len(fx.Pairs), "path", cfg.fixturePath)

	classifier := enrich.NewSimilarityClassifier(client)

	// NOTE(review): on a load failure the cache value returned alongside the
	// error is used as-is — confirm loadCache hands back a usable empty cache.
	verdicts, cacheErr := loadCache(cfg.cachePath)
	if cacheErr != nil {
		slog.Warn("cache load failed; starting empty", "path", cfg.cachePath, "error", cacheErr)
	}

	graded, runErr := run(ctx, classifier, verdicts, fx, cfg.model)
	if runErr != nil {
		slog.Error("eval run failed", "error", runErr)
		return 2
	}

	// Persist the cache before reporting; a failed save is not fatal.
	if err := saveCache(cfg.cachePath, verdicts); err != nil {
		slog.Warn("cache save failed; metrics still reported", "path", cfg.cachePath, "error", err)
	}

	m := computeMetrics(graded)
	printSummary(os.Stdout, graded, m, cfg.model)

	if dest := cfg.reportPath; dest != "" {
		if err := writeReport(dest, graded, m, cfg.model); err != nil {
			slog.Warn("report write failed", "path", dest, "error", err)
		}
	}

	// A non-positive threshold disables gating.
	if cfg.threshold <= 0 {
		return 0
	}
	if m.F1 < cfg.threshold {
		fmt.Fprintf(os.Stderr, "\nFAIL: F1=%.3f < threshold=%.3f\n", m.F1, cfg.threshold)
		return 1
	}
	return 0
}
// runCategoryMode evaluates the enricher's category output against the
// fixture at cfg.fixturePath and returns the process exit code:
//
//   - 0 when the run completed and gating passed (or is disabled),
//   - 1 when cfg.threshold > 0 and accuracy is below it,
//   - 2 when the fixture cannot be loaded or the eval run fails.
//
// Cache load/save and report write failures are logged as warnings and do not
// change the exit code.
func runCategoryMode(ctx context.Context, enricher enrich.LLMEnricher, cfg evalConfig) int {
	fx, loadErr := loadCategoryFixture(cfg.fixturePath)
	if loadErr != nil {
		slog.Error("load fixture", "path", cfg.fixturePath, "error", loadErr)
		return 2
	}
	slog.Info("loaded fixture", "mode", modeCategory, "rows", len(fx.Rows), "path", cfg.fixturePath)

	// NOTE(review): on a load failure the cache value returned alongside the
	// error is used as-is — confirm loadCategoryCache hands back a usable
	// empty cache.
	verdicts, cacheErr := loadCategoryCache(cfg.cachePath)
	if cacheErr != nil {
		slog.Warn("cache load failed; starting empty", "path", cfg.cachePath, "error", cacheErr)
	}

	graded, runErr := runCategory(ctx, enricher, verdicts, fx, cfg.model)
	if runErr != nil {
		slog.Error("eval run failed", "error", runErr)
		return 2
	}

	// Persist the cache before reporting; a failed save is not fatal.
	if err := saveCategoryCache(cfg.cachePath, verdicts); err != nil {
		slog.Warn("cache save failed; metrics still reported", "path", cfg.cachePath, "error", err)
	}

	m := computeCategoryMetrics(graded)
	printCategorySummary(os.Stdout, graded, m, cfg.model)

	if dest := cfg.reportPath; dest != "" {
		if err := writeCategoryReport(dest, graded, m, cfg.model); err != nil {
			slog.Warn("report write failed", "path", dest, "error", err)
		}
	}

	// A non-positive threshold disables gating.
	if cfg.threshold <= 0 {
		return 0
	}
	if m.Accuracy < cfg.threshold {
		fmt.Fprintf(os.Stderr, "\nFAIL: accuracy=%.3f < threshold=%.3f\n", m.Accuracy, cfg.threshold)
		return 1
	}
	return 0
}
// main is the process entry point. All work happens in realMain so that its
// deferred calls have run before the process exits with the returned code.
func main() {
	code := realMain()
	os.Exit(code)
}