Files
vikingowl 3ddfd87408 feat(ai): migrate to Google Gemini 2.5 Flash-Lite, drop Mistral/Ollama
Replace the Mistral + Ollama AI stack with a single Google Gemini provider
backed by google.golang.org/genai. API key moves from env/Helm to the DB
(AES-256-GCM, key derived from JWT_SECRET via HKDF) so it can be rotated
via the admin UI without a pod restart.

New:
- pkg/crypto/secretbox — AES-256-GCM encrypt/decrypt for secrets at rest
- pkg/ai/gemini — GeminiProvider with grounding, structured output, usage
  recording, and hot-reload (Reinitialize swaps client under mutex)
- pkg/ai/usage — UsageRecorder interface + UsageEvent struct
- domain/settings/store — DB-backed settings (model, grounding toggle, key)
- domain/settings/usage — UsageRepo implementing UsageRecorder; ai_usage table
- migrations 000021 (system_settings) + 000022 (ai_usage)
- settings API: GET /ai, POST /ai/key, POST /ai/model, POST /ai/grounding,
  GET /ai/usage
- admin UI: 4-card settings page — provider status, model selector, grounding
  toggle with quota, usage rollups + recent-calls table

Removed:
- pkg/ai/ollama, mistral_provider, ratelimiter (+ tests)
- Helm AI_API_KEY, AI_PROVIDER, AI_MODEL_COMPLEX, AI_AGENT_DISCOVERY,
  AI_RATE_LIMIT_RPS env vars

Call sites set Grounded+CallType: research (true/"research"), enrich Pass B
(true/"enrich_b"), similarity (false/"similarity"). Integration test updated
to use a stub ai.Provider instead of a fake Ollama HTTP server.
2026-04-25 09:54:49 +02:00

208 lines
6.1 KiB
Go

// discovery-eval measures discovery's AI-backed components against labelled
// fixtures. Two modes:
//
// -mode similarity (default) — grades SimilarityClassifier on
// pair-labelled fixtures. Precision/recall/F1/accuracy
// + confidence calibration.
// -mode category — grades LLMEnricher's `category` output on
// row-labelled fixtures. Accuracy + per-label confusion.
//
// Usage:
//
// GEMINI_API_KEY=... \
// discovery-eval \
// -mode similarity \
// -fixture backend/cmd/discovery-eval/fixtures/similarity.json \
// -cache .eval-cache.json \
// -threshold 0.8 \
// -report eval-report.json
//
// Each mode has its own cache key so switching modes doesn't churn entries.
// Set GEMINI_MODEL to override the model (default: gemini-2.5-flash-lite).
package main
import (
"context"
"flag"
"fmt"
"log/slog"
"os"
"time"
"marktvogt.de/backend/internal/domain/discovery/enrich"
"marktvogt.de/backend/internal/pkg/ai"
"marktvogt.de/backend/internal/pkg/scrape"
)
// Eval modes selectable via the -mode flag.
const (
modeSimilarity = "similarity" // grades SimilarityClassifier on pair-labelled fixtures
modeCategory = "category" // grades LLMEnricher's category output on row-labelled fixtures
)
// validModes is surfaced in the error log when an unknown -mode is given.
var validModes = []string{modeSimilarity, modeCategory}
// evalConfig carries the per-mode settings resolved from flags and env.
type evalConfig struct {
model string // Gemini model name (GEMINI_MODEL env or default)
fixturePath string // path to the labelled fixture JSON for this mode
cachePath string // local verdict cache file (shared across modes; keys are per-mode)
reportPath string // optional machine-readable JSON report path; "" disables
threshold float64 // gating threshold; <= 0 disables exit-1 gating
}
// realMain returns the desired exit code. Kept separate from main() so
// deferred cleanup runs even on error paths — os.Exit would skip defers.
// realMain returns the desired exit code. Kept separate from main() so
// deferred cleanup runs even on error paths — os.Exit would skip defers.
//
// Exit codes: 0 success, 1 threshold gate failed (inside the mode runners),
// 2 configuration or runtime error.
func realMain() int {
	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		Level: slog.LevelInfo,
	})))

	var (
		mode        = flag.String("mode", modeSimilarity, "eval mode: similarity | category")
		fixturePath = flag.String("fixture", "", "path to labelled fixture JSON (defaults per mode)")
		cachePath   = flag.String("cache", ".eval-cache.json", "path to local verdict cache (gitignored)")
		reportPath  = flag.String("report", "", "optional path to write machine-readable JSON report")
		threshold   = flag.Float64("threshold", 0.0, "fail (exit 1) when F1/accuracy is below this value; 0 disables gating")
	)
	flag.Parse()

	key := os.Getenv("GEMINI_API_KEY")
	if key == "" {
		slog.Error("GEMINI_API_KEY is required for eval")
		return 2
	}

	modelName := os.Getenv("GEMINI_MODEL")
	if modelName == "" {
		modelName = "gemini-2.5-flash-lite"
	}
	ua := os.Getenv("AI_USER_AGENT")
	if ua == "" {
		ua = "marktvogt-eval/1.0 (+https://marktvogt.de)"
	}

	// Hard cap on the whole eval run; cancel releases the timer on early return.
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute)
	defer cancel()

	provider, err := ai.NewGeminiProvider(ctx, key, modelName, nil)
	if err != nil {
		slog.Error("AI client init failed", "error", err)
		return 2
	}

	switch *mode {
	case modeSimilarity:
		cfg := evalConfig{
			model:       modelName,
			fixturePath: pathWithDefault(*fixturePath, "backend/cmd/discovery-eval/fixtures/similarity.json"),
			cachePath:   *cachePath,
			reportPath:  *reportPath,
			threshold:   *threshold,
		}
		return runSimilarityMode(ctx, provider, cfg)
	case modeCategory:
		// Category mode needs a scraper-backed enricher; similarity does not.
		enricher := enrich.NewLLMEnricher(provider, scrape.New(ua))
		cfg := evalConfig{
			model:       modelName,
			fixturePath: pathWithDefault(*fixturePath, "backend/cmd/discovery-eval/fixtures/category.json"),
			cachePath:   *cachePath,
			reportPath:  *reportPath,
			threshold:   *threshold,
		}
		return runCategoryMode(ctx, enricher, cfg)
	default:
		slog.Error("unknown mode", "mode", *mode, "valid", validModes)
		return 2
	}
}
// pathWithDefault returns p when it is non-empty, otherwise dflt.
func pathWithDefault(p, dflt string) string {
	if p != "" {
		return p
	}
	return dflt
}
// runSimilarityMode grades the SimilarityClassifier against a pair-labelled
// fixture and returns the process exit code (0 ok, 1 below threshold, 2 error).
func runSimilarityMode(ctx context.Context, client ai.Provider, cfg evalConfig) int {
	fixture, err := loadFixture(cfg.fixturePath)
	if err != nil {
		slog.Error("load fixture", "path", cfg.fixturePath, "error", err)
		return 2
	}
	slog.Info("loaded fixture", "mode", modeSimilarity, "pairs", len(fixture.Pairs), "path", cfg.fixturePath)

	// A failed cache load is non-fatal: we just re-query every pair.
	cache, err := loadCache(cfg.cachePath)
	if err != nil {
		slog.Warn("cache load failed; starting empty", "path", cfg.cachePath, "error", err)
	}

	results, err := run(ctx, enrich.NewSimilarityClassifier(client), cache, fixture, cfg.model)
	if err != nil {
		slog.Error("eval run failed", "error", err)
		return 2
	}
	// Persist fresh verdicts even if the report later fails; a save error only
	// costs us cache hits on the next run.
	if err := saveCache(cfg.cachePath, cache); err != nil {
		slog.Warn("cache save failed; metrics still reported", "path", cfg.cachePath, "error", err)
	}

	metrics := computeMetrics(results)
	printSummary(os.Stdout, results, metrics, cfg.model)
	if cfg.reportPath != "" {
		if err := writeReport(cfg.reportPath, results, metrics, cfg.model); err != nil {
			slog.Warn("report write failed", "path", cfg.reportPath, "error", err)
		}
	}

	// Gate on F1 only when a positive threshold was requested.
	if cfg.threshold > 0 && metrics.F1 < cfg.threshold {
		fmt.Fprintf(os.Stderr, "\nFAIL: F1=%.3f < threshold=%.3f\n", metrics.F1, cfg.threshold)
		return 1
	}
	return 0
}
// runCategoryMode grades the enricher's category output against a row-labelled
// fixture and returns the process exit code (0 ok, 1 below threshold, 2 error).
func runCategoryMode(ctx context.Context, enricher enrich.LLMEnricher, cfg evalConfig) int {
	fixture, err := loadCategoryFixture(cfg.fixturePath)
	if err != nil {
		slog.Error("load fixture", "path", cfg.fixturePath, "error", err)
		return 2
	}
	slog.Info("loaded fixture", "mode", modeCategory, "rows", len(fixture.Rows), "path", cfg.fixturePath)

	// A failed cache load is non-fatal: we just re-query every row.
	cache, err := loadCategoryCache(cfg.cachePath)
	if err != nil {
		slog.Warn("cache load failed; starting empty", "path", cfg.cachePath, "error", err)
	}

	results, err := runCategory(ctx, enricher, cache, fixture, cfg.model)
	if err != nil {
		slog.Error("eval run failed", "error", err)
		return 2
	}
	// Persist fresh verdicts even if the report later fails; a save error only
	// costs us cache hits on the next run.
	if err := saveCategoryCache(cfg.cachePath, cache); err != nil {
		slog.Warn("cache save failed; metrics still reported", "path", cfg.cachePath, "error", err)
	}

	metrics := computeCategoryMetrics(results)
	printCategorySummary(os.Stdout, results, metrics, cfg.model)
	if cfg.reportPath != "" {
		if err := writeCategoryReport(cfg.reportPath, results, metrics, cfg.model); err != nil {
			slog.Warn("report write failed", "path", cfg.reportPath, "error", err)
		}
	}

	// Gate on accuracy only when a positive threshold was requested.
	if cfg.threshold > 0 && metrics.Accuracy < cfg.threshold {
		fmt.Fprintf(os.Stderr, "\nFAIL: accuracy=%.3f < threshold=%.3f\n", metrics.Accuracy, cfg.threshold)
		return 1
	}
	return 0
}
// main delegates to realMain so that deferred cleanup inside it runs before
// the process exits — os.Exit skips deferred calls.
func main() {
	code := realMain()
	os.Exit(code)
}