Files
vikingowl cf5408ab66 feat(discovery): eval harness for the AI similarity classifier
Ship 2 MR 5. Adds a CLI that measures MistralSimilarityClassifier
against a labelled fixture: precision, recall, F1, accuracy, plus a
confidence calibration table so we can tell whether "90% confident"
verdicts are actually right 90% of the time.

Usage: go run ./backend/cmd/discovery-eval -fixture ... -cache ...
-threshold 0.8 -report eval-report.json.

Structure
- main.go: arg parsing + wiring (ai.Client, classifier, cache,
  metrics). The work happens in realMain() which returns an exit code
  — keeps defers running on error paths.
- fixture.go: parses labelled pairs JSON. Fixture authors only need to
  fill in name/stadt/year; name_normalized falls back to name when
  omitted.
- cache.go: file-backed map keyed by SimilarityPairKey + model string.
  Symmetric (a,b) == (b,a). Atomic writes (temp file + rename) so a
  crashed run cannot corrupt the cache. Corrupt-file load returns an
  empty usable cache and reports the parse error.
- run.go: executes each pair through the classifier, populating the
  cache. Individual classify errors are downgraded to "not correct"
  and logged — the run always finishes so the operator sees whatever
  data is available.
- metrics.go: confusion matrix, P/R/F1/accuracy, per-confidence-bucket
  calibration over [0, 0.5), [0.5, 0.75), [0.75, 0.9), [0.9, 1.0].
  Prints human summary + surfaces highest-confidence mismatches
  first (most actionable for prompt iteration). Optional JSON report.
- Threshold gate: -threshold N exits non-zero when F1<N. Default 0
  (gating disabled until we have a baseline F1).

Fixture: seeds 15 hand-crafted DACH-market pairs covering the edge
cases we actually care about — umlaut drift (Straßburg/Strassburg),
year difference on a recurring series, word-reordering, distinct
events at the same venue, historical proper names (Striezelmarkt),
same city with multiple distinct Christmas markets. Operator extends
over time; each pair carries a `note` explaining the case it locks.

.gitignore adds .eval-cache.json and eval-report.json — neither
should land in the repo.

Tests cover metrics edge cases (all correct, imbalanced,
no-positive-predictions-no-NaN, calibration bucket assignment,
cache accounting, empty input) and cache behaviour (round-trip,
symmetric lookup, model-scoped invalidation, missing/corrupt file
handling, atomic-write leaves no temp files).

Out of scope for MR 5: enrichment field accuracy (fuzzy text
scoring is its own problem — tracked for a follow-up), CI wiring
(needs a baseline F1 first).
2026-04-24 12:26:18 +02:00

107 lines
3.3 KiB
Go

package main
import (
"os"
"path/filepath"
"testing"
"marktvogt.de/backend/internal/domain/discovery/enrich"
)
func TestCache_RoundTrip(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "cache.json")
c := newCache()
a := enrich.SimilarityRow{NameNormalized: "ritterfest dresden", Stadt: "Dresden", Year: 2026}
b := enrich.SimilarityRow{NameNormalized: "mittelaltermarkt dresden", Stadt: "Dresden", Year: 2026}
want := CachedVerdict{Same: false, Confidence: 0.72, Reason: "distinct events", Model: "m"}
c.Put(a, b, "m", want)
if err := saveCache(path, c); err != nil {
t.Fatalf("save: %v", err)
}
loaded, err := loadCache(path)
if err != nil {
t.Fatalf("load: %v", err)
}
got, ok := loaded.Get(a, b, "m")
if !ok {
t.Fatalf("cache miss after round-trip")
}
if got.Same != want.Same || got.Confidence != want.Confidence || got.Reason != want.Reason {
t.Errorf("verdict changed across round-trip: got=%+v want=%+v", got, want)
}
}
func TestCache_SymmetricKey(t *testing.T) {
// (a, b) and (b, a) must hit the same entry — classifier is symmetric.
c := newCache()
a := enrich.SimilarityRow{NameNormalized: "a", Stadt: "A", Year: 2026}
b := enrich.SimilarityRow{NameNormalized: "b", Stadt: "B", Year: 2026}
c.Put(a, b, "m", CachedVerdict{Same: true, Confidence: 0.9})
if _, ok := c.Get(b, a, "m"); !ok {
t.Error("reversed lookup should hit the same entry (SimilarityPairKey is symmetric)")
}
}
// TestCache_ModelScopesEntries checks that cache entries are scoped by model
// string: a verdict stored under one model must not be returned for another,
// because two different model names can produce different verdicts on the
// same pair.
func TestCache_ModelScopesEntries(t *testing.T) {
	c := newCache()
	a := enrich.SimilarityRow{NameNormalized: "x", Stadt: "X", Year: 2026}
	b := enrich.SimilarityRow{NameNormalized: "y", Stadt: "Y", Year: 2026}
	c.Put(a, b, "m1", CachedVerdict{Same: true, Confidence: 0.9})

	// Positive control: without it, the miss assertion below would pass
	// vacuously if Put/Get never produced a hit at all.
	if _, ok := c.Get(a, b, "m1"); !ok {
		t.Fatal("cache miss under the model the entry was stored with")
	}

	if _, ok := c.Get(a, b, "m2"); ok {
		t.Error("cache hit under a different model; should be a miss")
	}
}
// TestLoadCache_MissingFile checks that loading a path that does not exist
// yields no error and a non-nil cache with a non-nil Entries field.
func TestLoadCache_MissingFile(t *testing.T) {
	missing := filepath.Join(t.TempDir(), "does-not-exist.json")

	cache, err := loadCache(missing)
	if err != nil {
		t.Errorf("missing file should not error; got %v", err)
	}

	usable := cache != nil && cache.Entries != nil
	if !usable {
		t.Error("missing-file load should return an empty usable cache")
	}
}
// TestLoadCache_CorruptFileStartsEmpty writes invalid JSON to the cache path
// and checks that loadCache both reports an error and still hands back a
// non-nil cache with a non-nil Entries field.
func TestLoadCache_CorruptFileStartsEmpty(t *testing.T) {
	const garbage = "{not valid json"

	cachePath := filepath.Join(t.TempDir(), "cache.json")
	if writeErr := os.WriteFile(cachePath, []byte(garbage), 0o644); writeErr != nil {
		t.Fatal(writeErr)
	}

	cache, loadErr := loadCache(cachePath)
	if loadErr == nil {
		t.Error("expected a parse error to be reported (so the operator can investigate)")
	}

	usable := cache != nil && cache.Entries != nil
	if !usable {
		t.Error("corrupt file should still return a usable empty cache")
	}
}
// TestSaveCache_AtomicWrite saves an empty cache into a fresh directory and
// checks that the directory then contains cache.json and nothing else.
//
// Not a perfect test of atomicity — that's hard to exercise without
// injecting a crash — but it catches the common failure where the tmp file
// is leaked, including the case where the tmp file is the only entry left.
func TestSaveCache_AtomicWrite(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "cache.json")
	if err := saveCache(path, newCache()); err != nil {
		t.Fatal(err)
	}

	entries, err := os.ReadDir(dir)
	if err != nil {
		t.Fatal(err)
	}

	names := make([]string, 0, len(entries))
	for _, e := range entries {
		names = append(names, e.Name())
	}

	// A count check alone would accept a single stray temp file; the sole
	// entry must also be the target file itself.
	if len(names) != 1 || names[0] != filepath.Base(path) {
		t.Errorf("expected exactly cache.json in dir; got %v", names)
	}
}