Ship 2 MR 5. Adds a CLI that measures MistralSimilarityClassifier against a labelled fixture: precision, recall, F1, accuracy, plus a confidence calibration table so we can tell whether "90% confident" verdicts are actually right 90% of the time.

Usage: go run ./backend/cmd/discovery-eval -fixture ... -cache ... -threshold 0.8 -report eval-report.json

Structure:
- main.go: arg parsing + wiring (ai.Client, classifier, cache, metrics). The work happens in realMain(), which returns an exit code — this keeps defers running on error paths.
- fixture.go: parses labelled pairs JSON. Fixture authors only need to fill in name/stadt/year; name_normalized falls back to name when omitted.
- cache.go: file-backed map keyed by SimilarityPairKey + model string. Symmetric: (a,b) == (b,a). Atomic writes (temp file + rename) so a crashed run cannot corrupt the cache. Corrupt-file load returns an empty usable cache and reports the parse error.
- run.go: executes each pair through the classifier, populating the cache. Individual classify errors are downgraded to "not correct" and logged — the run always finishes so the operator sees whatever data is available.
- metrics.go: confusion matrix, P/R/F1/accuracy, per-confidence-bucket calibration ([0-0.5), [0.5-0.75), [0.75-0.9), [0.9-1.0]). Prints a human summary and surfaces the highest-confidence mismatches first (most actionable for prompt iteration). Optional JSON report.
- Threshold gate: -threshold N exits non-zero when F1<N. Default 0 (gating disabled until we have a baseline F1).

Fixture: seeds 15 hand-crafted DACH-market pairs covering the edge cases we actually care about — umlaut drift (Straßburg/Strassburg), year difference on a recurring series, word-reordering, distinct events at the same venue, historical proper names (Striezelmarkt), same city with multiple distinct Christmas markets. The operator extends it over time; each pair carries a `note` explaining the case it locks.
.gitignore adds .eval-cache.json and eval-report.json — neither should land in the repo. Tests cover metrics edge cases (all correct, imbalanced, no-positive-predictions-no-NaN, calibration bucket assignment, cache accounting, empty input) and cache behaviour (round-trip, symmetric lookup, model-scoped invalidation, missing/corrupt file handling, atomic-write leaves no temp files). Out of scope for MR 5: enrichment field accuracy (fuzzy text scoring is its own problem — tracked for a follow-up), CI wiring (needs a baseline F1 first).
107 lines
3.3 KiB
Go
107 lines
3.3 KiB
Go
package main
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
|
|
"marktvogt.de/backend/internal/domain/discovery/enrich"
|
|
)
|
|
|
|
func TestCache_RoundTrip(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "cache.json")
|
|
|
|
c := newCache()
|
|
a := enrich.SimilarityRow{NameNormalized: "ritterfest dresden", Stadt: "Dresden", Year: 2026}
|
|
b := enrich.SimilarityRow{NameNormalized: "mittelaltermarkt dresden", Stadt: "Dresden", Year: 2026}
|
|
want := CachedVerdict{Same: false, Confidence: 0.72, Reason: "distinct events", Model: "m"}
|
|
c.Put(a, b, "m", want)
|
|
|
|
if err := saveCache(path, c); err != nil {
|
|
t.Fatalf("save: %v", err)
|
|
}
|
|
|
|
loaded, err := loadCache(path)
|
|
if err != nil {
|
|
t.Fatalf("load: %v", err)
|
|
}
|
|
got, ok := loaded.Get(a, b, "m")
|
|
if !ok {
|
|
t.Fatalf("cache miss after round-trip")
|
|
}
|
|
if got.Same != want.Same || got.Confidence != want.Confidence || got.Reason != want.Reason {
|
|
t.Errorf("verdict changed across round-trip: got=%+v want=%+v", got, want)
|
|
}
|
|
}
|
|
|
|
func TestCache_SymmetricKey(t *testing.T) {
|
|
// (a, b) and (b, a) must hit the same entry — classifier is symmetric.
|
|
c := newCache()
|
|
a := enrich.SimilarityRow{NameNormalized: "a", Stadt: "A", Year: 2026}
|
|
b := enrich.SimilarityRow{NameNormalized: "b", Stadt: "B", Year: 2026}
|
|
c.Put(a, b, "m", CachedVerdict{Same: true, Confidence: 0.9})
|
|
if _, ok := c.Get(b, a, "m"); !ok {
|
|
t.Error("reversed lookup should hit the same entry (SimilarityPairKey is symmetric)")
|
|
}
|
|
}
|
|
|
|
func TestCache_ModelScopesEntries(t *testing.T) {
|
|
// Changing the model string must invalidate — two different model names
|
|
// can produce different verdicts on the same pair.
|
|
c := newCache()
|
|
a := enrich.SimilarityRow{NameNormalized: "x", Stadt: "X", Year: 2026}
|
|
b := enrich.SimilarityRow{NameNormalized: "y", Stadt: "Y", Year: 2026}
|
|
c.Put(a, b, "m1", CachedVerdict{Same: true, Confidence: 0.9})
|
|
if _, ok := c.Get(a, b, "m2"); ok {
|
|
t.Error("cache hit under a different model; should be a miss")
|
|
}
|
|
}
|
|
|
|
func TestLoadCache_MissingFile(t *testing.T) {
|
|
c, err := loadCache(filepath.Join(t.TempDir(), "does-not-exist.json"))
|
|
if err != nil {
|
|
t.Errorf("missing file should not error; got %v", err)
|
|
}
|
|
if c == nil || c.Entries == nil {
|
|
t.Error("missing-file load should return an empty usable cache")
|
|
}
|
|
}
|
|
|
|
func TestLoadCache_CorruptFileStartsEmpty(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "cache.json")
|
|
if err := os.WriteFile(path, []byte("{not valid json"), 0o644); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
c, err := loadCache(path)
|
|
if err == nil {
|
|
t.Error("expected a parse error to be reported (so the operator can investigate)")
|
|
}
|
|
if c == nil || c.Entries == nil {
|
|
t.Error("corrupt file should still return a usable empty cache")
|
|
}
|
|
}
|
|
|
|
func TestSaveCache_AtomicWrite(t *testing.T) {
|
|
// Save, then check no .tmp files are left behind. Not a perfect test
|
|
// of atomicity — that's hard to exercise without injecting a crash —
|
|
// but catches the common failure where the tmp file is leaked.
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "cache.json")
|
|
if err := saveCache(path, newCache()); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
entries, err := os.ReadDir(dir)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if len(entries) != 1 {
|
|
names := make([]string, 0, len(entries))
|
|
for _, e := range entries {
|
|
names = append(names, e.Name())
|
|
}
|
|
t.Errorf("expected exactly cache.json in dir; got %v", names)
|
|
}
|
|
}
|