Ship 2 MR 5. Adds a CLI that measures MistralSimilarityClassifier against a labelled fixture: precision, recall, F1, accuracy, plus a confidence calibration table so we can tell whether "90% confident" verdicts are actually right 90% of the time. Usage: go run ./backend/cmd/discovery-eval -fixture ... -cache ... -threshold 0.8 -report eval-report.json. Structure - main.go: arg parsing + wiring (ai.Client, classifier, cache, metrics). The work happens in realMain() which returns an exit code — keeps defers running on error paths. - fixture.go: parses labelled pairs JSON. Fixture authors only need to fill in name/stadt/year; name_normalized falls back to name when omitted. - cache.go: file-backed map keyed by SimilarityPairKey + model string. Symmetric (a,b) == (b,a). Atomic writes (temp file + rename) so a crashed run cannot corrupt the cache. Corrupt-file load returns an empty usable cache and reports the parse error. - run.go: executes each pair through the classifier, populating the cache. Individual classify errors are downgraded to "not correct" and logged — the run always finishes so the operator sees whatever data is available. - metrics.go: confusion matrix, P/R/F1/accuracy, per-confidence-bucket calibration ([0-0.5), [0.5-0.75), [0.75-0.9), [0.9-1.0]). Prints human summary + surfaces highest-confidence mismatches first (most actionable for prompt iteration). Optional JSON report. - Threshold gate: -threshold N exits non-zero when F1<N. Default 0 (gating disabled until we have a baseline F1). Fixture: seeds 15 hand-crafted DACH-market pairs covering the edge cases we actually care about — umlaut drift (Straßburg/Strassburg), year difference on a recurring series, word-reordering, distinct events at the same venue, historical proper names (Striezelmarkt), same city with multiple distinct Christmas markets. Operator extends over time; each pair carries a `note` explaining the case it locks.
.gitignore adds .eval-cache.json and eval-report.json — neither should land in the repo. Tests cover metrics edge cases (all correct, imbalanced, no-positive-predictions-no-NaN, calibration bucket assignment, cache accounting, empty input) and cache behaviour (round-trip, symmetric lookup, model-scoped invalidation, missing/corrupt file handling, atomic-write leaves no temp files). Out of scope for MR 5: enrichment field accuracy (fuzzy text scoring is its own problem — tracked for a follow-up), CI wiring (needs a baseline F1 first).
86 lines
2.2 KiB
Go
86 lines
2.2 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
|
|
"marktvogt.de/backend/internal/domain/discovery/enrich"
|
|
)
|
|
|
|
// Result is one pair's eval outcome: the label, the verdict, and cache hit
// status (so the summary can report how much was actually paid for).
type Result struct {
	Pair      LabelledPair  // the labelled fixture pair that was evaluated
	Verdict   CachedVerdict // the classifier's verdict (fresh or from cache); on classify error, only Reason is set
	Correct   bool          // whether Verdict.Same matches the fixture's Same label (forced false on classify error)
	FromCache bool          // true when the verdict was served from the cache, i.e. no classifier call was made
}
|
|
|
|
// run executes the classifier against every pair in the fixture, populating
|
|
// the cache along the way. Never fails on individual classifier errors —
|
|
// those are reported as "not correct" with a prominent log line so the
|
|
// eval always finishes and reports whatever it has.
|
|
func run(ctx context.Context, classifier enrich.SimilarityClassifier, cache *Cache, fixture *Fixture, model string) ([]Result, error) {
|
|
results := make([]Result, 0, len(fixture.Pairs))
|
|
for i, p := range fixture.Pairs {
|
|
if err := ctx.Err(); err != nil {
|
|
return results, err
|
|
}
|
|
|
|
a := toSimRow(p.A)
|
|
b := toSimRow(p.B)
|
|
|
|
if v, ok := cache.Get(a, b, model); ok {
|
|
results = append(results, Result{
|
|
Pair: p,
|
|
Verdict: v,
|
|
Correct: v.Same == p.Same,
|
|
FromCache: true,
|
|
})
|
|
continue
|
|
}
|
|
|
|
verdict, err := classifier.Classify(ctx, a, b)
|
|
if err != nil {
|
|
slog.Warn("classify failed; counting as incorrect",
|
|
"pair_index", i, "a_name", p.A.Name, "b_name", p.B.Name, "error", err)
|
|
results = append(results, Result{
|
|
Pair: p,
|
|
Verdict: CachedVerdict{Reason: fmt.Sprintf("error: %v", err)},
|
|
Correct: false,
|
|
})
|
|
continue
|
|
}
|
|
cached := CachedVerdict{
|
|
Same: verdict.Same,
|
|
Confidence: verdict.Confidence,
|
|
Reason: verdict.Reason,
|
|
Model: verdict.Model,
|
|
}
|
|
cache.Put(a, b, model, cached)
|
|
results = append(results, Result{
|
|
Pair: p,
|
|
Verdict: cached,
|
|
Correct: cached.Same == p.Same,
|
|
})
|
|
}
|
|
return results, nil
|
|
}
|
|
|
|
// toSimRow maps the fixture shape to the classifier input. NameNormalized
|
|
// defaults to Name when the fixture author didn't bother — eval pairs are
|
|
// typically written with display names only.
|
|
func toSimRow(r PairRow) enrich.SimilarityRow {
|
|
nn := r.NameNormalized
|
|
if nn == "" {
|
|
nn = r.Name
|
|
}
|
|
return enrich.SimilarityRow{
|
|
Name: r.Name,
|
|
Stadt: r.Stadt,
|
|
Year: r.Year,
|
|
NameNormalized: nn,
|
|
}
|
|
}
|