feat(discovery): validator — catches agent self-contradictions before insert

Pass 0 agents produce schema-valid but semantically wrong output: markets
claimed in the wrong bundesland, status 'bestaetigt' with a hinweis about
Vorjahresdaten, etc. The schema alone can't catch these. This validator
does, as a blocking gate before InsertDiscovered.

Checks (Pass 0 scope):
- bundesland_mismatch: agent's bundesland must equal bucket.region, with
  a light normalizer for CH 'Kanton X' prefix so Phase B can refine the
  Schweiz seed without a signature break.
- status_hinweis_inconsistent: if agent_status=='bestaetigt' AND hinweis
  contains 'vorjahr' (case-insensitive), the agent contradicted itself.

Errors drop the market (counted as summary.validation_failed). Warnings
would be merged into hinweis, but no warning-level checks exist yet at
Pass 0 scope; the warning severity is reserved as a placeholder.

Phase B (research agent) checks will extend this file: oeffnungszeiten
dedup, start_datum window coverage, full quellen liveness for Pass 1.
This commit is contained in:
2026-04-18 10:05:08 +02:00
parent fda30de158
commit de1a3f6efb
3 changed files with 217 additions and 0 deletions

View File

@@ -62,6 +62,7 @@ type TickSummary struct {
Errors int `json:"errors"`
RateLimited int `json:"rate_limited"`
LinkCheckFailed int `json:"link_check_failed"`
ValidationFailed int `json:"validation_failed"`
}
// Tick picks N stale buckets and runs Pass 0 for each, writing net-new discoveries.
@@ -230,6 +231,18 @@ func (s *Service) processBucketResponse(ctx context.Context, b Bucket, resp Pass
NameNormalized: nameNorm,
MatchedSeriesID: matchedSeriesID,
}
// Semantic validation — catches agent self-contradictions that the
// schema alone cannot. Errors drop the market; warnings would be
// appended to hinweis (none defined yet at Pass 0 scope).
issues := ValidateForInsert(dm, b)
if HasErrors(issues) {
slog.InfoContext(ctx, "validation failed; skipping market",
"markt", m.MarktName, "stadt", m.Stadt, "issues", formatIssues(issues))
summary.ValidationFailed++
continue
}
if _, err := s.repo.InsertDiscovered(ctx, dm); err != nil {
slog.WarnContext(ctx, "insert discovered", "error", err)
continue
@@ -239,6 +252,15 @@ func (s *Service) processBucketResponse(ctx context.Context, b Bucket, resp Pass
return summary
}
// formatIssues renders validation issues as a comma-separated list of
// "severity:code" pairs so they fit into a single log attribute. A nil or
// empty slice yields the empty string.
func formatIssues(issues []Issue) string {
	var sb strings.Builder
	for idx := range issues {
		if idx > 0 {
			sb.WriteByte(',')
		}
		sb.WriteString(string(issues[idx].Severity))
		sb.WriteByte(':')
		sb.WriteString(issues[idx].Code)
	}
	return sb.String()
}
func parseOptionalDate(s string) *time.Time {
if s == "" {
return nil

View File

@@ -0,0 +1,89 @@
// Package discovery validation — semantic checks applied to agent output
// before it lands in the admin queue. Link liveness is handled separately
// by LinkChecker; this file catches self-inconsistencies the agent cannot
// verify on its own: bucket-vs-field mismatches and status-vs-hinweis
// contradictions (malformed ranges are not checked yet).
//
// Pass 1/2 checks (oeffnungszeiten dedup, start_datum coverage, etc.) will
// live in this same file once the Phase B research redesign lands; see
// docs/superpowers/plans/2026-04-18-phase-a-pass0-halbmonat.md.
package discovery
import (
"fmt"
"strings"
)
// Severity classifies a validation finding. Errors block the insert; warnings
// are surfaced in the hinweis for admin attention but do not drop the market.
type Severity string
const (
// SeverityError marks a blocking finding: HasErrors returns true as soon as
// one issue carries this severity.
SeverityError Severity = "error"
// SeverityWarning marks a non-blocking finding. ValidateForInsert does not
// emit any warning-level issue yet.
SeverityWarning Severity = "warning"
)
// Issue is a single validation finding. Code is a stable identifier the UI
// can key off of; Message is human-readable.
type Issue struct {
// Severity tells whether the finding is blocking; HasErrors looks only at
// this field.
Severity Severity
// Code is the machine-readable identifier, e.g. "bundesland_mismatch" or
// "status_hinweis_inconsistent".
Code string
// Message is the human-readable description of what was found.
Message string
Field string // optional — name of the offending field, for UI targeting
}
// ValidateForInsert runs the semantic Pass 0 checks on a DiscoveredMarket
// against the bucket that produced it. Findings are reported in a fixed
// order:
//
//   - bundesland_mismatch: m.Bundesland is non-empty and differs from
//     b.Region according to regionsEqual (which tolerates case, surrounding
//     whitespace and a leading "Kanton " prefix). An empty bundesland is
//     not flagged.
//   - status_hinweis_inconsistent: AgentStatus is exactly "bestaetigt" while
//     the hinweis contains "vorjahr" (case-insensitive).
//
// Both findings carry SeverityError. The result is nil when no check fires.
func ValidateForInsert(m DiscoveredMarket, b Bucket) []Issue {
	var found []Issue

	// Every check defined so far is blocking, so the helper always records
	// an error-level issue.
	fail := func(code, field, msg string) {
		found = append(found, Issue{
			Severity: SeverityError,
			Code:     code,
			Message:  msg,
			Field:    field,
		})
	}

	if m.Bundesland != "" {
		if !regionsEqual(m.Bundesland, b.Region) {
			fail("bundesland_mismatch", "bundesland",
				fmt.Sprintf("bundesland %q does not match bucket region %q", m.Bundesland, b.Region))
		}
	}

	if m.AgentStatus == "bestaetigt" {
		// A confirmed status next to a previous-year remark is a
		// self-contradiction in the agent output.
		hint := strings.ToLower(m.Hinweis)
		if strings.Contains(hint, "vorjahr") {
			fail("status_hinweis_inconsistent", "agent_status",
				"status is 'bestaetigt' but hinweis mentions 'vorjahr'")
		}
	}

	return found
}
// HasErrors reports whether at least one of the given issues is blocking,
// i.e. has Severity equal to SeverityError. A nil or empty slice yields false.
func HasErrors(issues []Issue) bool {
	blocking := false
	for idx := 0; idx < len(issues) && !blocking; idx++ {
		blocking = issues[idx].Severity == SeverityError
	}
	return blocking
}
// regionsEqual reports whether two region names refer to the same region.
// The comparison is tolerant of how the agent formats its output:
//
//   - case is ignored,
//   - leading and trailing whitespace is ignored,
//   - a leading "Kanton " prefix (Swiss cantons in the agent output) is
//     dropped, so "Kanton Zürich" matches a bucket region "Zürich",
//   - runs of whitespace left after the prefix or inside the name are
//     collapsed to a single space, so "Kanton  Zürich" still matches "Zürich".
//
// The bucket stores just "Schweiz" for v1; the prefix handling matters once
// CH is split by kanton.
func regionsEqual(a, b string) bool {
	norm := func(s string) string {
		s = strings.ToLower(strings.TrimSpace(s))
		s = strings.TrimPrefix(s, "kanton ")
		// Collapse after stripping the prefix: anything that compared equal
		// before this step still does, while stray extra spaces no longer
		// cause a false mismatch that would drop the market.
		return strings.Join(strings.Fields(s), " ")
	}
	return norm(a) == norm(b)
}

View File

@@ -0,0 +1,106 @@
package discovery
import "testing"
// TestValidateForInsert is a table-driven check of ValidateForInsert. Each
// case pairs a market with a bucket and states which issue codes must come
// back (order-insensitive) and whether HasErrors must report a blocking
// finding for them.
func TestValidateForInsert(t *testing.T) {
	bayern := Bucket{Land: "Deutschland", Region: "Bayern", YearMonth: "2026-09", Halbmonat: "H1"}

	type testCase struct {
		name       string
		m          DiscoveredMarket
		b          Bucket
		wantCodes  []string
		wantErrors bool
	}

	cases := []testCase{
		{
			name: "clean",
			m:    DiscoveredMarket{Bundesland: "Bayern", AgentStatus: "bestaetigt"},
			b:    bayern,
		},
		{
			name:       "bundesland mismatch",
			m:          DiscoveredMarket{Bundesland: "Baden-Württemberg", AgentStatus: "bestaetigt"},
			b:          bayern,
			wantCodes:  []string{"bundesland_mismatch"},
			wantErrors: true,
		},
		{
			name: "bundesland empty is not an error",
			m:    DiscoveredMarket{Bundesland: "", AgentStatus: "bestaetigt"},
			b:    bayern,
		},
		{
			name: "status bestaetigt but hinweis mentions vorjahr",
			m: DiscoveredMarket{
				Bundesland:  "Bayern",
				AgentStatus: "bestaetigt",
				Hinweis:     "Termin aus Vorjahr, noch nicht bestaetigt",
			},
			b:          bayern,
			wantCodes:  []string{"status_hinweis_inconsistent"},
			wantErrors: true,
		},
		{
			name: "vorjahr hinweis with vorjahr_unbestaetigt status is fine",
			m: DiscoveredMarket{
				Bundesland:  "Bayern",
				AgentStatus: "vorjahr_unbestaetigt",
				Hinweis:     "Aus dem Vorjahr uebernommen",
			},
			b: bayern,
		},
		{
			name: "kanton prefix is normalized when bucket is just the kanton name",
			m:    DiscoveredMarket{Bundesland: "Kanton Zürich", AgentStatus: "bestaetigt"},
			b:    Bucket{Region: "Zürich"},
		},
		{
			name: "multiple errors coexist",
			m: DiscoveredMarket{
				Bundesland:  "Salzburg",
				AgentStatus: "bestaetigt",
				Hinweis:     "Termin aus dem Vorjahr uebernommen",
			},
			b:          bayern,
			wantCodes:  []string{"bundesland_mismatch", "status_hinweis_inconsistent"},
			wantErrors: true,
		},
	}

	for _, tt := range cases {
		t.Run(tt.name, func(t *testing.T) {
			found := ValidateForInsert(tt.m, tt.b)

			// Reduce the findings to their codes; only the codes are compared.
			var gotCodes []string
			for idx := range found {
				gotCodes = append(gotCodes, found[idx].Code)
			}

			if !equalStringSets(gotCodes, tt.wantCodes) {
				t.Errorf("codes = %v, want %v", gotCodes, tt.wantCodes)
			}
			if blocked := HasErrors(found); blocked != tt.wantErrors {
				t.Errorf("HasErrors = %v, want %v", blocked, tt.wantErrors)
			}
		})
	}
}
// equalStringSets reports whether a and b contain the same strings with the
// same multiplicities, regardless of order. nil and empty slices compare
// equal to each other.
func equalStringSets(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}

	// Count every string of a, then consume the counts with b; running out
	// of a count means b holds a string more often than a does.
	remaining := make(map[string]int, len(a))
	for idx := range a {
		remaining[a[idx]] += 1
	}
	for idx := range b {
		key := b[idx]
		if remaining[key] == 0 {
			return false
		}
		remaining[key]--
	}
	return true
}