feat(discovery): validator — catches agent self-contradictions before insert
Pass 0 agents produce schema-valid but semantically wrong output: markets claimed in the wrong bundesland, status 'bestaetigt' with a hinweis about Vorjahresdaten, etc. The schema alone can't catch these. This validator does, acting as a blocking gate before InsertDiscovered.

Checks (Pass 0 scope):
- bundesland_mismatch: the agent's bundesland must equal bucket.region, with a light normalizer for the CH 'Kanton X' prefix so Phase B can refine the Schweiz seed without a signature break.
- status_hinweis_inconsistent: if agent_status == 'bestaetigt' AND hinweis contains 'vorjahr' (case-insensitive), the agent contradicted itself.

Errors drop the market (counted as summary.validation_failed). Warnings would get merged into hinweis, but no warning-level checks exist yet at Pass 0 scope; the severity is reserved as a placeholder.

Phase B (research agent) checks will extend this file: oeffnungszeiten dedup, start_datum window coverage, full quellen liveness for Pass 1.
This commit is contained in:
@@ -62,6 +62,7 @@ type TickSummary struct {
|
||||
Errors int `json:"errors"`
|
||||
RateLimited int `json:"rate_limited"`
|
||||
LinkCheckFailed int `json:"link_check_failed"`
|
||||
ValidationFailed int `json:"validation_failed"`
|
||||
}
|
||||
|
||||
// Tick picks N stale buckets and runs Pass 0 for each, writing net-new discoveries.
|
||||
@@ -230,6 +231,18 @@ func (s *Service) processBucketResponse(ctx context.Context, b Bucket, resp Pass
|
||||
NameNormalized: nameNorm,
|
||||
MatchedSeriesID: matchedSeriesID,
|
||||
}
|
||||
|
||||
// Semantic validation — catches agent self-contradictions that the
|
||||
// schema alone cannot. Errors drop the market; warnings would be
|
||||
// appended to hinweis (none defined yet at Pass 0 scope).
|
||||
issues := ValidateForInsert(dm, b)
|
||||
if HasErrors(issues) {
|
||||
slog.InfoContext(ctx, "validation failed; skipping market",
|
||||
"markt", m.MarktName, "stadt", m.Stadt, "issues", formatIssues(issues))
|
||||
summary.ValidationFailed++
|
||||
continue
|
||||
}
|
||||
|
||||
if _, err := s.repo.InsertDiscovered(ctx, dm); err != nil {
|
||||
slog.WarnContext(ctx, "insert discovered", "error", err)
|
||||
continue
|
||||
@@ -239,6 +252,15 @@ func (s *Service) processBucketResponse(ctx context.Context, b Bucket, resp Pass
|
||||
return summary
|
||||
}
|
||||
|
||||
// formatIssues produces a compact log-friendly summary of validation issues.
|
||||
func formatIssues(issues []Issue) string {
|
||||
parts := make([]string, 0, len(issues))
|
||||
for _, i := range issues {
|
||||
parts = append(parts, string(i.Severity)+":"+i.Code)
|
||||
}
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
|
||||
func parseOptionalDate(s string) *time.Time {
|
||||
if s == "" {
|
||||
return nil
|
||||
|
||||
89
backend/internal/domain/discovery/validate.go
Normal file
89
backend/internal/domain/discovery/validate.go
Normal file
@@ -0,0 +1,89 @@
|
||||
// Package discovery validation — semantic checks applied to agent output
|
||||
// before it lands in the admin queue. Link liveness is handled separately
|
||||
// by LinkChecker; this file catches self-inconsistencies the agent cannot
|
||||
// verify on its own (bucket-vs-field mismatches, status-vs-hinweis
|
||||
// contradictions, malformed ranges).
|
||||
//
|
||||
// Pass 1/2 checks (oeffnungszeiten dedup, start_datum coverage, etc.) will
|
||||
// live in this same file once the Phase B research redesign lands; see
|
||||
// docs/superpowers/plans/2026-04-18-phase-a-pass0-halbmonat.md.
|
||||
package discovery
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Severity is the level attached to a validation finding. An error blocks
// the insert; a warning is surfaced in the hinweis for admin attention but
// does not drop the market.
type Severity string

// Known severities.
const (
	// SeverityError marks a finding that blocks the insert.
	SeverityError Severity = "error"
	// SeverityWarning marks a finding that is reported but does not block.
	SeverityWarning Severity = "warning"
)
|
||||
|
||||
// Issue is a single validation finding. Code is a stable identifier the UI
|
||||
// can key off of; Message is human-readable.
|
||||
type Issue struct {
|
||||
Severity Severity
|
||||
Code string
|
||||
Message string
|
||||
Field string // optional — name of the offending field, for UI targeting
|
||||
}
|
||||
|
||||
// ValidateForInsert checks a Pass 0 DiscoveredMarket against the bucket it came
|
||||
// from, before it hits the queue. Current checks:
|
||||
//
|
||||
// - bundesland_mismatch: m.Bundesland does not equal b.Region (CH "Kanton X"
|
||||
// prefix is normalized).
|
||||
// - status_hinweis_inconsistent: AgentStatus=="bestaetigt" AND hinweis
|
||||
// mentions "vorjahr" — the agent contradicted itself.
|
||||
//
|
||||
// Returns nil when clean.
|
||||
func ValidateForInsert(m DiscoveredMarket, b Bucket) []Issue {
|
||||
var issues []Issue
|
||||
|
||||
if m.Bundesland != "" && !regionsEqual(m.Bundesland, b.Region) {
|
||||
issues = append(issues, Issue{
|
||||
Severity: SeverityError,
|
||||
Code: "bundesland_mismatch",
|
||||
Message: fmt.Sprintf("bundesland %q does not match bucket region %q", m.Bundesland, b.Region),
|
||||
Field: "bundesland",
|
||||
})
|
||||
}
|
||||
|
||||
if m.AgentStatus == "bestaetigt" && strings.Contains(strings.ToLower(m.Hinweis), "vorjahr") {
|
||||
issues = append(issues, Issue{
|
||||
Severity: SeverityError,
|
||||
Code: "status_hinweis_inconsistent",
|
||||
Message: "status is 'bestaetigt' but hinweis mentions 'vorjahr'",
|
||||
Field: "agent_status",
|
||||
})
|
||||
}
|
||||
|
||||
return issues
|
||||
}
|
||||
|
||||
// HasErrors returns true if any issue has severity=error.
|
||||
func HasErrors(issues []Issue) bool {
|
||||
for _, i := range issues {
|
||||
if i.Severity == SeverityError {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// regionsEqual reports whether a and b name the same region, comparing
// tolerantly:
//
//   - case-insensitively,
//   - ignoring leading/trailing whitespace and treating any run of
//     whitespace inside the name as a single space,
//   - ignoring a leading "Kanton " prefix, which the agent output uses for
//     Swiss cantons (the bucket stores just "Schweiz" for v1, but once CH is
//     split by kanton the comparison needs to strip the prefix).
func regionsEqual(a, b string) bool {
	norm := func(s string) string {
		// Collapse whitespace before stripping the prefix; otherwise
		// "Kanton  Zürich" (double space, tab, NBSP) would leave a stray
		// leading space behind and compare unequal to "Zürich".
		s = strings.Join(strings.Fields(strings.ToLower(s)), " ")
		return strings.TrimPrefix(s, "kanton ")
	}
	return norm(a) == norm(b)
}
|
||||
106
backend/internal/domain/discovery/validate_test.go
Normal file
106
backend/internal/domain/discovery/validate_test.go
Normal file
@@ -0,0 +1,106 @@
|
||||
package discovery
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestValidateForInsert(t *testing.T) {
|
||||
baseBucket := Bucket{Land: "Deutschland", Region: "Bayern", YearMonth: "2026-09", Halbmonat: "H1"}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
m DiscoveredMarket
|
||||
b Bucket
|
||||
wantCodes []string
|
||||
wantErrors bool
|
||||
}{
|
||||
{
|
||||
name: "clean",
|
||||
m: DiscoveredMarket{Bundesland: "Bayern", AgentStatus: "bestaetigt"},
|
||||
b: baseBucket,
|
||||
wantCodes: nil,
|
||||
},
|
||||
{
|
||||
name: "bundesland mismatch",
|
||||
m: DiscoveredMarket{Bundesland: "Baden-Württemberg", AgentStatus: "bestaetigt"},
|
||||
b: baseBucket,
|
||||
wantCodes: []string{"bundesland_mismatch"},
|
||||
wantErrors: true,
|
||||
},
|
||||
{
|
||||
name: "bundesland empty is not an error",
|
||||
m: DiscoveredMarket{Bundesland: "", AgentStatus: "bestaetigt"},
|
||||
b: baseBucket,
|
||||
wantCodes: nil,
|
||||
},
|
||||
{
|
||||
name: "status bestaetigt but hinweis mentions vorjahr",
|
||||
m: DiscoveredMarket{
|
||||
Bundesland: "Bayern",
|
||||
AgentStatus: "bestaetigt",
|
||||
Hinweis: "Termin aus Vorjahr, noch nicht bestaetigt",
|
||||
},
|
||||
b: baseBucket,
|
||||
wantCodes: []string{"status_hinweis_inconsistent"},
|
||||
wantErrors: true,
|
||||
},
|
||||
{
|
||||
name: "vorjahr hinweis with vorjahr_unbestaetigt status is fine",
|
||||
m: DiscoveredMarket{
|
||||
Bundesland: "Bayern",
|
||||
AgentStatus: "vorjahr_unbestaetigt",
|
||||
Hinweis: "Aus dem Vorjahr uebernommen",
|
||||
},
|
||||
b: baseBucket,
|
||||
wantCodes: nil,
|
||||
},
|
||||
{
|
||||
name: "kanton prefix is normalized when bucket is just the kanton name",
|
||||
m: DiscoveredMarket{Bundesland: "Kanton Zürich", AgentStatus: "bestaetigt"},
|
||||
b: Bucket{Region: "Zürich"},
|
||||
wantCodes: nil,
|
||||
},
|
||||
{
|
||||
name: "multiple errors coexist",
|
||||
m: DiscoveredMarket{
|
||||
Bundesland: "Salzburg",
|
||||
AgentStatus: "bestaetigt",
|
||||
Hinweis: "Termin aus dem Vorjahr uebernommen",
|
||||
},
|
||||
b: baseBucket,
|
||||
wantCodes: []string{"bundesland_mismatch", "status_hinweis_inconsistent"},
|
||||
wantErrors: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
issues := ValidateForInsert(tc.m, tc.b)
|
||||
gotCodes := make([]string, 0, len(issues))
|
||||
for _, i := range issues {
|
||||
gotCodes = append(gotCodes, i.Code)
|
||||
}
|
||||
if !equalStringSets(gotCodes, tc.wantCodes) {
|
||||
t.Errorf("codes = %v, want %v", gotCodes, tc.wantCodes)
|
||||
}
|
||||
if HasErrors(issues) != tc.wantErrors {
|
||||
t.Errorf("HasErrors = %v, want %v", HasErrors(issues), tc.wantErrors)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// equalStringSets reports whether a and b hold the same strings with the
// same multiplicities, regardless of order. A nil slice and an empty slice
// compare equal.
func equalStringSets(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}

	// Tally every element of a, then consume the tally with b.
	remaining := make(map[string]int, len(a))
	for i := range a {
		remaining[a[i]]++
	}
	for i := range b {
		left := remaining[b[i]]
		if left == 0 {
			// b holds a string (or one more copy of it) that a does not.
			return false
		}
		remaining[b[i]] = left - 1
	}
	// Equal lengths and no shortfall mean the tallies match exactly.
	return true
}
|
||||
Reference in New Issue
Block a user