From de1a3f6efb74de9999dc97dc9fe08213d1b13495 Mon Sep 17 00:00:00 2001 From: vikingowl Date: Sat, 18 Apr 2026 10:05:08 +0200 Subject: [PATCH] =?UTF-8?q?feat(discovery):=20validator=20=E2=80=94=20catc?= =?UTF-8?q?hes=20agent=20self-contradictions=20before=20insert?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass 0 agents produce schema-valid but semantically wrong output: markets claimed in the wrong bundesland, status 'bestaetigt' with a hinweis about Vorjahresdaten, etc. The schema alone can't catch these. This validator does, as a blocking gate before InsertDiscovered. Checks (Pass 0 scope): - bundesland_mismatch: agent's bundesland must equal bucket.region, with a light normalizer for CH 'Kanton X' prefix so Phase B can refine the Schweiz seed without a signature break. - status_hinweis_inconsistent: if agent_status=='bestaetigt' AND hinweis contains 'vorjahr' (case-insensitive), the agent contradicted itself. Errors drop the market (counted as summary.validation_failed); warnings would get merged into hinweis — no warning-level checks exist yet at Pass 0 scope, placeholder reserved. Phase B (research agent) checks will extend this file: oeffnungszeiten dedup, start_datum window coverage, full quellen liveness for Pass 1. 
--- backend/internal/domain/discovery/service.go | 22 ++++ backend/internal/domain/discovery/validate.go | 89 +++++++++++++++ .../domain/discovery/validate_test.go | 106 ++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 backend/internal/domain/discovery/validate.go create mode 100644 backend/internal/domain/discovery/validate_test.go diff --git a/backend/internal/domain/discovery/service.go b/backend/internal/domain/discovery/service.go index 3f484bb..c9f89ee 100644 --- a/backend/internal/domain/discovery/service.go +++ b/backend/internal/domain/discovery/service.go @@ -62,6 +62,7 @@ type TickSummary struct { Errors int `json:"errors"` RateLimited int `json:"rate_limited"` LinkCheckFailed int `json:"link_check_failed"` + ValidationFailed int `json:"validation_failed"` } // Tick picks N stale buckets and runs Pass 0 for each, writing net-new discoveries. @@ -230,6 +231,18 @@ func (s *Service) processBucketResponse(ctx context.Context, b Bucket, resp Pass NameNormalized: nameNorm, MatchedSeriesID: matchedSeriesID, } + + // Semantic validation — catches agent self-contradictions that the + // schema alone cannot. Errors drop the market; warnings would be + // appended to hinweis (none defined yet at Pass 0 scope). + issues := ValidateForInsert(dm, b) + if HasErrors(issues) { + slog.InfoContext(ctx, "validation failed; skipping market", + "markt", m.MarktName, "stadt", m.Stadt, "issues", formatIssues(issues)) + summary.ValidationFailed++ + continue + } + if _, err := s.repo.InsertDiscovered(ctx, dm); err != nil { slog.WarnContext(ctx, "insert discovered", "error", err) continue @@ -239,6 +252,15 @@ func (s *Service) processBucketResponse(ctx context.Context, b Bucket, resp Pass return summary } +// formatIssues produces a compact log-friendly summary of validation issues. 
+func formatIssues(issues []Issue) string {
+	labels := make([]string, len(issues))
+	for idx := range issues {
+		labels[idx] = string(issues[idx].Severity) + ":" + issues[idx].Code
+	}
+	return strings.Join(labels, ",")
+}
+
 func parseOptionalDate(s string) *time.Time {
 	if s == "" {
 		return nil
diff --git a/backend/internal/domain/discovery/validate.go b/backend/internal/domain/discovery/validate.go
new file mode 100644
index 0000000..3653758
--- /dev/null
+++ b/backend/internal/domain/discovery/validate.go
@@ -0,0 +1,89 @@
+// Package discovery validation — semantic checks run on agent output before
+// it reaches the admin queue. LinkChecker owns link liveness; this file
+// covers the contradictions an agent cannot detect in its own answer:
+// a field that disagrees with the bucket it was requested for, or a status
+// that disagrees with the accompanying hinweis.
+//
+// Pass 1/2 checks (oeffnungszeiten dedup, start_datum coverage, etc.) will
+// live in this same file once the Phase B research redesign lands; see
+// docs/superpowers/plans/2026-04-18-phase-a-pass0-halbmonat.md.
+package discovery
+
+import (
+	"fmt"
+	"strings"
+)
+
+// Severity is the level of a validation finding. An error blocks the insert;
+// a warning is intended for the hinweis and does not drop the market.
+type Severity string
+
+const (
+	SeverityError   Severity = "error"
+	SeverityWarning Severity = "warning"
+)
+
+// Issue describes one validation finding. Code is a stable identifier for
+// the UI to key off of; Message is the human-readable explanation.
+type Issue struct {
+	Severity Severity
+	Code     string
+	Message  string
+	Field    string // optional — name of the offending field, for UI targeting
+}
+
+// ValidateForInsert checks a Pass 0 DiscoveredMarket against the bucket it came
+// from, before it hits the queue. Current checks:
+//
+//   - bundesland_mismatch: m.Bundesland does not equal b.Region (CH "Kanton X"
+//     prefix is normalized).
+//   - status_hinweis_inconsistent: AgentStatus=="bestaetigt" AND hinweis
+//     mentions "vorjahr" — the agent contradicted itself.
+//
+// Returns nil when clean. A blank (empty or whitespace-only) bundesland is skipped.
+func ValidateForInsert(m DiscoveredMarket, b Bucket) []Issue {
+	var issues []Issue
+	// Trim before the emptiness test: regionsEqual ignores surrounding whitespace too.
+	if strings.TrimSpace(m.Bundesland) != "" && !regionsEqual(m.Bundesland, b.Region) {
+		issues = append(issues, Issue{
+			Severity: SeverityError,
+			Code:     "bundesland_mismatch",
+			Message:  fmt.Sprintf("bundesland %q does not match bucket region %q", m.Bundesland, b.Region),
+			Field:    "bundesland",
+		})
+	}
+
+	if m.AgentStatus == "bestaetigt" && strings.Contains(strings.ToLower(m.Hinweis), "vorjahr") {
+		issues = append(issues, Issue{
+			Severity: SeverityError,
+			Code:     "status_hinweis_inconsistent",
+			Message:  "status is 'bestaetigt' but hinweis mentions 'vorjahr'",
+			Field:    "agent_status",
+		})
+	}
+
+	return issues
+}
+
+// HasErrors returns true if any issue has severity=error.
+func HasErrors(issues []Issue) bool {
+	for _, i := range issues {
+		if i.Severity == SeverityError {
+			return true
+		}
+	}
+	return false
+}
+
+// regionsEqual compares two region strings tolerantly: case- and
+// whitespace-insensitive, ignoring a leading "Kanton " prefix used for
+// Swiss cantons in the agent output (bucket stores just "Schweiz" for v1,
+// but once CH is split by kanton the comparison needs to strip the prefix).
+func regionsEqual(a, b string) bool {
+	norm := func(s string) string {
+		// Collapse whitespace runs first so "Kanton  Zürich" still loses its prefix.
+		s = strings.Join(strings.Fields(strings.ToLower(s)), " ")
+		return strings.TrimPrefix(s, "kanton ")
+	}
+	return norm(a) == norm(b)
+}
diff --git a/backend/internal/domain/discovery/validate_test.go b/backend/internal/domain/discovery/validate_test.go
new file mode 100644
index 0000000..8a23cc4
--- /dev/null
+++ b/backend/internal/domain/discovery/validate_test.go
@@ -0,0 +1,106 @@
+package discovery
+
+import "testing"
+
+func TestValidateForInsert(t *testing.T) {
+	baseBucket := Bucket{Land: "Deutschland", Region: "Bayern", YearMonth: "2026-09", Halbmonat: "H1"}
+
+	tests := []struct {
+		name       string
+		m          DiscoveredMarket
+		b          Bucket
+		wantCodes  []string
+		wantErrors bool
+	}{
+		{
+			name:      "clean",
+			m:         DiscoveredMarket{Bundesland: "Bayern", AgentStatus: "bestaetigt"},
+			b:         baseBucket,
+			wantCodes: nil,
+		},
+		{
+			name:       "bundesland mismatch",
+			m:          DiscoveredMarket{Bundesland: "Baden-Württemberg", AgentStatus: "bestaetigt"},
+			b:          baseBucket,
+			wantCodes:  []string{"bundesland_mismatch"},
+			wantErrors: true,
+		},
+		{
+			name:      "bundesland empty is not an error",
+			m:         DiscoveredMarket{Bundesland: "", AgentStatus: "bestaetigt"},
+			b:         baseBucket,
+			wantCodes: nil,
+		},
+		{
+			name: "status bestaetigt but hinweis mentions vorjahr",
+			m: DiscoveredMarket{
+				Bundesland:  "Bayern",
+				AgentStatus: "bestaetigt",
+				Hinweis:     "Termin aus Vorjahr, noch nicht bestaetigt",
+			},
+			b:          baseBucket,
+			wantCodes:  []string{"status_hinweis_inconsistent"},
+			wantErrors: true,
+		},
+		{
+			name: "vorjahr hinweis with vorjahr_unbestaetigt status is fine",
+			m: DiscoveredMarket{
+				Bundesland:  "Bayern",
+				AgentStatus: "vorjahr_unbestaetigt",
+				Hinweis:     "Aus dem Vorjahr uebernommen",
+			},
+			b:         baseBucket,
+			wantCodes: nil,
+		},
+		{
+			name:      "kanton prefix is normalized when bucket is just the kanton name",
+			m:         DiscoveredMarket{Bundesland: "Kanton Zürich", AgentStatus: "bestaetigt"},
+			b:         Bucket{Region: "Zürich"},
+			wantCodes: nil,
+		},
+		
{
+			name: "multiple errors coexist",
+			m: DiscoveredMarket{
+				Bundesland:  "Salzburg",
+				AgentStatus: "bestaetigt",
+				Hinweis:     "Termin aus dem Vorjahr uebernommen",
+			},
+			b:          baseBucket,
+			wantCodes:  []string{"bundesland_mismatch", "status_hinweis_inconsistent"},
+			wantErrors: true,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			issues := ValidateForInsert(tc.m, tc.b)
+			gotCodes := make([]string, 0, len(issues))
+			for _, i := range issues {
+				gotCodes = append(gotCodes, i.Code)
+			}
+			if !equalStringSets(gotCodes, tc.wantCodes) {
+				t.Errorf("codes = %v, want %v", gotCodes, tc.wantCodes)
+			}
+			if HasErrors(issues) != tc.wantErrors {
+				t.Errorf("HasErrors = %v, want %v", HasErrors(issues), tc.wantErrors)
+			}
+		})
+	}
+}
+
+// equalStringSets reports whether a and b hold the same strings, counting duplicates, in any order.
+func equalStringSets(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	counts := make(map[string]int, len(a))
+	for _, s := range a {
+		counts[s]++
+	}
+	for _, s := range b {
+		if counts[s]--; counts[s] < 0 {
+			return false
+		}
+	}
+	return true
+}