fix(discovery): review follow-ups — konfidenz signal, end-date default, determinism, rate-limit=0

- Service.Crawl derives Konfidenz from merged source count + rank instead of
  hardcoded "mittel". Two+ sources -> "hoch"; single curated source ->
  "mittel"; single suendenfrei (prose regex) -> "niedrig".
- New AgentStatus constant "crawler" replaces "bestaetigt" for crawler rows
  so the validator's agent-specific rules don't fire on them and operators
  can filter the queue by origin. Added Konfidenz* and AgentStatus*
  constants to model.go.
- Default EndDatum to StartDatum when a source reports a single date
  (festival_alarm one-day events, suendenfrei lines without a "bis" range).
  Avoids Service.Accept rejecting nil-EndDatum rows.
- Sort PerSource names before assembling raw events for merge — makes
  merged output order deterministic across runs.
- NewHandler: manualRateLimitPerHour <= 0 now explicitly disables the
  rate limit (previously silently floored to 1/hour). Documented behavior
  for all three cases in a constructor comment.
- Added four new tests for Service.Crawl failure/quality paths:
  LinkCheckFailed, DedupedQueue, EndDatum default, multi-source Konfidenz.
- Documented the substring-match approximation in
  cmd/discovery-compare/main.go's groupCrawlerByBucket — diagnostic-only,
  not safe for production routing.
commit 7c8a8c6419 (parent c5a4bc441c)
Date: 2026-04-18 16:35:26 +02:00
5 changed files with 234 additions and 12 deletions

View File

@@ -102,6 +102,13 @@ func parseBuckets(s string) ([]sampleBucket, error) {
return out, nil
}
// groupCrawlerByBucket assigns merged crawler events to sample buckets.
//
// NOTE: this is an approximation for the diagnostic CLI only — not for
// production dedup. The Bundesland match uses `strings.Contains` so a merged
// event with Bundesland="Bayern" will join a bucket with Region="Bay" (or
// "ern"). Good enough to compare coverage between the crawler and Mistral
// Pass 0 at bucket granularity; not safe for business-logic routing.
func groupCrawlerByBucket(merged []crawler.MergedEvent, buckets []sampleBucket) map[string][]crawler.MergedEvent {
result := make(map[string][]crawler.MergedEvent)
for _, b := range buckets {

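A minimal standalone sketch of the false positive the NOTE describes (values hypothetical; argument order taken from the NOTE's Bayern/"Bay" example):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// The diagnostic match asks strings.Contains(bundesland, region), so
	// truncated or accidental substrings match alongside exact names.
	bundesland := "Bayern"
	for _, region := range []string{"Bayern", "Bay", "ern", "Sachsen"} {
		fmt.Printf("%-7s matches: %v\n", region, strings.Contains(bundesland, region))
	}
	// "Bayern", "Bay", and "ern" all report true; only "Sachsen" does not.
}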
View File

@@ -21,9 +21,23 @@ type Handler struct {
crawlRateLimit time.Duration
}
// NewHandler constructs a Handler. manualRateLimitPerHour controls how
// frequently the admin-session /crawl-manual endpoint may be invoked:
//
// <= 0 : disabled (no rate limit — every request is allowed)
// 1 : 1 request per hour (default)
// > 1 : N requests per hour, evenly spaced
//
// The bearer-token /crawl endpoint always bypasses this limit via the
// `crawl_bypass_rate_limit` gin-context flag set by its route handler.
func NewHandler(s *Service, manualRateLimitPerHour int) *Handler {
- rl := time.Hour
- if manualRateLimitPerHour > 1 {
+ var rl time.Duration
+ switch {
+ case manualRateLimitPerHour <= 0:
+ 	rl = 0 // sentinel: rate limiting disabled
+ case manualRateLimitPerHour == 1:
+ 	rl = time.Hour
+ default:
+ 	rl = time.Hour / time.Duration(manualRateLimitPerHour)
+ }
return &Handler{service: s, crawlRateLimit: rl}
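For reference, the intervals the new switch yields for a few illustrative calls (svc stands in for an already-constructed *Service):

	h := NewHandler(svc, 0) // crawlRateLimit = 0                -> limiter disabled
	h = NewHandler(svc, 1)  // crawlRateLimit = time.Hour        -> one request/hour
	h = NewHandler(svc, 6)  // crawlRateLimit = 10 * time.Minute -> evenly spaced
	_ = h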
@@ -52,7 +66,7 @@ func (h *Handler) Crawl(c *gin.Context) {
}
defer h.crawlMu.Unlock()
- if _, bypass := c.Get("crawl_bypass_rate_limit"); !bypass {
+ if _, bypass := c.Get("crawl_bypass_rate_limit"); !bypass && h.crawlRateLimit > 0 {
if since := time.Since(h.crawlLastManual); since < h.crawlRateLimit {
retryIn := (h.crawlRateLimit - since).Seconds()
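// int(retryIn) truncates, so the +1 below rounds the remaining wait up to a
// whole second and clients never retry before the window reopens.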
c.Header("Retry-After", fmt.Sprint(int(retryIn)+1))

View File

@@ -107,6 +107,27 @@ const (
StatusRejected = "rejected"
)
// AgentStatus constants.
// Mistral Pass 0 produces: bestaetigt | unklar | vorjahr_unbestaetigt | abgesagt.
// The crawler uses its own sentinel value so the validator's agent-specific
// rules (e.g. bestaetigt+vorjahr_hinweis inconsistency) don't fire on crawler-
// produced rows, and so operators can filter the queue by origin.
const (
AgentStatusBestaetigt = "bestaetigt"
AgentStatusUnklar = "unklar"
AgentStatusVorjahrUnbestaetigt = "vorjahr_unbestaetigt"
AgentStatusAbgesagt = "abgesagt"
AgentStatusCrawler = "crawler"
)
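// Illustration (hypothetical shape; the validator itself lives outside this
// diff). The agent-specific rule that motivated the sentinel looks roughly like:
//
//	if m.AgentStatus == AgentStatusBestaetigt && m.VorjahrHinweis != "" {
//		// inconsistency: confirmed status alongside a prior-year hint
//	}
//
// Crawler rows carry AgentStatusCrawler, so branches keyed on the four agent
// values never match them.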
// Konfidenz constants. The three-level scale is used by both Pass 0 (agent-
// reported) and the crawler (derived from source agreement + source rank).
const (
KonfidenzHoch = "hoch"
KonfidenzMittel = "mittel"
KonfidenzNiedrig = "niedrig"
)
// Stats is the discovery health snapshot used by the admin dashboard strip.
type Stats struct {
LastTickAt *time.Time `json:"last_tick_at"`

View File

@@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"log/slog"
"sort"
"strings"
"time"
@@ -307,8 +308,17 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
return summary, err
}
// Sort source names for deterministic event ordering across runs;
// Merge's internal bucket order then depends only on input.
sourceNames := make([]string, 0, len(res.PerSource))
for name := range res.PerSource {
sourceNames = append(sourceNames, name)
}
sort.Strings(sourceNames)
var all []crawler.RawEvent
- for name, evs := range res.PerSource {
+ for _, name := range sourceNames {
+ 	evs := res.PerSource[name]
summary.PerSource[name] = SourceSummary{
EventsFetched: len(evs),
ElapsedMs: res.PerSourceMS[name],
@@ -389,6 +399,15 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
continue
}
// Default EndDatum to StartDatum for sources that only reported a
// single date (festival_alarm one-day events, suendenfrei lines
// without a "bis" range). Admin can still edit via /queue/:id
// before accept. Avoids a blocking nil-EndDatum check in Service.Accept.
endDatum := m.EndDate
if endDatum == nil && m.StartDate != nil {
endDatum = m.StartDate
}
dm := DiscoveredMarket{
BucketID: nil,
MarktName: m.Name,
@@ -396,11 +415,11 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
Bundesland: m.Bundesland,
Land: m.Land,
StartDatum: m.StartDate,
- EndDatum: m.EndDate,
+ EndDatum: endDatum,
Website: website,
Quellen: quellen,
- Konfidenz: "mittel",
- AgentStatus: "bestaetigt",
+ Konfidenz: crawlerKonfidenz(m),
+ AgentStatus: AgentStatusCrawler,
Hinweis: m.Hinweis,
NameNormalized: nameNorm,
MatchedSeriesID: matchedSeriesID,
@@ -423,6 +442,29 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
return summary, nil
}
// crawlerKonfidenz derives a three-level confidence label for a merged event.
// Signal: cross-source agreement is the strongest indicator — two or more
// independent calendars emitting the same (normalized name, city, start_date)
// triple is high confidence. Single-source rows fall back to source rank:
// Tribe JSON and marktkalendarium curate their data; suendenfrei's prose
// regex is brittle.
func crawlerKonfidenz(m crawler.MergedEvent) string {
if len(m.Sources) >= 2 {
return KonfidenzHoch
}
if len(m.Sources) == 1 {
switch m.Sources[0] {
case "mittelaltermarkt_online", "marktkalendarium":
return KonfidenzMittel
case "mittelalterkalender", "festival_alarm":
return KonfidenzMittel
case "suendenfrei":
return KonfidenzNiedrig
}
}
return KonfidenzNiedrig
}
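// Worked examples against the logic above (inputs illustrative):
//
//	m.Sources = []string{"marktkalendarium", "festival_alarm"} -> "hoch"    (2+ sources)
//	m.Sources = []string{"marktkalendarium"}                   -> "mittel"  (curated)
//	m.Sources = []string{"suendenfrei"}                        -> "niedrig" (prose regex)
//	m.Sources = []string{"some_future_source"}                 -> "niedrig" (fallthrough)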
// formatIssues produces a compact log-friendly summary of validation issues.
func formatIssues(issues []Issue) string {
parts := make([]string, 0, len(issues))

View File

@@ -278,15 +278,15 @@ func TestServiceCrawlHappyPath(t *testing.T) {
sc := &stubCrawlerRunner{
result: crawler.CrawlResult{
PerSource: map[string][]crawler.RawEvent{
"a": {
"marktkalendarium": {
{
SourceName: "a", SourceURL: "https://a/",
SourceName: "marktkalendarium", SourceURL: "https://a/",
Name: "Markt X", City: "Dresden", PLZ: "01067", Land: "Deutschland",
StartDate: start, EndDate: end,
},
},
},
PerSourceMS: map[string]int64{"a": 1},
PerSourceMS: map[string]int64{"marktkalendarium": 1},
},
}
svc := NewServiceWithCrawler(repo, sc, lc, noopMarketCreator{})
@@ -301,7 +301,145 @@ func TestServiceCrawlHappyPath(t *testing.T) {
if len(repo.inserted) != 1 {
t.Errorf("inserted = %d; want 1", len(repo.inserted))
}
- if repo.inserted[0].BucketID != nil {
- 	t.Errorf("BucketID = %v; want nil (crawler-produced row)", repo.inserted[0].BucketID)
+ got := repo.inserted[0]
+ if got.BucketID != nil {
+ 	t.Errorf("BucketID = %v; want nil (crawler-produced row)", got.BucketID)
}
if got.AgentStatus != AgentStatusCrawler {
t.Errorf("AgentStatus = %q; want %q", got.AgentStatus, AgentStatusCrawler)
}
if got.Konfidenz != KonfidenzMittel {
t.Errorf("Konfidenz = %q; want %q (single curated source)", got.Konfidenz, KonfidenzMittel)
}
}
// alwaysFailLinkVerifier filters every URL out — simulates a batch where every
// source URL fails link verification.
type alwaysFailLinkVerifier struct{}
func (alwaysFailLinkVerifier) FilterURLs(_ context.Context, _ []string) []string { return nil }
func (alwaysFailLinkVerifier) CheckURL(_ context.Context, _ string) bool { return false }
func TestServiceCrawlLinkCheckFailed(t *testing.T) {
repo := newMockRepo()
start := mustParseDate(t, "2026-05-01")
sc := &stubCrawlerRunner{
result: crawler.CrawlResult{
PerSource: map[string][]crawler.RawEvent{
"marktkalendarium": {
{SourceName: "marktkalendarium", SourceURL: "https://dead/", Name: "X", City: "Y", StartDate: start},
},
},
},
}
svc := NewServiceWithCrawler(repo, sc, alwaysFailLinkVerifier{}, noopMarketCreator{})
summary, err := svc.Crawl(context.Background())
if err != nil {
t.Fatal(err)
}
if summary.LinkCheckFailed != 1 {
t.Errorf("LinkCheckFailed = %d; want 1", summary.LinkCheckFailed)
}
if summary.Discovered != 0 {
t.Errorf("Discovered = %d; want 0", summary.Discovered)
}
if len(repo.inserted) != 0 {
t.Errorf("inserted = %d; want 0 (dead link should block insert)", len(repo.inserted))
}
}
func TestServiceCrawlDedupQueue(t *testing.T) {
repo := newMockRepo()
// Simulate: queue already has a matching pending row.
repo.queuePendingFn = func(_ context.Context, _, _ string, _ *time.Time) (bool, error) {
return true, nil
}
start := mustParseDate(t, "2026-05-01")
sc := &stubCrawlerRunner{
result: crawler.CrawlResult{
PerSource: map[string][]crawler.RawEvent{
"marktkalendarium": {
{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: start},
},
},
},
}
svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
summary, err := svc.Crawl(context.Background())
if err != nil {
t.Fatal(err)
}
if summary.DedupedQueue != 1 {
t.Errorf("DedupedQueue = %d; want 1", summary.DedupedQueue)
}
if summary.Discovered != 0 {
t.Errorf("Discovered = %d; want 0 (dupe should block insert)", summary.Discovered)
}
if len(repo.inserted) != 0 {
t.Errorf("inserted = %d; want 0", len(repo.inserted))
}
}
func TestServiceCrawlDefaultsEndDate(t *testing.T) {
repo := newMockRepo()
start := mustParseDate(t, "2026-05-01")
// RawEvent with no EndDate (e.g., festival_alarm one-day event).
sc := &stubCrawlerRunner{
result: crawler.CrawlResult{
PerSource: map[string][]crawler.RawEvent{
"marktkalendarium": {
{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "One Day Fest", City: "Y", StartDate: start, EndDate: nil},
},
},
},
}
svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
if _, err := svc.Crawl(context.Background()); err != nil {
t.Fatal(err)
}
if len(repo.inserted) != 1 {
t.Fatalf("inserted = %d; want 1", len(repo.inserted))
}
got := repo.inserted[0]
if got.EndDatum == nil {
t.Error("EndDatum is nil; expected default to StartDatum")
}
if !got.EndDatum.Equal(*got.StartDatum) {
t.Errorf("EndDatum = %v; want equal to StartDatum %v", got.EndDatum, got.StartDatum)
}
}
func TestServiceCrawlMultiSourceHighKonfidenz(t *testing.T) {
repo := newMockRepo()
start := mustParseDate(t, "2026-05-01")
sc := &stubCrawlerRunner{
result: crawler.CrawlResult{
PerSource: map[string][]crawler.RawEvent{
"marktkalendarium": {{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: start}},
"mittelaltermarkt_online": {{SourceName: "mittelaltermarkt_online", SourceURL: "https://b/", Name: "X", City: "Y", StartDate: start}},
},
},
}
svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
summary, err := svc.Crawl(context.Background())
if err != nil {
t.Fatal(err)
}
if summary.Discovered != 1 {
t.Errorf("Discovered = %d; want 1 (two sources merge into one event)", summary.Discovered)
}
if summary.MergedAcrossSites != 1 {
t.Errorf("MergedAcrossSites = %d; want 1", summary.MergedAcrossSites)
}
if len(repo.inserted) != 1 {
t.Fatalf("inserted = %d; want 1", len(repo.inserted))
}
if repo.inserted[0].Konfidenz != KonfidenzHoch {
t.Errorf("Konfidenz = %q; want %q (2+ sources)", repo.inserted[0].Konfidenz, KonfidenzHoch)
}
}