diff --git a/backend/cmd/discovery-compare/main.go b/backend/cmd/discovery-compare/main.go
index fce2e4e..1b32fdd 100644
--- a/backend/cmd/discovery-compare/main.go
+++ b/backend/cmd/discovery-compare/main.go
@@ -102,6 +102,13 @@ func parseBuckets(s string) ([]sampleBucket, error) {
 	return out, nil
 }
 
+// groupCrawlerByBucket assigns merged crawler events to sample buckets.
+//
+// NOTE: this is an approximation for the diagnostic CLI only — not for
+// production dedup. The Bundesland match uses `strings.Contains` so a merged
+// event with Bundesland="Bayern" will join a bucket with Region="Bay" (or
+// "ern"). Good enough to compare coverage between the crawler and Mistral
+// Pass 0 at bucket granularity; not safe for business-logic routing.
 func groupCrawlerByBucket(merged []crawler.MergedEvent, buckets []sampleBucket) map[string][]crawler.MergedEvent {
 	result := make(map[string][]crawler.MergedEvent)
 	for _, b := range buckets {
diff --git a/backend/internal/domain/discovery/handler.go b/backend/internal/domain/discovery/handler.go
index 733b4ed..60c37a3 100644
--- a/backend/internal/domain/discovery/handler.go
+++ b/backend/internal/domain/discovery/handler.go
@@ -21,9 +21,23 @@ type Handler struct {
 	crawlRateLimit time.Duration
 }
 
+// NewHandler constructs a Handler. manualRateLimitPerHour controls how
+// frequently the admin-session /crawl-manual endpoint may be invoked:
+//
+//	<= 0 : disabled (no rate limit — every request is allowed)
+//	   1 : 1 request per hour (default)
+//	 > 1 : N requests per hour, evenly spaced
+//
+// The bearer-token /crawl endpoint always bypasses this limit via the
+// `crawl_bypass_rate_limit` gin-context flag set by its route handler.
 func NewHandler(s *Service, manualRateLimitPerHour int) *Handler {
-	rl := time.Hour
-	if manualRateLimitPerHour > 1 {
+	var rl time.Duration
+	switch {
+	case manualRateLimitPerHour <= 0:
+		rl = 0 // sentinel: rate limiting disabled
+	case manualRateLimitPerHour == 1:
+		rl = time.Hour
+	default:
 		rl = time.Hour / time.Duration(manualRateLimitPerHour)
 	}
 	return &Handler{service: s, crawlRateLimit: rl}
@@ -52,7 +66,7 @@ func (h *Handler) Crawl(c *gin.Context) {
 	}
 	defer h.crawlMu.Unlock()
 
-	if _, bypass := c.Get("crawl_bypass_rate_limit"); !bypass {
+	if _, bypass := c.Get("crawl_bypass_rate_limit"); !bypass && h.crawlRateLimit > 0 {
 		if since := time.Since(h.crawlLastManual); since < h.crawlRateLimit {
 			retryIn := (h.crawlRateLimit - since).Seconds()
 			c.Header("Retry-After", fmt.Sprint(int(retryIn)+1))
diff --git a/backend/internal/domain/discovery/model.go b/backend/internal/domain/discovery/model.go
index af566ff..3d2a114 100644
--- a/backend/internal/domain/discovery/model.go
+++ b/backend/internal/domain/discovery/model.go
@@ -107,6 +107,32 @@ const (
 	StatusRejected = "rejected"
 )
 
+// AgentStatus constants.
+// Mistral Pass 0 produces: bestaetigt | unklar | vorjahr_unbestaetigt | abgesagt.
+// The crawler uses its own sentinel value so the validator's agent-specific
+// rules (e.g. bestaetigt+vorjahr_hinweis inconsistency) don't fire on crawler-
+// produced rows, and so operators can filter the queue by origin.
+const (
+	AgentStatusBestaetigt          = "bestaetigt"
+	AgentStatusUnklar              = "unklar"
+	AgentStatusVorjahrUnbestaetigt = "vorjahr_unbestaetigt"
+	AgentStatusAbgesagt            = "abgesagt"
+	AgentStatusCrawler             = "crawler"
+)
+
+// Konfidenz constants. The three-level scale is used by both Pass 0 (agent-
+// reported) and the crawler (derived from source agreement + source rank).
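+//
+// For example (sketch; outcomes follow crawlerKonfidenz in service.go):
+//
+//	crawlerKonfidenz(crawler.MergedEvent{Sources: []string{"marktkalendarium", "festival_alarm"}})
+//	// two agreeing sources yield "hoch"; a lone "suendenfrei" row yields "niedrig"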
+const (
+	KonfidenzHoch    = "hoch"
+	KonfidenzMittel  = "mittel"
+	KonfidenzNiedrig = "niedrig"
+)
+
 // Stats is the discovery health snapshot used by the admin dashboard strip.
 type Stats struct {
 	LastTickAt *time.Time `json:"last_tick_at"`
diff --git a/backend/internal/domain/discovery/service.go b/backend/internal/domain/discovery/service.go
index 683adf5..15107c0 100644
--- a/backend/internal/domain/discovery/service.go
+++ b/backend/internal/domain/discovery/service.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
+	"sort"
 	"strings"
 	"time"
 
@@ -307,8 +308,17 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
 		return summary, err
 	}
 
+	// Sort source names for deterministic event ordering across runs;
+	// Merge's internal bucket order then depends only on the input order.
+	sourceNames := make([]string, 0, len(res.PerSource))
+	for name := range res.PerSource {
+		sourceNames = append(sourceNames, name)
+	}
+	sort.Strings(sourceNames)
+
 	var all []crawler.RawEvent
-	for name, evs := range res.PerSource {
+	for _, name := range sourceNames {
+		evs := res.PerSource[name]
 		summary.PerSource[name] = SourceSummary{
 			EventsFetched: len(evs),
 			ElapsedMs:     res.PerSourceMS[name],
@@ -389,6 +399,15 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
 			continue
 		}
 
+		// Default EndDatum to StartDatum for sources that only reported a
+		// single date (festival_alarm one-day events, suendenfrei lines
+		// without a "bis" range). Admin can still edit via /queue/:id before
+		// accepting. Avoids a blocking nil-EndDatum check in Service.Accept.
+		endDatum := m.EndDate
+		if endDatum == nil && m.StartDate != nil {
+			endDatum = m.StartDate
+		}
+
 		dm := DiscoveredMarket{
 			BucketID:        nil,
 			MarktName:       m.Name,
@@ -396,11 +415,11 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
 			Bundesland:      m.Bundesland,
 			Land:            m.Land,
 			StartDatum:      m.StartDate,
-			EndDatum:        m.EndDate,
+			EndDatum:        endDatum,
 			Website:         website,
 			Quellen:         quellen,
-			Konfidenz:       "mittel",
-			AgentStatus:     "bestaetigt",
+			Konfidenz:       crawlerKonfidenz(m),
+			AgentStatus:     AgentStatusCrawler,
 			Hinweis:         m.Hinweis,
 			NameNormalized:  nameNorm,
 			MatchedSeriesID: matchedSeriesID,
@@ -423,6 +442,32 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
 	return summary, nil
 }
 
+// crawlerKonfidenz derives a three-level confidence label for a merged event.
+// Signal: cross-source agreement is the strongest indicator — two or more
+// independent calendars emitting the same (normalized name, city, start_date)
+// triple is high confidence. Single-source rows fall back to source rank:
+// Tribe JSON and marktkalendarium curate their data, suendenfrei's prose
+// regex is brittle.
+func crawlerKonfidenz(m crawler.MergedEvent) string {
+	if len(m.Sources) >= 2 {
+		return KonfidenzHoch
+	}
+	if len(m.Sources) == 1 {
+		switch m.Sources[0] {
+		case "mittelaltermarkt_online", "marktkalendarium",
+			"mittelalterkalender", "festival_alarm":
+			return KonfidenzMittel
+		case "suendenfrei":
+			return KonfidenzNiedrig
+		}
+	}
+	return KonfidenzNiedrig
+}
 
+// formatIssues produces a compact log-friendly summary of validation issues.
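+//
+// Usage sketch (assumed call shape; the actual call sites are not shown here,
+// but this file already logs via slog):
+//
+//	slog.Warn("discovery: dropping invalid event", "issues", formatIssues(issues))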
 func formatIssues(issues []Issue) string {
 	parts := make([]string, 0, len(issues))
diff --git a/backend/internal/domain/discovery/service_test.go b/backend/internal/domain/discovery/service_test.go
index 8da7503..7454616 100644
--- a/backend/internal/domain/discovery/service_test.go
+++ b/backend/internal/domain/discovery/service_test.go
@@ -278,15 +278,15 @@ func TestServiceCrawlHappyPath(t *testing.T) {
 	sc := &stubCrawlerRunner{
 		result: crawler.CrawlResult{
 			PerSource: map[string][]crawler.RawEvent{
-				"a": {
+				"marktkalendarium": {
 					{
-						SourceName: "a", SourceURL: "https://a/",
+						SourceName: "marktkalendarium", SourceURL: "https://a/",
 						Name: "Markt X", City: "Dresden", PLZ: "01067",
 						Land: "Deutschland", StartDate: start, EndDate: end,
 					},
 				},
 			},
-			PerSourceMS: map[string]int64{"a": 1},
+			PerSourceMS: map[string]int64{"marktkalendarium": 1},
 		},
 	}
 	svc := NewServiceWithCrawler(repo, sc, lc, noopMarketCreator{})
@@ -301,7 +301,165 @@ func TestServiceCrawlHappyPath(t *testing.T) {
 	if len(repo.inserted) != 1 {
 		t.Errorf("inserted = %d; want 1", len(repo.inserted))
 	}
-	if repo.inserted[0].BucketID != nil {
-		t.Errorf("BucketID = %v; want nil (crawler-produced row)", repo.inserted[0].BucketID)
+	got := repo.inserted[0]
+	if got.BucketID != nil {
+		t.Errorf("BucketID = %v; want nil (crawler-produced row)", got.BucketID)
+	}
+	if got.AgentStatus != AgentStatusCrawler {
+		t.Errorf("AgentStatus = %q; want %q", got.AgentStatus, AgentStatusCrawler)
+	}
+	if got.Konfidenz != KonfidenzMittel {
+		t.Errorf("Konfidenz = %q; want %q (single curated source)", got.Konfidenz, KonfidenzMittel)
+	}
+}
+
+// alwaysFailLinkVerifier filters every URL out — simulates a batch where every
+// source URL fails link verification.
+type alwaysFailLinkVerifier struct{}
+
+func (alwaysFailLinkVerifier) FilterURLs(_ context.Context, _ []string) []string { return nil }
+func (alwaysFailLinkVerifier) CheckURL(_ context.Context, _ string) bool         { return false }
+
+func TestServiceCrawlLinkCheckFailed(t *testing.T) {
+	repo := newMockRepo()
+	start := mustParseDate(t, "2026-05-01")
+
+	sc := &stubCrawlerRunner{
+		result: crawler.CrawlResult{
+			PerSource: map[string][]crawler.RawEvent{
+				"marktkalendarium": {
+					{SourceName: "marktkalendarium", SourceURL: "https://dead/", Name: "X", City: "Y", StartDate: start},
+				},
+			},
+		},
+	}
+	svc := NewServiceWithCrawler(repo, sc, alwaysFailLinkVerifier{}, noopMarketCreator{})
+
+	summary, err := svc.Crawl(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if summary.LinkCheckFailed != 1 {
+		t.Errorf("LinkCheckFailed = %d; want 1", summary.LinkCheckFailed)
+	}
+	if summary.Discovered != 0 {
+		t.Errorf("Discovered = %d; want 0", summary.Discovered)
+	}
+	if len(repo.inserted) != 0 {
+		t.Errorf("inserted = %d; want 0 (dead link should block insert)", len(repo.inserted))
+	}
+}
+
+func TestServiceCrawlDedupQueue(t *testing.T) {
+	repo := newMockRepo()
+	// Simulate: queue already has a matching pending row.
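+	// queuePendingFn's (string, string, *time.Time) arguments presumably carry
+	// the (normalized name, city, start date) dedup key; returning true means
+	// an equivalent pending row already exists, so Crawl should count the
+	// event under DedupedQueue and skip the insert.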
+	repo.queuePendingFn = func(_ context.Context, _, _ string, _ *time.Time) (bool, error) {
+		return true, nil
+	}
+	start := mustParseDate(t, "2026-05-01")
+
+	sc := &stubCrawlerRunner{
+		result: crawler.CrawlResult{
+			PerSource: map[string][]crawler.RawEvent{
+				"marktkalendarium": {
+					{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: start},
+				},
+			},
+		},
+	}
+	svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
+
+	summary, err := svc.Crawl(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if summary.DedupedQueue != 1 {
+		t.Errorf("DedupedQueue = %d; want 1", summary.DedupedQueue)
+	}
+	if summary.Discovered != 0 {
+		t.Errorf("Discovered = %d; want 0 (dupe should block insert)", summary.Discovered)
+	}
+	if len(repo.inserted) != 0 {
+		t.Errorf("inserted = %d; want 0", len(repo.inserted))
+	}
+}
+
+func TestServiceCrawlDefaultsEndDate(t *testing.T) {
+	repo := newMockRepo()
+	start := mustParseDate(t, "2026-05-01")
+
+	// RawEvent with no EndDate (e.g., festival_alarm one-day event).
+	sc := &stubCrawlerRunner{
+		result: crawler.CrawlResult{
+			PerSource: map[string][]crawler.RawEvent{
+				"marktkalendarium": {
+					{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "One Day Fest", City: "Y", StartDate: start, EndDate: nil},
+				},
+			},
+		},
+	}
+	svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
+
+	if _, err := svc.Crawl(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	if len(repo.inserted) != 1 {
+		t.Fatalf("inserted = %d; want 1", len(repo.inserted))
+	}
+	got := repo.inserted[0]
+	if got.EndDatum == nil {
+		t.Fatal("EndDatum is nil; expected default to StartDatum")
+	}
+	if !got.EndDatum.Equal(*got.StartDatum) {
+		t.Errorf("EndDatum = %v; want equal to StartDatum %v", got.EndDatum, got.StartDatum)
+	}
+}
+
+func TestServiceCrawlMultiSourceHighKonfidenz(t *testing.T) {
+	repo := newMockRepo()
+	start := mustParseDate(t, "2026-05-01")
+
+	sc := &stubCrawlerRunner{
+		result: crawler.CrawlResult{
+			PerSource: map[string][]crawler.RawEvent{
+				"marktkalendarium":        {{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: start}},
+				"mittelaltermarkt_online": {{SourceName: "mittelaltermarkt_online", SourceURL: "https://b/", Name: "X", City: "Y", StartDate: start}},
+			},
+		},
+	}
+	svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
+
+	summary, err := svc.Crawl(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if summary.Discovered != 1 {
+		t.Errorf("Discovered = %d; want 1 (two sources merge into one event)", summary.Discovered)
+	}
+	if summary.MergedAcrossSites != 1 {
+		t.Errorf("MergedAcrossSites = %d; want 1", summary.MergedAcrossSites)
+	}
+	if len(repo.inserted) != 1 {
+		t.Fatalf("inserted = %d; want 1", len(repo.inserted))
+	}
+	if repo.inserted[0].Konfidenz != KonfidenzHoch {
+		t.Errorf("Konfidenz = %q; want %q (2+ sources)", repo.inserted[0].Konfidenz, KonfidenzHoch)
 	}
 }
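+
+// TestCrawlerKonfidenzUnknownSource is a sketch: a lone source name outside
+// crawlerKonfidenz's switch, and an event with no sources at all, should
+// both fall through to niedrig. Only MergedEvent.Sources is populated, the
+// same field crawlerKonfidenz reads.
+func TestCrawlerKonfidenzUnknownSource(t *testing.T) {
+	if got := crawlerKonfidenz(crawler.MergedEvent{Sources: []string{"brand_new_source"}}); got != KonfidenzNiedrig {
+		t.Errorf("Konfidenz = %q; want %q (unknown single source)", got, KonfidenzNiedrig)
+	}
+	if got := crawlerKonfidenz(crawler.MergedEvent{}); got != KonfidenzNiedrig {
+		t.Errorf("Konfidenz = %q; want %q (no sources)", got, KonfidenzNiedrig)
+	}
+}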