fix(discovery): review follow-ups — konfidenz signal, end-date default, determinism, rate-limit=0

- Service.Crawl derives Konfidenz from merged source count + rank instead of
  hardcoded "mittel". Two+ sources -> "hoch"; single curated source ->
  "mittel"; single suendenfrei (prose regex) -> "niedrig".
- New AgentStatus constant "crawler" replaces "bestaetigt" for crawler rows
  so the validator's agent-specific rules don't fire on them and operators
  can filter the queue by origin. Added Konfidenz* and AgentStatus*
  constants to model.go.
- Default EndDatum to StartDatum when a source reports a single date
  (festival_alarm one-day events, suendenfrei lines without a "bis" range).
  Avoids Service.Accept rejecting nil-EndDatum rows.
- Sort PerSource names before assembling raw events for merge — makes
  merged output order deterministic across runs.
- NewHandler: manualRateLimitPerHour <= 0 now explicitly disables the
  rate limit (previously silently floored to 1/hour). Documented behavior
  for all three cases in a constructor comment.
- Added four new tests for Service.Crawl failure/quality paths:
  LinkCheckFailed, DedupedQueue, EndDatum default, multi-source Konfidenz.
- Documented the substring-match approximation in
  cmd/discovery-compare/main.go's groupCrawlerByBucket — diagnostic-only,
  not safe for production routing.
commit 7c8a8c6419 (parent c5a4bc441c)
Date: 2026-04-18 16:35:26 +02:00
5 changed files with 234 additions and 12 deletions

View File

@@ -102,6 +102,13 @@ func parseBuckets(s string) ([]sampleBucket, error) {
return out, nil
}
// groupCrawlerByBucket assigns merged crawler events to sample buckets.
//
// NOTE: this is an approximation for the diagnostic CLI only — not for
// production dedup. The Bundesland match uses `strings.Contains` so a merged
// event with Bundesland="Bayern" will join a bucket with Region="Bay" (or
// "ern"). Good enough to compare coverage between the crawler and Mistral
// Pass 0 at bucket granularity; not safe for business-logic routing.
func groupCrawlerByBucket(merged []crawler.MergedEvent, buckets []sampleBucket) map[string][]crawler.MergedEvent {
result := make(map[string][]crawler.MergedEvent)
for _, b := range buckets {

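A minimal standalone sketch of the false positive the NOTE describes (values hypothetical; argument order taken from the NOTE's Bayern/"Bay" example):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// The diagnostic match asks strings.Contains(bundesland, region), so
	// truncated or accidental substrings match alongside exact names.
	bundesland := "Bayern"
	for _, region := range []string{"Bayern", "Bay", "ern", "Sachsen"} {
		fmt.Printf("%-7s matches: %v\n", region, strings.Contains(bundesland, region))
	}
	// "Bayern", "Bay", and "ern" all report true; only "Sachsen" does not.
}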
View File

@@ -21,9 +21,23 @@ type Handler struct {
crawlRateLimit time.Duration
}
// NewHandler constructs a Handler. manualRateLimitPerHour controls how
// frequently the admin-session /crawl-manual endpoint may be invoked:
//
// <= 0 : disabled (no rate limit — every request is allowed)
// 1 : 1 request per hour (default)
// > 1 : N requests per hour, evenly spaced
//
// The bearer-token /crawl endpoint always bypasses this limit via the
// `crawl_bypass_rate_limit` gin-context flag set by its route handler.
func NewHandler(s *Service, manualRateLimitPerHour int) *Handler {
- rl := time.Hour
- if manualRateLimitPerHour > 1 {
+ var rl time.Duration
+ switch {
+ case manualRateLimitPerHour <= 0:
+ 	rl = 0 // sentinel: rate limiting disabled
+ case manualRateLimitPerHour == 1:
+ 	rl = time.Hour
+ default:
+ 	rl = time.Hour / time.Duration(manualRateLimitPerHour)
+ }
return &Handler{service: s, crawlRateLimit: rl}
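For reference, the intervals the new switch yields for a few illustrative calls (svc stands in for an already-constructed *Service):

	h := NewHandler(svc, 0) // crawlRateLimit = 0                -> limiter disabled
	h = NewHandler(svc, 1)  // crawlRateLimit = time.Hour        -> one request/hour
	h = NewHandler(svc, 6)  // crawlRateLimit = 10 * time.Minute -> evenly spaced
	_ = h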
@@ -52,7 +66,7 @@ func (h *Handler) Crawl(c *gin.Context) {
}
defer h.crawlMu.Unlock()
- if _, bypass := c.Get("crawl_bypass_rate_limit"); !bypass {
+ if _, bypass := c.Get("crawl_bypass_rate_limit"); !bypass && h.crawlRateLimit > 0 {
if since := time.Since(h.crawlLastManual); since < h.crawlRateLimit {
retryIn := (h.crawlRateLimit - since).Seconds()
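// int(retryIn) truncates, so the +1 below rounds the remaining wait up to a
// whole second and clients never retry before the window reopens.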
c.Header("Retry-After", fmt.Sprint(int(retryIn)+1))

View File

@@ -107,6 +107,27 @@ const (
StatusRejected = "rejected"
)
// AgentStatus constants.
// Mistral Pass 0 produces: bestaetigt | unklar | vorjahr_unbestaetigt | abgesagt.
// The crawler uses its own sentinel value so the validator's agent-specific
// rules (e.g. bestaetigt+vorjahr_hinweis inconsistency) don't fire on crawler-
// produced rows, and so operators can filter the queue by origin.
const (
AgentStatusBestaetigt = "bestaetigt"
AgentStatusUnklar = "unklar"
AgentStatusVorjahrUnbestaetigt = "vorjahr_unbestaetigt"
AgentStatusAbgesagt = "abgesagt"
AgentStatusCrawler = "crawler"
)
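// Illustration (hypothetical shape; the validator itself lives outside this
// diff). The agent-specific rule that motivated the sentinel looks roughly like:
//
//	if m.AgentStatus == AgentStatusBestaetigt && m.VorjahrHinweis != "" {
//		// inconsistency: confirmed status alongside a prior-year hint
//	}
//
// Crawler rows carry AgentStatusCrawler, so branches keyed on the four agent
// values never match them.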
// Konfidenz constants. The three-level scale is used by both Pass 0 (agent-
// reported) and the crawler (derived from source agreement + source rank).
const (
KonfidenzHoch = "hoch"
KonfidenzMittel = "mittel"
KonfidenzNiedrig = "niedrig"
)
// Stats is the discovery health snapshot used by the admin dashboard strip.
type Stats struct {
LastTickAt *time.Time `json:"last_tick_at"`

View File

@@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"log/slog"
"sort"
"strings"
"time"
@@ -307,8 +308,17 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
return summary, err
}
// Sort source names for deterministic event ordering across runs;
// Merge's internal bucket order then depends only on input.
sourceNames := make([]string, 0, len(res.PerSource))
for name := range res.PerSource {
sourceNames = append(sourceNames, name)
}
sort.Strings(sourceNames)
var all []crawler.RawEvent
- for name, evs := range res.PerSource {
+ for _, name := range sourceNames {
+ 	evs := res.PerSource[name]
summary.PerSource[name] = SourceSummary{
EventsFetched: len(evs),
ElapsedMs: res.PerSourceMS[name],
@@ -389,6 +399,15 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
continue
}
// Default EndDatum to StartDatum for sources that only reported a
// single date (festival_alarm one-day events, suendenfrei lines
// without a "bis" range). Admin can still edit via /queue/:id
// before accept. Avoids a blocking nil-EndDatum check in Service.Accept.
endDatum := m.EndDate
if endDatum == nil && m.StartDate != nil {
endDatum = m.StartDate
}
dm := DiscoveredMarket{
BucketID: nil,
MarktName: m.Name,
@@ -396,11 +415,11 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
Bundesland: m.Bundesland,
Land: m.Land,
StartDatum: m.StartDate,
- EndDatum: m.EndDate,
+ EndDatum: endDatum,
Website: website,
Quellen: quellen,
- Konfidenz: "mittel",
- AgentStatus: "bestaetigt",
+ Konfidenz: crawlerKonfidenz(m),
+ AgentStatus: AgentStatusCrawler,
Hinweis: m.Hinweis,
NameNormalized: nameNorm,
MatchedSeriesID: matchedSeriesID,
@@ -423,6 +442,29 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
return summary, nil
}
// crawlerKonfidenz derives a three-level confidence label for a merged event.
// Signal: cross-source agreement is the strongest indicator — two or more
// independent calendars emitting the same (normalized name, city, start_date)
// triple is high confidence. Single-source rows fall back to source rank:
// Tribe JSON and marktkalendarium curate their data; suendenfrei's prose
// regex is brittle.
func crawlerKonfidenz(m crawler.MergedEvent) string {
if len(m.Sources) >= 2 {
return KonfidenzHoch
}
if len(m.Sources) == 1 {
switch m.Sources[0] {
case "mittelaltermarkt_online", "marktkalendarium":
return KonfidenzMittel
case "mittelalterkalender", "festival_alarm":
return KonfidenzMittel
case "suendenfrei":
return KonfidenzNiedrig
}
}
return KonfidenzNiedrig
}
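// Worked examples against the logic above (inputs illustrative):
//
//	m.Sources = []string{"marktkalendarium", "festival_alarm"} -> "hoch"    (2+ sources)
//	m.Sources = []string{"marktkalendarium"}                   -> "mittel"  (curated)
//	m.Sources = []string{"suendenfrei"}                        -> "niedrig" (prose regex)
//	m.Sources = []string{"some_future_source"}                 -> "niedrig" (fallthrough)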
// formatIssues produces a compact log-friendly summary of validation issues.
func formatIssues(issues []Issue) string {
parts := make([]string, 0, len(issues))

View File

@@ -278,15 +278,15 @@ func TestServiceCrawlHappyPath(t *testing.T) {
sc := &stubCrawlerRunner{
result: crawler.CrawlResult{
PerSource: map[string][]crawler.RawEvent{
"a": {
"marktkalendarium": {
{
SourceName: "a", SourceURL: "https://a/",
SourceName: "marktkalendarium", SourceURL: "https://a/",
Name: "Markt X", City: "Dresden", PLZ: "01067", Land: "Deutschland",
StartDate: start, EndDate: end,
},
},
},
PerSourceMS: map[string]int64{"a": 1},
PerSourceMS: map[string]int64{"marktkalendarium": 1},
},
}
svc := NewServiceWithCrawler(repo, sc, lc, noopMarketCreator{})
@@ -301,7 +301,145 @@ func TestServiceCrawlHappyPath(t *testing.T) {
if len(repo.inserted) != 1 {
t.Errorf("inserted = %d; want 1", len(repo.inserted))
}
- if repo.inserted[0].BucketID != nil {
- 	t.Errorf("BucketID = %v; want nil (crawler-produced row)", repo.inserted[0].BucketID)
+ got := repo.inserted[0]
+ if got.BucketID != nil {
+ 	t.Errorf("BucketID = %v; want nil (crawler-produced row)", got.BucketID)
}
if got.AgentStatus != AgentStatusCrawler {
t.Errorf("AgentStatus = %q; want %q", got.AgentStatus, AgentStatusCrawler)
}
if got.Konfidenz != KonfidenzMittel {
t.Errorf("Konfidenz = %q; want %q (single curated source)", got.Konfidenz, KonfidenzMittel)
}
}
// alwaysFailLinkVerifier filters every URL out — simulates a batch where every
// source URL fails link verification.
type alwaysFailLinkVerifier struct{}
func (alwaysFailLinkVerifier) FilterURLs(_ context.Context, _ []string) []string { return nil }
func (alwaysFailLinkVerifier) CheckURL(_ context.Context, _ string) bool { return false }
func TestServiceCrawlLinkCheckFailed(t *testing.T) {
repo := newMockRepo()
start := mustParseDate(t, "2026-05-01")
sc := &stubCrawlerRunner{
result: crawler.CrawlResult{
PerSource: map[string][]crawler.RawEvent{
"marktkalendarium": {
{SourceName: "marktkalendarium", SourceURL: "https://dead/", Name: "X", City: "Y", StartDate: start},
},
},
},
}
svc := NewServiceWithCrawler(repo, sc, alwaysFailLinkVerifier{}, noopMarketCreator{})
summary, err := svc.Crawl(context.Background())
if err != nil {
t.Fatal(err)
}
if summary.LinkCheckFailed != 1 {
t.Errorf("LinkCheckFailed = %d; want 1", summary.LinkCheckFailed)
}
if summary.Discovered != 0 {
t.Errorf("Discovered = %d; want 0", summary.Discovered)
}
if len(repo.inserted) != 0 {
t.Errorf("inserted = %d; want 0 (dead link should block insert)", len(repo.inserted))
}
}
func TestServiceCrawlDedupQueue(t *testing.T) {
repo := newMockRepo()
// Simulate: queue already has a matching pending row.
repo.queuePendingFn = func(_ context.Context, _, _ string, _ *time.Time) (bool, error) {
return true, nil
}
start := mustParseDate(t, "2026-05-01")
sc := &stubCrawlerRunner{
result: crawler.CrawlResult{
PerSource: map[string][]crawler.RawEvent{
"marktkalendarium": {
{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: start},
},
},
},
}
svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
summary, err := svc.Crawl(context.Background())
if err != nil {
t.Fatal(err)
}
if summary.DedupedQueue != 1 {
t.Errorf("DedupedQueue = %d; want 1", summary.DedupedQueue)
}
if summary.Discovered != 0 {
t.Errorf("Discovered = %d; want 0 (dupe should block insert)", summary.Discovered)
}
if len(repo.inserted) != 0 {
t.Errorf("inserted = %d; want 0", len(repo.inserted))
}
}
func TestServiceCrawlDefaultsEndDate(t *testing.T) {
repo := newMockRepo()
start := mustParseDate(t, "2026-05-01")
// RawEvent with no EndDate (e.g., festival_alarm one-day event).
sc := &stubCrawlerRunner{
result: crawler.CrawlResult{
PerSource: map[string][]crawler.RawEvent{
"marktkalendarium": {
{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "One Day Fest", City: "Y", StartDate: start, EndDate: nil},
},
},
},
}
svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
if _, err := svc.Crawl(context.Background()); err != nil {
t.Fatal(err)
}
if len(repo.inserted) != 1 {
t.Fatalf("inserted = %d; want 1", len(repo.inserted))
}
got := repo.inserted[0]
if got.EndDatum == nil {
t.Error("EndDatum is nil; expected default to StartDatum")
}
if !got.EndDatum.Equal(*got.StartDatum) {
t.Errorf("EndDatum = %v; want equal to StartDatum %v", got.EndDatum, got.StartDatum)
}
}
func TestServiceCrawlMultiSourceHighKonfidenz(t *testing.T) {
repo := newMockRepo()
start := mustParseDate(t, "2026-05-01")
sc := &stubCrawlerRunner{
result: crawler.CrawlResult{
PerSource: map[string][]crawler.RawEvent{
"marktkalendarium": {{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: start}},
"mittelaltermarkt_online": {{SourceName: "mittelaltermarkt_online", SourceURL: "https://b/", Name: "X", City: "Y", StartDate: start}},
},
},
}
svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
summary, err := svc.Crawl(context.Background())
if err != nil {
t.Fatal(err)
}
if summary.Discovered != 1 {
t.Errorf("Discovered = %d; want 1 (two sources merge into one event)", summary.Discovered)
}
if summary.MergedAcrossSites != 1 {
t.Errorf("MergedAcrossSites = %d; want 1", summary.MergedAcrossSites)
}
if len(repo.inserted) != 1 {
t.Fatalf("inserted = %d; want 1", len(repo.inserted))
}
if repo.inserted[0].Konfidenz != KonfidenzHoch {
t.Errorf("Konfidenz = %q; want %q (2+ sources)", repo.inserted[0].Konfidenz, KonfidenzHoch)
}
}