fix(discovery): review follow-ups — konfidenz signal, end-date default, determinism, rate-limit=0
- Service.Crawl derives Konfidenz from merged source count + rank instead of
  hardcoding "mittel". Two+ sources -> "hoch"; single curated source ->
  "mittel"; single suendenfrei (prose regex) -> "niedrig".
- New AgentStatus constant "crawler" replaces "bestaetigt" for crawler rows so
  the validator's agent-specific rules don't fire on them and operators can
  filter the queue by origin. Added Konfidenz* and AgentStatus* constants to
  model.go.
- Default EndDatum to StartDatum when a source reports a single date
  (festival_alarm one-day events, suendenfrei lines without a "bis" range).
  Avoids Service.Accept rejecting nil-EndDatum rows.
- Sort PerSource names before assembling raw events for merge, making merged
  output order deterministic across runs.
- NewHandler: manualRateLimitPerHour <= 0 now explicitly disables the rate
  limit (previously it was silently floored to 1/hour). Documented the
  behavior for all three cases in a constructor comment.
- Added four new tests for Service.Crawl failure/quality paths:
  LinkCheckFailed, DedupedQueue, EndDatum default, multi-source Konfidenz.
- Documented the substring-match approximation in
  cmd/discovery-compare/main.go's groupCrawlerByBucket; diagnostic-only, not
  safe for production routing.
@@ -102,6 +102,13 @@ func parseBuckets(s string) ([]sampleBucket, error) {
     return out, nil
 }
 
+// groupCrawlerByBucket assigns merged crawler events to sample buckets.
+//
+// NOTE: this is an approximation for the diagnostic CLI only — not for
+// production dedup. The Bundesland match uses `strings.Contains` so a merged
+// event with Bundesland="Bayern" will join a bucket with Region="Bay" (or
+// "ern"). Good enough to compare coverage between the crawler and Mistral
+// Pass 0 at bucket granularity; not safe for business-logic routing.
 func groupCrawlerByBucket(merged []crawler.MergedEvent, buckets []sampleBucket) map[string][]crawler.MergedEvent {
     result := make(map[string][]crawler.MergedEvent)
     for _, b := range buckets {
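To make the NOTE concrete, here is a stand-alone sketch of the substring pitfall (package and sample data are illustrative, not the CLI's real types):

package main

import (
    "fmt"
    "strings"
)

func main() {
    bundesland := "Bayern"
    // The diagnostic CLI matches buckets by substring, not equality.
    for _, region := range []string{"Bayern", "Bay", "ern", "Sachsen"} {
        fmt.Printf("Contains(%q, %q) = %v\n", bundesland, region,
            strings.Contains(bundesland, region))
    }
    // Prints true, true, true, false: "Bay" and "ern" both match, which is
    // why this heuristic stays diagnostic-only.
}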
@@ -21,9 +21,23 @@ type Handler struct {
     crawlRateLimit time.Duration
 }
 
+// NewHandler constructs a Handler. manualRateLimitPerHour controls how
+// frequently the admin-session /crawl-manual endpoint may be invoked:
+//
+//    <= 0 : disabled (no rate limit — every request is allowed)
+//       1 : 1 request per hour (default)
+//     > 1 : N requests per hour, evenly spaced
+//
+// The bearer-token /crawl endpoint always bypasses this limit via the
+// `crawl_bypass_rate_limit` gin-context flag set by its route handler.
 func NewHandler(s *Service, manualRateLimitPerHour int) *Handler {
-    rl := time.Hour
-    if manualRateLimitPerHour > 1 {
-        rl = time.Hour / time.Duration(manualRateLimitPerHour)
-    }
+    var rl time.Duration
+    switch {
+    case manualRateLimitPerHour <= 0:
+        rl = 0 // sentinel: rate limiting disabled
+    case manualRateLimitPerHour == 1:
+        rl = time.Hour
+    default:
+        rl = time.Hour / time.Duration(manualRateLimitPerHour)
+    }
     return &Handler{service: s, crawlRateLimit: rl}
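For reference, the interval math behind the three cases, extracted into a stand-alone sketch (the helper name rateLimitInterval is hypothetical; the real code inlines the switch in NewHandler):

package main

import (
    "fmt"
    "time"
)

// rateLimitInterval mirrors the switch in NewHandler; the helper itself
// exists only for this sketch.
func rateLimitInterval(perHour int) time.Duration {
    switch {
    case perHour <= 0:
        return 0 // disabled
    case perHour == 1:
        return time.Hour
    default:
        return time.Hour / time.Duration(perHour)
    }
}

func main() {
    fmt.Println(rateLimitInterval(0)) // 0s: rate limiting off
    fmt.Println(rateLimitInterval(1)) // 1h0m0s
    fmt.Println(rateLimitInterval(4)) // 15m0s: one request every 15 minutes
}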
@@ -52,7 +66,7 @@ func (h *Handler) Crawl(c *gin.Context) {
     }
     defer h.crawlMu.Unlock()
 
-    if _, bypass := c.Get("crawl_bypass_rate_limit"); !bypass {
+    if _, bypass := c.Get("crawl_bypass_rate_limit"); !bypass && h.crawlRateLimit > 0 {
         if since := time.Since(h.crawlLastManual); since < h.crawlRateLimit {
             retryIn := (h.crawlRateLimit - since).Seconds()
             c.Header("Retry-After", fmt.Sprint(int(retryIn)+1))
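The int(retryIn)+1 conversion rounds the advertised wait up to whole seconds so clients never retry early; a minimal sketch of the arithmetic:

package main

import (
    "fmt"
    "time"
)

func main() {
    limit := time.Hour
    since := 59*time.Minute + 30*time.Second // last manual crawl 59m30s ago
    retryIn := (limit - since).Seconds()     // 30.0 seconds remaining
    fmt.Println(int(retryIn) + 1)            // 31: truncate, then add one
}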
@@ -107,6 +107,27 @@ const (
     StatusRejected = "rejected"
 )
 
+// AgentStatus constants.
+// Mistral Pass 0 produces: bestaetigt | unklar | vorjahr_unbestaetigt | abgesagt.
+// The crawler uses its own sentinel value so the validator's agent-specific
+// rules (e.g. bestaetigt+vorjahr_hinweis inconsistency) don't fire on crawler-
+// produced rows, and so operators can filter the queue by origin.
+const (
+    AgentStatusBestaetigt          = "bestaetigt"
+    AgentStatusUnklar              = "unklar"
+    AgentStatusVorjahrUnbestaetigt = "vorjahr_unbestaetigt"
+    AgentStatusAbgesagt            = "abgesagt"
+    AgentStatusCrawler             = "crawler"
+)
+
+// Konfidenz constants. The three-level scale is used by both Pass 0 (agent-
+// reported) and the crawler (derived from source agreement + source rank).
+const (
+    KonfidenzHoch    = "hoch"
+    KonfidenzMittel  = "mittel"
+    KonfidenzNiedrig = "niedrig"
+)
+
 // Stats is the discovery health snapshot used by the admin dashboard strip.
 type Stats struct {
     LastTickAt *time.Time `json:"last_tick_at"`
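One use of the new sentinel, sketched under assumptions (DiscoveredMarket is reduced to the two fields the filter needs; splitByOrigin is a hypothetical helper, not part of this commit):

package main

import "fmt"

type DiscoveredMarket struct {
    MarktName   string
    AgentStatus string
}

const AgentStatusCrawler = "crawler"

// splitByOrigin partitions queue rows into crawler-origin and agent-origin.
func splitByOrigin(rows []DiscoveredMarket) (crawlerRows, agentRows []DiscoveredMarket) {
    for _, r := range rows {
        if r.AgentStatus == AgentStatusCrawler {
            crawlerRows = append(crawlerRows, r)
        } else {
            agentRows = append(agentRows, r)
        }
    }
    return crawlerRows, agentRows
}

func main() {
    rows := []DiscoveredMarket{
        {MarktName: "Markt X", AgentStatus: AgentStatusCrawler},
        {MarktName: "Markt Y", AgentStatus: "bestaetigt"},
    }
    c, a := splitByOrigin(rows)
    fmt.Println(len(c), len(a)) // 1 1
}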
@@ -5,6 +5,7 @@ import (
     "errors"
     "fmt"
     "log/slog"
+    "sort"
     "strings"
     "time"
 
@@ -307,8 +308,17 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
         return summary, err
     }
 
+    // Sort source names for deterministic event ordering across runs;
+    // Merge's internal bucket order then depends only on input.
+    sourceNames := make([]string, 0, len(res.PerSource))
+    for name := range res.PerSource {
+        sourceNames = append(sourceNames, name)
+    }
+    sort.Strings(sourceNames)
+
     var all []crawler.RawEvent
-    for name, evs := range res.PerSource {
+    for _, name := range sourceNames {
+        evs := res.PerSource[name]
         summary.PerSource[name] = SourceSummary{
             EventsFetched: len(evs),
             ElapsedMs:     res.PerSourceMS[name],
@@ -389,6 +399,15 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
             continue
         }
 
+        // Default EndDatum to StartDatum for sources that only reported a
+        // single date (festival_alarm one-day events, suendenfrei lines
+        // without a "bis" range). Admin can still edit via /queue/:id
+        // before accept. Avoids a blocking nil-EndDatum check in Service.Accept.
+        endDatum := m.EndDate
+        if endDatum == nil && m.StartDate != nil {
+            endDatum = m.StartDate
+        }
+
         dm := DiscoveredMarket{
             BucketID:  nil,
             MarktName: m.Name,
@@ -396,11 +415,11 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
             Bundesland: m.Bundesland,
             Land:       m.Land,
             StartDatum: m.StartDate,
-            EndDatum:   m.EndDate,
+            EndDatum:   endDatum,
             Website:    website,
             Quellen:    quellen,
-            Konfidenz:   "mittel",
-            AgentStatus: "bestaetigt",
+            Konfidenz:   crawlerKonfidenz(m),
+            AgentStatus: AgentStatusCrawler,
             Hinweis:         m.Hinweis,
             NameNormalized:  nameNorm,
             MatchedSeriesID: matchedSeriesID,
@@ -423,6 +442,29 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
     return summary, nil
 }
 
+// crawlerKonfidenz derives a three-level confidence label for a merged event.
+// Signal: cross-source agreement is the strongest indicator — two or more
+// independent calendars emitting the same (normalized name, city, start_date)
+// triple is high confidence. Single-source rows fall back to source rank:
+// Tribe JSON and marktkalendarium curate their data, suendenfrei's prose
+// regex is brittle.
+func crawlerKonfidenz(m crawler.MergedEvent) string {
+    if len(m.Sources) >= 2 {
+        return KonfidenzHoch
+    }
+    if len(m.Sources) == 1 {
+        switch m.Sources[0] {
+        case "mittelaltermarkt_online", "marktkalendarium":
+            return KonfidenzMittel
+        case "mittelalterkalender", "festival_alarm":
+            return KonfidenzMittel
+        case "suendenfrei":
+            return KonfidenzNiedrig
+        }
+    }
+    return KonfidenzNiedrig
+}
+
 // formatIssues produces a compact log-friendly summary of validation issues.
 func formatIssues(issues []Issue) string {
     parts := make([]string, 0, len(issues))
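A few representative inputs, run through a self-contained copy of crawlerKonfidenz (MergedEvent is stubbed down to its Sources field; the constants are inlined as literals):

package main

import "fmt"

type MergedEvent struct{ Sources []string }

// crawlerKonfidenz mirrors the function above, with the source-rank cases
// collapsed for brevity.
func crawlerKonfidenz(m MergedEvent) string {
    if len(m.Sources) >= 2 {
        return "hoch"
    }
    if len(m.Sources) == 1 {
        switch m.Sources[0] {
        case "mittelaltermarkt_online", "marktkalendarium",
            "mittelalterkalender", "festival_alarm":
            return "mittel"
        case "suendenfrei":
            return "niedrig"
        }
    }
    return "niedrig"
}

func main() {
    fmt.Println(crawlerKonfidenz(MergedEvent{Sources: []string{"marktkalendarium", "festival_alarm"}})) // hoch
    fmt.Println(crawlerKonfidenz(MergedEvent{Sources: []string{"marktkalendarium"}}))                   // mittel
    fmt.Println(crawlerKonfidenz(MergedEvent{Sources: []string{"suendenfrei"}}))                        // niedrig
}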
@@ -278,15 +278,15 @@ func TestServiceCrawlHappyPath(t *testing.T) {
     sc := &stubCrawlerRunner{
         result: crawler.CrawlResult{
             PerSource: map[string][]crawler.RawEvent{
-                "a": {
+                "marktkalendarium": {
                     {
-                        SourceName: "a", SourceURL: "https://a/",
+                        SourceName: "marktkalendarium", SourceURL: "https://a/",
                         Name: "Markt X", City: "Dresden", PLZ: "01067", Land: "Deutschland",
                         StartDate: start, EndDate: end,
                     },
                 },
             },
-            PerSourceMS: map[string]int64{"a": 1},
+            PerSourceMS: map[string]int64{"marktkalendarium": 1},
         },
     }
     svc := NewServiceWithCrawler(repo, sc, lc, noopMarketCreator{})
@@ -301,7 +301,145 @@
     if len(repo.inserted) != 1 {
         t.Errorf("inserted = %d; want 1", len(repo.inserted))
     }
-    if repo.inserted[0].BucketID != nil {
-        t.Errorf("BucketID = %v; want nil (crawler-produced row)", repo.inserted[0].BucketID)
+    got := repo.inserted[0]
+    if got.BucketID != nil {
+        t.Errorf("BucketID = %v; want nil (crawler-produced row)", got.BucketID)
     }
+    if got.AgentStatus != AgentStatusCrawler {
+        t.Errorf("AgentStatus = %q; want %q", got.AgentStatus, AgentStatusCrawler)
+    }
+    if got.Konfidenz != KonfidenzMittel {
+        t.Errorf("Konfidenz = %q; want %q (single curated source)", got.Konfidenz, KonfidenzMittel)
+    }
 }
+
+// alwaysFailLinkVerifier filters every URL out — simulates a batch where every
+// source URL fails link verification.
+type alwaysFailLinkVerifier struct{}
+
+func (alwaysFailLinkVerifier) FilterURLs(_ context.Context, _ []string) []string { return nil }
+func (alwaysFailLinkVerifier) CheckURL(_ context.Context, _ string) bool         { return false }
+
+func TestServiceCrawlLinkCheckFailed(t *testing.T) {
+    repo := newMockRepo()
+    start := mustParseDate(t, "2026-05-01")
+
+    sc := &stubCrawlerRunner{
+        result: crawler.CrawlResult{
+            PerSource: map[string][]crawler.RawEvent{
+                "marktkalendarium": {
+                    {SourceName: "marktkalendarium", SourceURL: "https://dead/", Name: "X", City: "Y", StartDate: start},
+                },
+            },
+        },
+    }
+    svc := NewServiceWithCrawler(repo, sc, alwaysFailLinkVerifier{}, noopMarketCreator{})
+
+    summary, err := svc.Crawl(context.Background())
+    if err != nil {
+        t.Fatal(err)
+    }
+    if summary.LinkCheckFailed != 1 {
+        t.Errorf("LinkCheckFailed = %d; want 1", summary.LinkCheckFailed)
+    }
+    if summary.Discovered != 0 {
+        t.Errorf("Discovered = %d; want 0", summary.Discovered)
+    }
+    if len(repo.inserted) != 0 {
+        t.Errorf("inserted = %d; want 0 (dead link should block insert)", len(repo.inserted))
+    }
+}
+
+func TestServiceCrawlDedupQueue(t *testing.T) {
+    repo := newMockRepo()
+    // Simulate: queue already has a matching pending row.
+    repo.queuePendingFn = func(_ context.Context, _, _ string, _ *time.Time) (bool, error) {
+        return true, nil
+    }
+    start := mustParseDate(t, "2026-05-01")
+
+    sc := &stubCrawlerRunner{
+        result: crawler.CrawlResult{
+            PerSource: map[string][]crawler.RawEvent{
+                "marktkalendarium": {
+                    {SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: start},
+                },
+            },
+        },
+    }
+    svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
+
+    summary, err := svc.Crawl(context.Background())
+    if err != nil {
+        t.Fatal(err)
+    }
+    if summary.DedupedQueue != 1 {
+        t.Errorf("DedupedQueue = %d; want 1", summary.DedupedQueue)
+    }
+    if summary.Discovered != 0 {
+        t.Errorf("Discovered = %d; want 0 (dupe should block insert)", summary.Discovered)
+    }
+    if len(repo.inserted) != 0 {
+        t.Errorf("inserted = %d; want 0", len(repo.inserted))
+    }
+}
+
+func TestServiceCrawlDefaultsEndDate(t *testing.T) {
+    repo := newMockRepo()
+    start := mustParseDate(t, "2026-05-01")
+
+    // RawEvent with no EndDate (e.g., festival_alarm one-day event).
+    sc := &stubCrawlerRunner{
+        result: crawler.CrawlResult{
+            PerSource: map[string][]crawler.RawEvent{
+                "marktkalendarium": {
+                    {SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "One Day Fest", City: "Y", StartDate: start, EndDate: nil},
+                },
+            },
+        },
+    }
+    svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
+
+    if _, err := svc.Crawl(context.Background()); err != nil {
+        t.Fatal(err)
+    }
+    if len(repo.inserted) != 1 {
+        t.Fatalf("inserted = %d; want 1", len(repo.inserted))
+    }
+    got := repo.inserted[0]
+    if got.EndDatum == nil {
+        t.Fatal("EndDatum is nil; expected default to StartDatum")
+    }
+    if !got.EndDatum.Equal(*got.StartDatum) {
+        t.Errorf("EndDatum = %v; want equal to StartDatum %v", got.EndDatum, got.StartDatum)
+    }
+}
+
+func TestServiceCrawlMultiSourceHighKonfidenz(t *testing.T) {
+    repo := newMockRepo()
+    start := mustParseDate(t, "2026-05-01")
+
+    sc := &stubCrawlerRunner{
+        result: crawler.CrawlResult{
+            PerSource: map[string][]crawler.RawEvent{
+                "marktkalendarium":        {{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: start}},
+                "mittelaltermarkt_online": {{SourceName: "mittelaltermarkt_online", SourceURL: "https://b/", Name: "X", City: "Y", StartDate: start}},
+            },
+        },
+    }
+    svc := NewServiceWithCrawler(repo, sc, noopLinkVerifier{}, noopMarketCreator{})
+
+    summary, err := svc.Crawl(context.Background())
+    if err != nil {
+        t.Fatal(err)
+    }
+    if summary.Discovered != 1 {
+        t.Errorf("Discovered = %d; want 1 (two sources merge into one event)", summary.Discovered)
+    }
+    if summary.MergedAcrossSites != 1 {
+        t.Errorf("MergedAcrossSites = %d; want 1", summary.MergedAcrossSites)
+    }
+    if repo.inserted[0].Konfidenz != KonfidenzHoch {
+        t.Errorf("Konfidenz = %q; want %q (2+ sources)", repo.inserted[0].Konfidenz, KonfidenzHoch)
+    }
+}