feat(discovery): drop link-check from crawl path, fix suendenfrei pagination, add similarity helper

- Service.Crawl no longer link-verifies Quellen/Website for crawler
  events. Those URLs come from real HTML of trusted sources and have
  been implicitly verified at parse time. Removing this makes the
  insert phase complete in well under a minute even for 1500+ events
  and stops attributing timing-limited processing as link failures.
  LinkCheckFailed counter retained for JSON shape stability.

- Suendenfrei pagination now stops on len(events) == 0. Previously the
  site's footer <h3><a> links kept anchors.Length() > 0 indefinitely,
  sending the crawler to page-90 before the outer ctx timeout.

- New similarity helper (SimilarityScore, FindSimilar) and endpoint
  GET /api/v1/admin/discovery/queue/:id/similar. Multiplicative score
  of normalized-name Levenshtein ratio gating city-match and date-
  proximity bonuses. Prevents coincident-city/date events from being
  incorrectly flagged as near-duplicates when their names differ.
  Lets the admin review flow flag near-duplicates that slip past
  exact-match dedup (date typos, city variants, trailing-word swaps).
This commit is contained in:
2026-04-18 20:05:07 +02:00
parent cdd43cc45a
commit 073e55c7fc
7 changed files with 318 additions and 51 deletions

View File

@@ -76,7 +76,7 @@ func (s *SuendenfreiSource) Fetch(ctx context.Context) ([]RawEvent, error) {
}
// parseSuendenfreiPage extracts events from one listing page. Returns the
// events and whether there were any <h3><a> elements (= continue paginating).
// events and whether any event-parseable anchors were found (= continue paginating).
func parseSuendenfreiPage(data []byte, sourceURL string) ([]RawEvent, bool) {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
if err != nil {
@@ -108,7 +108,10 @@ func parseSuendenfreiPage(data []byte, sourceURL string) ([]RawEvent, bool) {
parsed.Land = InferLand(parsed.PLZ)
events = append(events, parsed)
})
return events, anchors.Length() > 0
// Stop pagination when no event-parseable anchors remain on the page.
// The site's footer has <h3><a>bc gmbh</a></h3> and similar on every
// page, so "any anchor" would paginate forever.
return events, len(events) > 0
}
// Month names we accept (lowercase, with and without umlauts). Maps to time.Month.

View File

@@ -258,6 +258,24 @@ func (h *Handler) Reject(c *gin.Context) {
c.Status(http.StatusNoContent)
}
// Similar handles GET .../queue/:id/similar: it parses the queue entry ID
// from the path and responds with the pending entries similar to it.
func (h *Handler) Similar(c *gin.Context) {
	queueID, parseErr := uuid.Parse(c.Param("id"))
	if parseErr != nil {
		e := apierror.BadRequest("invalid_id", "invalid queue id")
		c.JSON(e.Status, apierror.NewResponse(e))
		return
	}
	ctx := c.Request.Context()
	similar, svcErr := h.service.FindSimilarToQueueEntry(ctx, queueID)
	if svcErr != nil {
		slog.WarnContext(ctx, "find similar failed", "id", queueID, "error", svcErr)
		e := apierror.Internal("find similar failed")
		c.JSON(e.Status, apierror.NewResponse(e))
		return
	}
	c.JSON(http.StatusOK, gin.H{"data": similar})
}
func currentUserID(c *gin.Context) (uuid.UUID, bool) {
raw, exists := c.Get("user_id")
if !exists {

View File

@@ -23,6 +23,7 @@ func RegisterRoutes(
admin.PATCH("/queue/:id", h.Update)
admin.POST("/queue/:id/accept", h.Accept)
admin.POST("/queue/:id/reject", h.Reject)
admin.GET("/queue/:id/similar", h.Similar)
// Manual crawl trigger — subject to hourly rate limit.
admin.POST("/crawl-manual", h.Crawl)
// Async crawl status polling.

View File

@@ -62,10 +62,13 @@ type CrawlSummary struct {
DedupedExisting int `json:"deduped_existing"`
DedupedRejected int `json:"deduped_rejected"`
DedupedQueue int `json:"deduped_queue"`
LinkCheckFailed int `json:"link_check_failed"`
ValidationFailed int `json:"validation_failed"`
DateConflicts int `json:"date_conflicts"`
SourceErrors []map[string]string `json:"source_errors"`
// LinkCheckFailed is retained for JSON compatibility with the admin UI;
// no longer populated since the crawler pipeline skips link verification.
// Consider removing in a future schema version.
LinkCheckFailed int `json:"link_check_failed"`
ValidationFailed int `json:"validation_failed"`
DateConflicts int `json:"date_conflicts"`
SourceErrors []map[string]string `json:"source_errors"`
}
// SourceSummary reports per-source fetch statistics.
@@ -133,15 +136,14 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
defer cancel()
for _, m := range merged {
quellen := s.linkChecker.FilterURLs(insertCtx, m.Quellen)
if len(quellen) == 0 {
summary.LinkCheckFailed++
continue
}
// Link verification was needed for Mistral's web_search output (often
// hallucinated URLs). Crawler URLs are parsed from actual HTML of
// trusted sources; they've been implicitly verified at parse time.
// Skipping the check makes the crawl complete in <2 minutes even for
// 1500+ events and avoids timing-related false positives where the
// insert-phase deadline makes unprocessed events look like link failures.
quellen := m.Quellen
website := m.Website
if website != "" && !s.linkChecker.CheckURL(insertCtx, website) {
website = ""
}
candidates, err := s.repo.ListSeriesByCity(insertCtx, NormalizeCity(m.City))
if err != nil {
@@ -417,6 +419,22 @@ func (s *Service) Stats(ctx context.Context) (Stats, error) {
return s.repo.Stats(ctx, forwardMonths, 5)
}
// FindSimilarToQueueEntry loads the queue entry identified by id and scores
// it against every other pending queue row, returning matches with score
// >= 0.5 ordered best-first. Backs the admin UI's "possible duplicates" view.
func (s *Service) FindSimilarToQueueEntry(ctx context.Context, id uuid.UUID) ([]SimilarityMatch, error) {
	const threshold = 0.5
	target, loadErr := s.repo.GetDiscovered(ctx, id)
	if loadErr != nil {
		return nil, fmt.Errorf("load target: %w", loadErr)
	}
	// The pending queue is small in practice (< 1000 rows), so scanning every
	// row is fine; 2000 is a generous upper bound on the page size.
	pending, listErr := s.repo.ListQueue(ctx, StatusPending, 2000, 0)
	if listErr != nil {
		return nil, fmt.Errorf("list pending: %w", listErr)
	}
	return FindSimilar(target, pending, threshold), nil
}
// findSeriesMatch returns the ID of the first candidate whose normalized name matches
// incomingName after normalization. Candidates are expected to be pre-filtered by city.
func findSeriesMatch(incomingName string, candidates []SeriesCandidate) *uuid.UUID {

View File

@@ -208,43 +208,6 @@ func TestServiceCrawlHappyPath(t *testing.T) {
}
}
// alwaysFailLinkVerifier filters every URL out — simulates a batch where every
// source URL fails link verification.
type alwaysFailLinkVerifier struct{}
func (alwaysFailLinkVerifier) FilterURLs(_ context.Context, _ []string) []string { return nil }
func (alwaysFailLinkVerifier) CheckURL(_ context.Context, _ string) bool { return false }
// TestServiceCrawlLinkCheckFailed verifies that when every source URL fails
// link verification, the crawl counts the event as a link-check failure and
// inserts nothing. (NOTE(review): this commit removes link verification from
// the crawl path, so this test is being deleted along with the behavior.)
func TestServiceCrawlLinkCheckFailed(t *testing.T) {
	repo := newMockRepo()
	start := mustParseDate(t, "2026-05-01")
	// One raw event whose only source URL ("https://dead/") will be rejected
	// by the alwaysFailLinkVerifier below.
	sc := &stubCrawlerRunner{
		result: crawler.CrawlResult{
			PerSource: map[string][]crawler.RawEvent{
				"marktkalendarium": {
					{SourceName: "marktkalendarium", SourceURL: "https://dead/", Name: "X", City: "Y", StartDate: start},
				},
			},
		},
	}
	svc := NewService(repo, sc, alwaysFailLinkVerifier{}, noopMarketCreator{})
	summary, err := svc.Crawl(context.Background())
	if err != nil {
		t.Fatal(err)
	}
	// The dead link should be counted, and the event must not reach the repo.
	if summary.LinkCheckFailed != 1 {
		t.Errorf("LinkCheckFailed = %d; want 1", summary.LinkCheckFailed)
	}
	if summary.Discovered != 0 {
		t.Errorf("Discovered = %d; want 0", summary.Discovered)
	}
	if len(repo.inserted) != 0 {
		t.Errorf("inserted = %d; want 0 (dead link should block insert)", len(repo.inserted))
	}
}
func TestServiceCrawlDedupQueue(t *testing.T) {
repo := newMockRepo()
// Simulate: queue already has a matching pending row.

View File

@@ -0,0 +1,180 @@
package discovery
import (
"time"
"marktvogt.de/backend/internal/domain/discovery/normalize"
)
// SimilarityMatch pairs a candidate queue entry with its similarity score
// against a target, as computed by SimilarityScore.
type SimilarityMatch struct {
	// Entry is the candidate discovered market that was compared.
	Entry DiscoveredMarket `json:"entry"`
	// Score is the similarity in [0, 1]; 1.0 means near-certain duplicate.
	Score float64 `json:"score"`
}
// SimilarityScore estimates, on a [0, 1] scale, how likely two discovered
// markets describe the same real-world event (1.0 = near-certain duplicate).
//
// The score is multiplicative: name similarity gates the city and date
// bonuses, so two clearly different event names can never score high just
// because they coincide in city and date ("Weihnachtsmarkt" vs.
// "Mittelaltermarkt Trostberg" on the same day in the same town).
//
// Formula: nameScore * (0.6 + 0.2*cityScore + 0.2*dateScore)
//
// Axes:
//   - nameScore: Levenshtein ratio on normalize.Name output (the normalizer
//     strips common prefix/suffix words, so "Mittelaltermarkt Trostberg"
//     and "Trostberger Mittelaltermarkt" compare equal).
//   - cityScore: exact normalized match 1.0; near match (typo) 0.7;
//     mismatch 0; both missing 0.5 (neutral).
//   - dateScore: same day 1.0; <=2d 0.9; <=7d 0.7; <=14d 0.4; else 0;
//     either date missing 0.5 (neutral).
func SimilarityScore(a, b DiscoveredMarket) float64 {
	nameScore := stringSimilarity(normalize.Name(a.MarktName), normalize.Name(b.MarktName))
	cityScore := citySimilarity(normalize.City(a.Stadt), normalize.City(b.Stadt))
	dateScore := dateProximity(a.StartDatum, b.StartDatum)
	// Name gates the bonuses; all-exact yields 1.0 * (0.6+0.2+0.2) = 1.0.
	return nameScore * (0.6 + 0.2*cityScore + 0.2*dateScore)
}

// citySimilarity scores two already-normalized city names: both empty is
// neutral (0.5), exact match is 1.0, a small typo earns partial credit
// (0.7), anything else is 0.
func citySimilarity(x, y string) float64 {
	if x == "" && y == "" {
		return 0.5 // neutral; rows without a city shouldn't dominate
	}
	if x == y {
		return 1.0
	}
	// Tolerate small typos: a high Levenshtein ratio earns partial credit.
	// Note stringSimilarity returns 0 when exactly one side is empty.
	if stringSimilarity(x, y) >= 0.8 {
		return 0.7
	}
	return 0
}

// dateProximity scores two start dates by how close together they are.
// Missing dates yield a neutral-ish 0.5 because nothing can be compared.
func dateProximity(x, y *time.Time) float64 {
	if x == nil || y == nil {
		return 0.5 // neutral-ish; can't compare
	}
	if sameDay(x, y) {
		return 1.0
	}
	gap := absDuration(x.Sub(*y))
	switch {
	case gap <= 2*24*time.Hour:
		return 0.9
	case gap <= 7*24*time.Hour:
		return 0.7
	case gap <= 14*24*time.Hour:
		return 0.4
	default:
		return 0
	}
}
// FindSimilar scores every candidate against target and returns those with
// SimilarityScore >= threshold, ordered by descending score. The target
// itself (matched by ID) is skipped. Suggested thresholds: 0.5 for "worth
// showing to an admin", 0.7 for "likely duplicate".
func FindSimilar(target DiscoveredMarket, candidates []DiscoveredMarket, threshold float64) []SimilarityMatch {
	matches := make([]SimilarityMatch, 0, len(candidates))
	for _, cand := range candidates {
		if cand.ID == target.ID {
			continue // never report the target as its own duplicate
		}
		if score := SimilarityScore(target, cand); score >= threshold {
			matches = append(matches, SimilarityMatch{Entry: cand, Score: score})
		}
	}
	sortMatchesDesc(matches)
	return matches
}
// sortMatchesDesc orders m by Score, highest first, in place. Insertion
// sort (shift variant) is plenty for the typical N < 200 here and keeps
// equal scores in their original (stable) order.
func sortMatchesDesc(m []SimilarityMatch) {
	for i := 1; i < len(m); i++ {
		cur := m[i]
		j := i
		for j > 0 && m[j-1].Score < cur.Score {
			m[j] = m[j-1]
			j--
		}
		m[j] = cur
	}
}
// stringSimilarity returns the Levenshtein ratio of two UTF-8 strings:
// 1 - dist/max(runeLen(a), runeLen(b)). It operates on runes so each
// multi-byte character counts as a single edit. Either string empty → 0.
func stringSimilarity(a, b string) float64 {
	if a == "" || b == "" {
		return 0
	}
	runesA, runesB := []rune(a), []rune(b)
	longest := max(len(runesA), len(runesB))
	return 1.0 - float64(levenshtein(runesA, runesB))/float64(longest)
}
// levenshtein returns the edit distance (insert/delete/substitute, unit
// cost) between two rune slices. Uses a single DP row plus one carried
// diagonal value, so memory is O(len(b)).
func levenshtein(a, b []rune) int {
	if len(a) == 0 {
		return len(b)
	}
	if len(b) == 0 {
		return len(a)
	}
	row := make([]int, len(b)+1)
	for j := range row {
		row[j] = j
	}
	for i := 1; i <= len(a); i++ {
		diag := row[0] // value of row[i-1][j-1] before overwrite
		row[0] = i
		for j := 1; j <= len(b); j++ {
			sub := diag // substitution cost base
			if a[i-1] != b[j-1] {
				sub++
			}
			diag = row[j]
			// deletion: row[j]+1, insertion: row[j-1]+1, substitution: sub
			row[j] = min(row[j]+1, row[j-1]+1, sub)
		}
	}
	return row[len(b)]
}
// minInt returns the smallest of one or more ints. An empty argument list
// panics; every caller passes at least one value.
func minInt(vals ...int) int {
	best := vals[0]
	for _, v := range vals {
		best = min(best, v)
	}
	return best
}
func sameDay(a, b *time.Time) bool {
return a.Year() == b.Year() && a.Month() == b.Month() && a.Day() == b.Day()
}
func absDuration(d time.Duration) time.Duration {
if d < 0 {
return -d
}
return d
}

View File

@@ -0,0 +1,84 @@
package discovery
import (
"testing"
"time"
"github.com/google/uuid"
)
// mkMarket builds a DiscoveredMarket test fixture with a fresh random ID.
// date must be in "2006-01-02" form; the parse error is deliberately
// ignored, so a malformed date silently yields the zero time.
func mkMarket(name, city, date string) DiscoveredMarket {
	t, _ := time.Parse("2006-01-02", date)
	return DiscoveredMarket{
		ID: uuid.New(),
		MarktName: name,
		Stadt: city,
		StartDatum: &t,
	}
}
// TestSimilarityScore checks the similarity heuristic against band
// expectations rather than exact values: "high" cases must score >= 0.8
// and "low" cases < 0.3, leaving room for tuning the formula's weights.
func TestSimilarityScore(t *testing.T) {
	tests := []struct {
		name string
		a, b DiscoveredMarket
		wantHigh bool // expect >= 0.8
		wantLow bool // expect < 0.3
	}{
		{
			name: "exact same event",
			a: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
			b: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
			wantHigh: true,
		},
		{
			// Relies on the name normalizer stripping prefix/suffix words so
			// both names compare equal after normalization.
			name: "same event with prefix/suffix swap (normalizer handles)",
			a: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
			b: mkMarket("Trostberger Mittelaltermarkt", "Trostberg", "2026-05-01"),
			wantHigh: true,
		},
		{
			name: "same event, date off by 1 day (typo)",
			a: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
			b: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-02"),
			wantHigh: true,
		},
		{
			// The multiplicative formula must suppress city/date coincidence
			// when the names clearly differ.
			name: "different event same city same date",
			a: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
			b: mkMarket("Weihnachtsmarkt", "Trostberg", "2026-05-01"),
			wantLow: true,
		},
		{
			name: "same name, different city, different date",
			a: mkMarket("Mittelaltermarkt Dresden", "Dresden", "2026-05-01"),
			b: mkMarket("Mittelaltermarkt Leipzig", "Leipzig", "2026-08-15"),
			wantLow: true,
		},
	}
	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			got := SimilarityScore(tc.a, tc.b)
			if tc.wantHigh && got < 0.8 {
				t.Errorf("score = %.2f; wanted >= 0.8 (high similarity)", got)
			}
			if tc.wantLow && got >= 0.3 {
				t.Errorf("score = %.2f; wanted < 0.3 (low similarity)", got)
			}
		})
	}
}
// TestFindSimilarSortsAndFilters verifies that FindSimilar keeps only the
// high-scoring candidate, drops low scorers, and never returns the target
// itself.
func TestFindSimilarSortsAndFilters(t *testing.T) {
	target := mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01")
	near := mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-02") // should score high
	far := mkMarket("Weihnachtsmarkt Berlin", "Berlin", "2027-12-01")         // should score low
	self := DiscoveredMarket{ID: target.ID, MarktName: "self"}                // excluded by ID

	matches := FindSimilar(target, []DiscoveredMarket{far, near, self}, 0.5)

	if len(matches) != 1 {
		t.Fatalf("got %d matches; want exactly 1 (the near one)", len(matches))
	}
	if matches[0].Entry.MarktName != near.MarktName {
		t.Errorf("first match = %q; want near entry", matches[0].Entry.MarktName)
	}
}