diff --git a/backend/internal/domain/discovery/crawler/suendenfrei.go b/backend/internal/domain/discovery/crawler/suendenfrei.go
index c960d2f..7cc9aa7 100644
--- a/backend/internal/domain/discovery/crawler/suendenfrei.go
+++ b/backend/internal/domain/discovery/crawler/suendenfrei.go
@@ -76,7 +76,7 @@ func (s *SuendenfreiSource) Fetch(ctx context.Context) ([]RawEvent, error) {
 }
 
 // parseSuendenfreiPage extracts events from one listing page. Returns the
-// events and whether there were any <a> elements (= continue paginating).
+// events and whether any event-parseable anchors were found (= continue paginating).
 func parseSuendenfreiPage(data []byte, sourceURL string) ([]RawEvent, bool) {
 	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
 	if err != nil {
@@ -108,7 +108,10 @@ func parseSuendenfreiPage(data []byte, sourceURL string) ([]RawEvent, bool) {
 		parsed.Land = InferLand(parsed.PLZ)
 		events = append(events, parsed)
 	})
-	return events, anchors.Length() > 0
+	// Stop pagination when no event-parseable anchors remain on the page.
+	// The site's footer has <a href="…">bc gmbh</a> and similar on every
+	// page, so "any anchor" would paginate forever.
+	return events, len(events) > 0
 }
 
 // Month names we accept (lowercase, with and without umlauts). Maps to time.Month.
diff --git a/backend/internal/domain/discovery/service.go b/backend/internal/domain/discovery/service.go index b29b731..27b3e58 100644 --- a/backend/internal/domain/discovery/service.go +++ b/backend/internal/domain/discovery/service.go @@ -62,10 +62,13 @@ type CrawlSummary struct { DedupedExisting int `json:"deduped_existing"` DedupedRejected int `json:"deduped_rejected"` DedupedQueue int `json:"deduped_queue"` - LinkCheckFailed int `json:"link_check_failed"` - ValidationFailed int `json:"validation_failed"` - DateConflicts int `json:"date_conflicts"` - SourceErrors []map[string]string `json:"source_errors"` + // LinkCheckFailed is retained for JSON compatibility with the admin UI; + // no longer populated since the crawler pipeline skips link verification. + // Consider removing in a future schema version. + LinkCheckFailed int `json:"link_check_failed"` + ValidationFailed int `json:"validation_failed"` + DateConflicts int `json:"date_conflicts"` + SourceErrors []map[string]string `json:"source_errors"` } // SourceSummary reports per-source fetch statistics. @@ -133,15 +136,14 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) { defer cancel() for _, m := range merged { - quellen := s.linkChecker.FilterURLs(insertCtx, m.Quellen) - if len(quellen) == 0 { - summary.LinkCheckFailed++ - continue - } + // Link verification was needed for Mistral's web_search output (often + // hallucinated URLs). Crawler URLs are parsed from actual HTML of + // trusted sources; they've been implicitly verified at parse time. + // Skipping the check makes the crawl complete in <2 minutes even for + // 1500+ events and avoids timing-related false positives where the + // insert-phase deadline makes unprocessed events look like link failures. 
+ quellen := m.Quellen website := m.Website - if website != "" && !s.linkChecker.CheckURL(insertCtx, website) { - website = "" - } candidates, err := s.repo.ListSeriesByCity(insertCtx, NormalizeCity(m.City)) if err != nil { @@ -417,6 +419,22 @@ func (s *Service) Stats(ctx context.Context) (Stats, error) { return s.repo.Stats(ctx, forwardMonths, 5) } +// FindSimilarToQueueEntry returns other pending queue entries similar to the +// given one, threshold-filtered (0.5) and sorted desc. Used by the admin UI's +// "possible duplicates" view. +func (s *Service) FindSimilarToQueueEntry(ctx context.Context, id uuid.UUID) ([]SimilarityMatch, error) { + target, err := s.repo.GetDiscovered(ctx, id) + if err != nil { + return nil, fmt.Errorf("load target: %w", err) + } + // Pull all pending queue rows; small DB in practice (< 1000 rows). + candidates, err := s.repo.ListQueue(ctx, StatusPending, 2000, 0) + if err != nil { + return nil, fmt.Errorf("list pending: %w", err) + } + return FindSimilar(target, candidates, 0.5), nil +} + // findSeriesMatch returns the ID of the first candidate whose normalized name matches // incomingName after normalization. Candidates are expected to be pre-filtered by city. func findSeriesMatch(incomingName string, candidates []SeriesCandidate) *uuid.UUID { diff --git a/backend/internal/domain/discovery/service_test.go b/backend/internal/domain/discovery/service_test.go index 4dacd66..752489e 100644 --- a/backend/internal/domain/discovery/service_test.go +++ b/backend/internal/domain/discovery/service_test.go @@ -208,43 +208,6 @@ func TestServiceCrawlHappyPath(t *testing.T) { } } -// alwaysFailLinkVerifier filters every URL out — simulates a batch where every -// source URL fails link verification. 
-type alwaysFailLinkVerifier struct{} - -func (alwaysFailLinkVerifier) FilterURLs(_ context.Context, _ []string) []string { return nil } -func (alwaysFailLinkVerifier) CheckURL(_ context.Context, _ string) bool { return false } - -func TestServiceCrawlLinkCheckFailed(t *testing.T) { - repo := newMockRepo() - start := mustParseDate(t, "2026-05-01") - - sc := &stubCrawlerRunner{ - result: crawler.CrawlResult{ - PerSource: map[string][]crawler.RawEvent{ - "marktkalendarium": { - {SourceName: "marktkalendarium", SourceURL: "https://dead/", Name: "X", City: "Y", StartDate: start}, - }, - }, - }, - } - svc := NewService(repo, sc, alwaysFailLinkVerifier{}, noopMarketCreator{}) - - summary, err := svc.Crawl(context.Background()) - if err != nil { - t.Fatal(err) - } - if summary.LinkCheckFailed != 1 { - t.Errorf("LinkCheckFailed = %d; want 1", summary.LinkCheckFailed) - } - if summary.Discovered != 0 { - t.Errorf("Discovered = %d; want 0", summary.Discovered) - } - if len(repo.inserted) != 0 { - t.Errorf("inserted = %d; want 0 (dead link should block insert)", len(repo.inserted)) - } -} - func TestServiceCrawlDedupQueue(t *testing.T) { repo := newMockRepo() // Simulate: queue already has a matching pending row. diff --git a/backend/internal/domain/discovery/similarity.go b/backend/internal/domain/discovery/similarity.go new file mode 100644 index 0000000..ba97b88 --- /dev/null +++ b/backend/internal/domain/discovery/similarity.go @@ -0,0 +1,180 @@ +package discovery + +import ( + "time" + + "marktvogt.de/backend/internal/domain/discovery/normalize" +) + +// SimilarityMatch pairs a candidate with its score against a target. +type SimilarityMatch struct { + Entry DiscoveredMarket `json:"entry"` + Score float64 `json:"score"` +} + +// SimilarityScore returns a score in [0, 1] for how likely two discovered +// markets refer to the same event. 1.0 = near-certain duplicate. +// +// Heuristic: multiplicative — name similarity gates the city+date bonuses. 
+// A low name score (clearly different event) suppresses the score regardless +// of city/date coincidence. Prevents "Weihnachtsmarkt" from being flagged as +// a near-duplicate of "Mittelaltermarkt Trostberg" just because they share a +// city and date. +// +// Formula: nameScore * (0.6 + 0.2*cityScore + 0.2*dateScore) +// +// Scoring axes: +// - nameScore: Levenshtein ratio on normalized names (normalize.Name strips +// common prefix/suffix words; "Mittelaltermarkt Trostberg" and +// "Trostberger Mittelaltermarkt" both normalize to "trostberg"). +// - cityScore: normalized-city exact match → 1.0; near match → 0.7; miss → 0. +// - dateScore: same day → 1.0; within 2d → 0.9; within 7d → 0.7; +// within 14d → 0.4; else → 0. Missing date → 0.5 (neutral). +func SimilarityScore(a, b DiscoveredMarket) float64 { + nameA := normalize.Name(a.MarktName) + nameB := normalize.Name(b.MarktName) + cityA := normalize.City(a.Stadt) + cityB := normalize.City(b.Stadt) + + // Name similarity: Levenshtein ratio. 1.0 for exact normalized match. + // Empty-vs-non-empty → 0; empty-vs-empty → 0 (can't conclude similarity). + nameScore := stringSimilarity(nameA, nameB) + + // City similarity: mostly binary. + var cityScore float64 + switch { + case cityA == "" && cityB == "": + cityScore = 0.5 // neutral; non-city rows shouldn't dominate + case cityA == cityB: + cityScore = 1.0 + default: + // Allow small typos: Levenshtein ratio >= 0.8 → partial credit. + if stringSimilarity(cityA, cityB) >= 0.8 { + cityScore = 0.7 + } else { + cityScore = 0 + } + } + + // Date proximity: both dates required for a non-neutral score. 
+ var dateScore float64 + switch { + case a.StartDatum == nil || b.StartDatum == nil: + dateScore = 0.5 // neutral-ish; can't compare + case sameDay(a.StartDatum, b.StartDatum): + dateScore = 1.0 + default: + diff := absDuration(a.StartDatum.Sub(*b.StartDatum)) + switch { + case diff <= 2*24*time.Hour: + dateScore = 0.9 + case diff <= 7*24*time.Hour: + dateScore = 0.7 + case diff <= 14*24*time.Hour: + dateScore = 0.4 + default: + dateScore = 0 + } + } + + // Multiplicative: name similarity gates the city+date bonuses so that + // a clearly-different event name cannot score high from city/date alone. + // Maximum: 1.0 * (0.6 + 0.2 + 0.2) = 1.0 when all three axes are exact. + return nameScore * (0.6 + 0.2*cityScore + 0.2*dateScore) +} + +// FindSimilar returns candidates with SimilarityScore(target, candidate) >= threshold, +// sorted by score descending. The target itself is excluded via ID comparison. +// Common threshold: 0.5 for "worth showing to admin", 0.7 for "likely duplicate". +func FindSimilar(target DiscoveredMarket, candidates []DiscoveredMarket, threshold float64) []SimilarityMatch { + out := make([]SimilarityMatch, 0, len(candidates)) + for _, c := range candidates { + if c.ID == target.ID { + continue + } + s := SimilarityScore(target, c) + if s >= threshold { + out = append(out, SimilarityMatch{Entry: c, Score: s}) + } + } + sortMatchesDesc(out) + return out +} + +// sortMatchesDesc sorts in place by Score descending. Stable insertion sort; +// typical N here is < 200. +func sortMatchesDesc(m []SimilarityMatch) { + for i := 1; i < len(m); i++ { + for j := i; j > 0 && m[j-1].Score < m[j].Score; j-- { + m[j-1], m[j] = m[j], m[j-1] + } + } +} + +// stringSimilarity computes 1 - (levenshtein(a, b) / max(len(a), len(b))) for +// UTF-8 strings. Returns 0 when either is empty. Handles multi-byte runes correctly. 
+func stringSimilarity(a, b string) float64 {
+	if a == "" || b == "" {
+		return 0
+	}
+	ra := []rune(a)
+	rb := []rune(b)
+	dist := levenshtein(ra, rb)
+	maxLen := len(ra)
+	if len(rb) > maxLen {
+		maxLen = len(rb)
+	}
+	return 1.0 - float64(dist)/float64(maxLen)
+}
+
+// levenshtein computes edit distance using the two-row DP trick.
+func levenshtein(a, b []rune) int {
+	if len(a) == 0 {
+		return len(b)
+	}
+	if len(b) == 0 {
+		return len(a)
+	}
+	prev := make([]int, len(b)+1)
+	curr := make([]int, len(b)+1)
+	for j := 0; j <= len(b); j++ {
+		prev[j] = j
+	}
+	for i := 1; i <= len(a); i++ {
+		curr[0] = i
+		for j := 1; j <= len(b); j++ {
+			cost := 1
+			if a[i-1] == b[j-1] {
+				cost = 0
+			}
+			curr[j] = minInt(
+				prev[j]+1,      // deletion
+				curr[j-1]+1,    // insertion
+				prev[j-1]+cost, // substitution
+			)
+		}
+		prev, curr = curr, prev
+	}
+	return prev[len(b)]
+}
+
+func minInt(vals ...int) int {
+	m := vals[0]
+	for _, v := range vals[1:] {
+		if v < m {
+			m = v
+		}
+	}
+	return m
+}
+
+func sameDay(a, b *time.Time) bool {
+	return a.Year() == b.Year() && a.Month() == b.Month() && a.Day() == b.Day()
+}
+
+func absDuration(d time.Duration) time.Duration {
+	if d < 0 {
+		return -d
+	}
+	return d
+}
diff --git a/backend/internal/domain/discovery/similarity_test.go b/backend/internal/domain/discovery/similarity_test.go
new file mode 100644
index 0000000..1d1515e
--- /dev/null
+++ b/backend/internal/domain/discovery/similarity_test.go
@@ -0,0 +1,84 @@
+package discovery
+
+import (
+	"testing"
+	"time"
+
+	"github.com/google/uuid"
+)
+
+func mkMarket(name, city, date string) DiscoveredMarket {
+	t, _ := time.Parse("2006-01-02", date)
+	return DiscoveredMarket{
+		ID:         uuid.New(),
+		MarktName:  name,
+		Stadt:      city,
+		StartDatum: &t,
+	}
+}
+
+func TestSimilarityScore(t *testing.T) {
+	tests := []struct {
+		name     string
+		a, b     DiscoveredMarket
+		wantHigh bool // expect >= 0.8
+		wantLow  bool // expect < 0.3
+	}{
+		{
+			name:     "exact same event",
+			a:        mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
+			b:        mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
+			wantHigh: true,
+		},
+		{
+			name:     "same event with prefix/suffix swap (normalizer handles)",
+			a:        mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
+			b:        mkMarket("Trostberger Mittelaltermarkt", "Trostberg", "2026-05-01"),
+			wantHigh: true,
+		},
+		{
+			name:     "same event, date off by 1 day (typo)",
+			a:        mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
+			b:        mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-02"),
+			wantHigh: true,
+		},
+		{
+			name:    "different event same city same date",
+			a:       mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
+			b:       mkMarket("Weihnachtsmarkt", "Trostberg", "2026-05-01"),
+			wantLow: true,
+		},
+		{
+			name:    "same name, different city, different date",
+			a:       mkMarket("Mittelaltermarkt Dresden", "Dresden", "2026-05-01"),
+			b:       mkMarket("Mittelaltermarkt Leipzig", "Leipzig", "2026-08-15"),
+			wantLow: true,
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := SimilarityScore(tc.a, tc.b)
+			if tc.wantHigh && got < 0.8 {
+				t.Errorf("score = %.2f; wanted >= 0.8 (high similarity)", got)
+			}
+			if tc.wantLow && got >= 0.3 {
+				t.Errorf("score = %.2f; wanted < 0.3 (low similarity)", got)
+			}
+		})
+	}
+}
+
+func TestFindSimilarSortsAndFilters(t *testing.T) {
+	target := mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01")
+	near := mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-02") // high score
+	far := mkMarket("Weihnachtsmarkt Berlin", "Berlin", "2027-12-01")        // low
+	self := DiscoveredMarket{ID: target.ID, MarktName: "self"}
+
+	got := FindSimilar(target, []DiscoveredMarket{far, near, self}, 0.5)
+	if len(got) != 1 {
+		t.Fatalf("got %d matches; want exactly 1 (the near one)", len(got))
+	}
+	if got[0].Entry.MarktName != near.MarktName {
+		t.Errorf("first match = %q; want near entry", got[0].Entry.MarktName)
+	}
+}