feat(discovery): drop link-check from crawl path, fix suendenfrei pagination, add similarity helper

- Service.Crawl no longer link-verifies Quellen/Website for crawler
  events. Those URLs come from real HTML of trusted sources and have
  been implicitly verified at parse time. Removing this makes the
  insert phase complete in well under a minute even for 1500+ events
  and stops attributing timing-limited processing as link failures.
  LinkCheckFailed counter retained for JSON shape stability.

- Suendenfrei pagination now stops on len(events) == 0. Previously the
  site's footer <h3><a> links kept anchors.Length() > 0 indefinitely,
  sending the crawler to page-90 before the outer ctx timeout.

- New similarity helper (SimilarityScore, FindSimilar) and endpoint
  GET /api/v1/admin/discovery/queue/:id/similar. Multiplicative score
  of normalized-name Levenshtein ratio gating city-match and date-
  proximity bonuses. Prevents coincident-city/date events from being
  incorrectly flagged as near-duplicates when their names differ.
  Lets the admin review flow flag near-duplicates that slip past
  exact-match dedup (date typos, city variants, trailing-word swaps).
This commit is contained in:
2026-04-18 20:05:07 +02:00
parent cdd43cc45a
commit 073e55c7fc
7 changed files with 318 additions and 51 deletions

View File

@@ -76,7 +76,7 @@ func (s *SuendenfreiSource) Fetch(ctx context.Context) ([]RawEvent, error) {
}
// parseSuendenfreiPage extracts events from one listing page. Returns the
// events and whether there were any <h3><a> elements (= continue paginating).
// events and whether any event-parseable anchors were found (= continue paginating).
func parseSuendenfreiPage(data []byte, sourceURL string) ([]RawEvent, bool) {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
if err != nil {
@@ -108,7 +108,10 @@ func parseSuendenfreiPage(data []byte, sourceURL string) ([]RawEvent, bool) {
parsed.Land = InferLand(parsed.PLZ)
events = append(events, parsed)
})
return events, anchors.Length() > 0
// Stop pagination when no event-parseable anchors remain on the page.
// The site's footer has <h3><a>bc gmbh</a></h3> and similar on every
// page, so "any anchor" would paginate forever.
return events, len(events) > 0
}
// Month names we accept (lowercase, with and without umlauts). Maps to time.Month.

View File

@@ -258,6 +258,24 @@ func (h *Handler) Reject(c *gin.Context) {
c.Status(http.StatusNoContent)
}
// Similar handles GET .../queue/:id/similar: it parses the queue entry ID
// from the path and responds with the pending entries similar to it.
func (h *Handler) Similar(c *gin.Context) {
	queueID, parseErr := uuid.Parse(c.Param("id"))
	if parseErr != nil {
		e := apierror.BadRequest("invalid_id", "invalid queue id")
		c.JSON(e.Status, apierror.NewResponse(e))
		return
	}
	ctx := c.Request.Context()
	similar, svcErr := h.service.FindSimilarToQueueEntry(ctx, queueID)
	if svcErr != nil {
		slog.WarnContext(ctx, "find similar failed", "id", queueID, "error", svcErr)
		e := apierror.Internal("find similar failed")
		c.JSON(e.Status, apierror.NewResponse(e))
		return
	}
	c.JSON(http.StatusOK, gin.H{"data": similar})
}
func currentUserID(c *gin.Context) (uuid.UUID, bool) {
raw, exists := c.Get("user_id")
if !exists {

View File

@@ -23,6 +23,7 @@ func RegisterRoutes(
admin.PATCH("/queue/:id", h.Update)
admin.POST("/queue/:id/accept", h.Accept)
admin.POST("/queue/:id/reject", h.Reject)
admin.GET("/queue/:id/similar", h.Similar)
// Manual crawl trigger — subject to hourly rate limit.
admin.POST("/crawl-manual", h.Crawl)
// Async crawl status polling.

View File

@@ -62,10 +62,13 @@ type CrawlSummary struct {
DedupedExisting int `json:"deduped_existing"`
DedupedRejected int `json:"deduped_rejected"`
DedupedQueue int `json:"deduped_queue"`
LinkCheckFailed int `json:"link_check_failed"`
ValidationFailed int `json:"validation_failed"`
DateConflicts int `json:"date_conflicts"`
SourceErrors []map[string]string `json:"source_errors"`
// LinkCheckFailed is retained for JSON compatibility with the admin UI;
// no longer populated since the crawler pipeline skips link verification.
// Consider removing in a future schema version.
LinkCheckFailed int `json:"link_check_failed"`
ValidationFailed int `json:"validation_failed"`
DateConflicts int `json:"date_conflicts"`
SourceErrors []map[string]string `json:"source_errors"`
}
// SourceSummary reports per-source fetch statistics.
@@ -133,15 +136,14 @@ func (s *Service) Crawl(ctx context.Context) (CrawlSummary, error) {
defer cancel()
for _, m := range merged {
quellen := s.linkChecker.FilterURLs(insertCtx, m.Quellen)
if len(quellen) == 0 {
summary.LinkCheckFailed++
continue
}
// Link verification was needed for Mistral's web_search output (often
// hallucinated URLs). Crawler URLs are parsed from actual HTML of
// trusted sources; they've been implicitly verified at parse time.
// Skipping the check makes the crawl complete in <2 minutes even for
// 1500+ events and avoids timing-related false positives where the
// insert-phase deadline makes unprocessed events look like link failures.
quellen := m.Quellen
website := m.Website
if website != "" && !s.linkChecker.CheckURL(insertCtx, website) {
website = ""
}
candidates, err := s.repo.ListSeriesByCity(insertCtx, NormalizeCity(m.City))
if err != nil {
@@ -417,6 +419,22 @@ func (s *Service) Stats(ctx context.Context) (Stats, error) {
return s.repo.Stats(ctx, forwardMonths, 5)
}
// FindSimilarToQueueEntry loads the queue entry identified by id and scores
// it against every other pending queue row, returning matches with score
// >= 0.5 ordered best-first. Backs the admin UI's "possible duplicates" view.
func (s *Service) FindSimilarToQueueEntry(ctx context.Context, id uuid.UUID) ([]SimilarityMatch, error) {
	const threshold = 0.5
	target, loadErr := s.repo.GetDiscovered(ctx, id)
	if loadErr != nil {
		return nil, fmt.Errorf("load target: %w", loadErr)
	}
	// The pending queue is small in practice (< 1000 rows), so scanning every
	// row is fine; 2000 is a generous upper bound on the page size.
	pending, listErr := s.repo.ListQueue(ctx, StatusPending, 2000, 0)
	if listErr != nil {
		return nil, fmt.Errorf("list pending: %w", listErr)
	}
	return FindSimilar(target, pending, threshold), nil
}
// findSeriesMatch returns the ID of the first candidate whose normalized name matches
// incomingName after normalization. Candidates are expected to be pre-filtered by city.
func findSeriesMatch(incomingName string, candidates []SeriesCandidate) *uuid.UUID {

View File

@@ -208,43 +208,6 @@ func TestServiceCrawlHappyPath(t *testing.T) {
}
}
// alwaysFailLinkVerifier filters every URL out — simulates a batch where every
// source URL fails link verification.
type alwaysFailLinkVerifier struct{}
func (alwaysFailLinkVerifier) FilterURLs(_ context.Context, _ []string) []string { return nil }
func (alwaysFailLinkVerifier) CheckURL(_ context.Context, _ string) bool { return false }
// TestServiceCrawlLinkCheckFailed verifies that when every source URL fails
// link verification, the crawl counts the event as a link-check failure and
// inserts nothing. (NOTE(review): this commit removes link verification from
// the crawl path, so this test is being deleted along with the behavior.)
func TestServiceCrawlLinkCheckFailed(t *testing.T) {
	repo := newMockRepo()
	start := mustParseDate(t, "2026-05-01")
	// One raw event whose only source URL ("https://dead/") will be rejected
	// by the alwaysFailLinkVerifier below.
	sc := &stubCrawlerRunner{
		result: crawler.CrawlResult{
			PerSource: map[string][]crawler.RawEvent{
				"marktkalendarium": {
					{SourceName: "marktkalendarium", SourceURL: "https://dead/", Name: "X", City: "Y", StartDate: start},
				},
			},
		},
	}
	svc := NewService(repo, sc, alwaysFailLinkVerifier{}, noopMarketCreator{})
	summary, err := svc.Crawl(context.Background())
	if err != nil {
		t.Fatal(err)
	}
	// The dead link should be counted, and the event must not reach the repo.
	if summary.LinkCheckFailed != 1 {
		t.Errorf("LinkCheckFailed = %d; want 1", summary.LinkCheckFailed)
	}
	if summary.Discovered != 0 {
		t.Errorf("Discovered = %d; want 0", summary.Discovered)
	}
	if len(repo.inserted) != 0 {
		t.Errorf("inserted = %d; want 0 (dead link should block insert)", len(repo.inserted))
	}
}
func TestServiceCrawlDedupQueue(t *testing.T) {
repo := newMockRepo()
// Simulate: queue already has a matching pending row.

View File

@@ -0,0 +1,180 @@
package discovery
import (
"time"
"marktvogt.de/backend/internal/domain/discovery/normalize"
)
// SimilarityMatch pairs a candidate queue entry with its similarity score
// against a target, as computed by SimilarityScore.
type SimilarityMatch struct {
	// Entry is the candidate discovered market that was compared.
	Entry DiscoveredMarket `json:"entry"`
	// Score is the similarity in [0, 1]; 1.0 means near-certain duplicate.
	Score float64 `json:"score"`
}
// SimilarityScore estimates, on a [0, 1] scale, how likely two discovered
// markets describe the same real-world event (1.0 = near-certain duplicate).
//
// The score is multiplicative: name similarity gates the city and date
// bonuses, so two clearly different event names can never score high just
// because they coincide in city and date ("Weihnachtsmarkt" vs.
// "Mittelaltermarkt Trostberg" on the same day in the same town).
//
// Formula: nameScore * (0.6 + 0.2*cityScore + 0.2*dateScore)
//
// Axes:
//   - nameScore: Levenshtein ratio on normalize.Name output (the normalizer
//     strips common prefix/suffix words, so "Mittelaltermarkt Trostberg"
//     and "Trostberger Mittelaltermarkt" compare equal).
//   - cityScore: exact normalized match 1.0; near match (typo) 0.7;
//     mismatch 0; both missing 0.5 (neutral).
//   - dateScore: same day 1.0; <=2d 0.9; <=7d 0.7; <=14d 0.4; else 0;
//     either date missing 0.5 (neutral).
func SimilarityScore(a, b DiscoveredMarket) float64 {
	nameScore := stringSimilarity(normalize.Name(a.MarktName), normalize.Name(b.MarktName))
	cityScore := citySimilarity(normalize.City(a.Stadt), normalize.City(b.Stadt))
	dateScore := dateProximity(a.StartDatum, b.StartDatum)
	// Name gates the bonuses; all-exact yields 1.0 * (0.6+0.2+0.2) = 1.0.
	return nameScore * (0.6 + 0.2*cityScore + 0.2*dateScore)
}

// citySimilarity scores two already-normalized city names: both empty is
// neutral (0.5), exact match is 1.0, a small typo earns partial credit
// (0.7), anything else is 0.
func citySimilarity(x, y string) float64 {
	if x == "" && y == "" {
		return 0.5 // neutral; rows without a city shouldn't dominate
	}
	if x == y {
		return 1.0
	}
	// Tolerate small typos: a high Levenshtein ratio earns partial credit.
	// Note stringSimilarity returns 0 when exactly one side is empty.
	if stringSimilarity(x, y) >= 0.8 {
		return 0.7
	}
	return 0
}

// dateProximity scores two start dates by how close together they are.
// Missing dates yield a neutral-ish 0.5 because nothing can be compared.
func dateProximity(x, y *time.Time) float64 {
	if x == nil || y == nil {
		return 0.5 // neutral-ish; can't compare
	}
	if sameDay(x, y) {
		return 1.0
	}
	gap := absDuration(x.Sub(*y))
	switch {
	case gap <= 2*24*time.Hour:
		return 0.9
	case gap <= 7*24*time.Hour:
		return 0.7
	case gap <= 14*24*time.Hour:
		return 0.4
	default:
		return 0
	}
}
// FindSimilar scores every candidate against target and returns those with
// SimilarityScore >= threshold, ordered by descending score. The target
// itself (matched by ID) is skipped. Suggested thresholds: 0.5 for "worth
// showing to an admin", 0.7 for "likely duplicate".
func FindSimilar(target DiscoveredMarket, candidates []DiscoveredMarket, threshold float64) []SimilarityMatch {
	matches := make([]SimilarityMatch, 0, len(candidates))
	for _, cand := range candidates {
		if cand.ID == target.ID {
			continue // never report the target as its own duplicate
		}
		if score := SimilarityScore(target, cand); score >= threshold {
			matches = append(matches, SimilarityMatch{Entry: cand, Score: score})
		}
	}
	sortMatchesDesc(matches)
	return matches
}
// sortMatchesDesc orders m by Score, highest first, in place. Insertion
// sort (shift variant) is plenty for the typical N < 200 here and keeps
// equal scores in their original (stable) order.
func sortMatchesDesc(m []SimilarityMatch) {
	for i := 1; i < len(m); i++ {
		cur := m[i]
		j := i
		for j > 0 && m[j-1].Score < cur.Score {
			m[j] = m[j-1]
			j--
		}
		m[j] = cur
	}
}
// stringSimilarity returns the Levenshtein ratio of two UTF-8 strings:
// 1 - dist/max(runeLen(a), runeLen(b)). It operates on runes so each
// multi-byte character counts as a single edit. Either string empty → 0.
func stringSimilarity(a, b string) float64 {
	if a == "" || b == "" {
		return 0
	}
	runesA, runesB := []rune(a), []rune(b)
	longest := max(len(runesA), len(runesB))
	return 1.0 - float64(levenshtein(runesA, runesB))/float64(longest)
}
// levenshtein returns the edit distance (insert/delete/substitute, unit
// cost) between two rune slices. Uses a single DP row plus one carried
// diagonal value, so memory is O(len(b)).
func levenshtein(a, b []rune) int {
	if len(a) == 0 {
		return len(b)
	}
	if len(b) == 0 {
		return len(a)
	}
	row := make([]int, len(b)+1)
	for j := range row {
		row[j] = j
	}
	for i := 1; i <= len(a); i++ {
		diag := row[0] // value of row[i-1][j-1] before overwrite
		row[0] = i
		for j := 1; j <= len(b); j++ {
			sub := diag // substitution cost base
			if a[i-1] != b[j-1] {
				sub++
			}
			diag = row[j]
			// deletion: row[j]+1, insertion: row[j-1]+1, substitution: sub
			row[j] = min(row[j]+1, row[j-1]+1, sub)
		}
	}
	return row[len(b)]
}
// minInt returns the smallest of one or more ints. An empty argument list
// panics; every caller passes at least one value.
func minInt(vals ...int) int {
	best := vals[0]
	for _, v := range vals {
		best = min(best, v)
	}
	return best
}
func sameDay(a, b *time.Time) bool {
return a.Year() == b.Year() && a.Month() == b.Month() && a.Day() == b.Day()
}
func absDuration(d time.Duration) time.Duration {
if d < 0 {
return -d
}
return d
}

View File

@@ -0,0 +1,84 @@
package discovery
import (
"testing"
"time"
"github.com/google/uuid"
)
// mkMarket builds a DiscoveredMarket test fixture with a fresh random ID.
// date must be in "2006-01-02" form; the parse error is deliberately
// ignored, so a malformed date silently yields the zero time.
func mkMarket(name, city, date string) DiscoveredMarket {
	t, _ := time.Parse("2006-01-02", date)
	return DiscoveredMarket{
		ID: uuid.New(),
		MarktName: name,
		Stadt: city,
		StartDatum: &t,
	}
}
// TestSimilarityScore checks the similarity heuristic against band
// expectations rather than exact values: "high" cases must score >= 0.8
// and "low" cases < 0.3, leaving room for tuning the formula's weights.
func TestSimilarityScore(t *testing.T) {
	tests := []struct {
		name string
		a, b DiscoveredMarket
		wantHigh bool // expect >= 0.8
		wantLow bool // expect < 0.3
	}{
		{
			name: "exact same event",
			a: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
			b: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
			wantHigh: true,
		},
		{
			// Relies on the name normalizer stripping prefix/suffix words so
			// both names compare equal after normalization.
			name: "same event with prefix/suffix swap (normalizer handles)",
			a: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
			b: mkMarket("Trostberger Mittelaltermarkt", "Trostberg", "2026-05-01"),
			wantHigh: true,
		},
		{
			name: "same event, date off by 1 day (typo)",
			a: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
			b: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-02"),
			wantHigh: true,
		},
		{
			// The multiplicative formula must suppress city/date coincidence
			// when the names clearly differ.
			name: "different event same city same date",
			a: mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01"),
			b: mkMarket("Weihnachtsmarkt", "Trostberg", "2026-05-01"),
			wantLow: true,
		},
		{
			name: "same name, different city, different date",
			a: mkMarket("Mittelaltermarkt Dresden", "Dresden", "2026-05-01"),
			b: mkMarket("Mittelaltermarkt Leipzig", "Leipzig", "2026-08-15"),
			wantLow: true,
		},
	}
	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			got := SimilarityScore(tc.a, tc.b)
			if tc.wantHigh && got < 0.8 {
				t.Errorf("score = %.2f; wanted >= 0.8 (high similarity)", got)
			}
			if tc.wantLow && got >= 0.3 {
				t.Errorf("score = %.2f; wanted < 0.3 (low similarity)", got)
			}
		})
	}
}
// TestFindSimilarSortsAndFilters verifies that FindSimilar keeps only the
// high-scoring candidate, drops low scorers, and never returns the target
// itself.
func TestFindSimilarSortsAndFilters(t *testing.T) {
	target := mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-01")
	near := mkMarket("Mittelaltermarkt Trostberg", "Trostberg", "2026-05-02") // should score high
	far := mkMarket("Weihnachtsmarkt Berlin", "Berlin", "2027-12-01")         // should score low
	self := DiscoveredMarket{ID: target.ID, MarktName: "self"}                // excluded by ID

	matches := FindSimilar(target, []DiscoveredMarket{far, near, self}, 0.5)

	if len(matches) != 1 {
		t.Fatalf("got %d matches; want exactly 1 (the near one)", len(matches))
	}
	if matches[0].Entry.MarktName != near.MarktName {
		t.Errorf("first match = %q; want near entry", matches[0].Entry.MarktName)
	}
}