From c013f6bc540a58de15caa445a85e78191431cb3b Mon Sep 17 00:00:00 2001
From: vikingowl
Date: Sat, 18 Apr 2026 13:40:21 +0200
Subject: [PATCH] feat(discovery/crawler): cross-source merger with source-rank tiebreaks

---
 .../domain/discovery/crawler/merger.go        | 262 ++++++++++++++++++
 .../domain/discovery/crawler/merger_test.go   | 143 ++++++++++
 2 files changed, 405 insertions(+)
 create mode 100644 backend/internal/domain/discovery/crawler/merger.go
 create mode 100644 backend/internal/domain/discovery/crawler/merger_test.go

diff --git a/backend/internal/domain/discovery/crawler/merger.go b/backend/internal/domain/discovery/crawler/merger.go
new file mode 100644
index 0000000..8313c2c
--- /dev/null
+++ b/backend/internal/domain/discovery/crawler/merger.go
@@ -0,0 +1,262 @@
+package crawler
+
+import (
+	"sort"
+	"strings"
+	"time"
+
+	"marktvogt.de/backend/internal/domain/discovery"
+)
+
+// sourceRank: lower = better. Used for field-by-field tie-breaking.
+var sourceRank = map[string]int{
+	"mittelaltermarkt_online": 1,
+	"marktkalendarium":        2,
+	"mittelalterkalender":     3,
+	"festival_alarm":          4,
+	"suendenfrei":             5,
+}
+
+// socialDomains lists the domains treated as social-media links rather than
+// as an event's own website. isSocialURL matches them against the URL host.
+var socialDomains = []string{"facebook.com", "instagram.com", "twitter.com", "x.com", "tiktok.com"}
+
+// Merge groups RawEvents by normalized (name, city, start_date) and picks the
+// best value for each field using the source rank. Pure function.
+//
+// Second-pass fold: when NormalizeName collapses an event name entirely to ""
+// (e.g. "Mittelaltermarkt" — a pure strip-word), the resulting empty-name bucket
+// is folded into any same-(city, date) bucket that has a non-empty normalized
+// name. This handles sources that use only generic names for an event that other
+// sources describe more specifically.
+func Merge(raws []RawEvent) []MergedEvent {
+	groups := make(map[string][]RawEvent)
+	var order []string
+	for _, r := range raws {
+		key := mergeKey(r)
+		if _, seen := groups[key]; !seen {
+			order = append(order, key)
+		}
+		groups[key] = append(groups[key], r)
+	}
+
+	// Second pass: fold empty-name keys into matching (city, date) keys.
+	cityDateToKey := make(map[string]string) // cityDate → first non-empty-name key
+	for _, key := range order {
+		name, cityDate := splitMergeKey(key)
+		if name != "" {
+			if _, exists := cityDateToKey[cityDate]; !exists {
+				cityDateToKey[cityDate] = key
+			}
+		}
+	}
+	// Remap empty-name keys and rebuild order without orphan keys.
+	remapped := make(map[string]string) // emptyKey → targetKey
+	for _, key := range order {
+		name, cityDate := splitMergeKey(key)
+		if name == "" {
+			if target, ok := cityDateToKey[cityDate]; ok {
+				remapped[key] = target
+			}
+		}
+	}
+	for emptyKey, targetKey := range remapped {
+		groups[targetKey] = append(groups[targetKey], groups[emptyKey]...)
+		delete(groups, emptyKey)
+	}
+	var filteredOrder []string
+	for _, key := range order {
+		if _, removed := remapped[key]; !removed {
+			filteredOrder = append(filteredOrder, key)
+		}
+	}
+
+	out := make([]MergedEvent, 0, len(filteredOrder))
+	for _, key := range filteredOrder {
+		out = append(out, mergeGroup(groups[key]))
+	}
+	return out
+}
+
+// splitMergeKey returns the normalized-name part and the "city|date" part of a
+// merge key (format: "name|city|date").
+func splitMergeKey(key string) (name, cityDate string) {
+	idx := strings.Index(key, "|")
+	if idx < 0 {
+		return key, ""
+	}
+	return key[:idx], key[idx+1:]
+}
+
+// mergeKey builds the grouping key "name|city|date" from the normalized name,
+// the normalized city and the start date (YYYY-MM-DD, empty when nil).
+func mergeKey(r RawEvent) string {
+	date := ""
+	if r.StartDate != nil {
+		date = r.StartDate.Format("2006-01-02")
+	}
+	return discovery.NormalizeName(r.Name) + "|" + discovery.NormalizeCity(r.City) + "|" + date
+}
+
+// mergeGroup collapses the RawEvents of one merge bucket into a MergedEvent.
+// It sorts raws in place by source rank, so for most fields the first
+// non-empty value encountered comes from the best-ranked source.
+func mergeGroup(raws []RawEvent) MergedEvent {
+	// Sort by source rank so "first non-empty" == "best source's value".
+	sort.SliceStable(raws, func(i, j int) bool {
+		return rankOf(raws[i].SourceName) < rankOf(raws[j].SourceName)
+	})
+
+	m := MergedEvent{}
+	sourceSet := map[string]bool{}
+	quellenSet := map[string]bool{}
+
+	for _, r := range raws {
+		sourceSet[r.SourceName] = true
+		if r.SourceURL != "" {
+			quellenSet[r.SourceURL] = true
+		}
+		if r.DetailURL != "" {
+			quellenSet[r.DetailURL] = true
+		}
+
+		// Longest name wins (regardless of rank).
+		if len(r.Name) > len(m.Name) {
+			m.Name = r.Name
+		}
+		if m.City == "" {
+			m.City = r.City
+		}
+		if m.PLZ == "" {
+			m.PLZ = r.PLZ
+		}
+		if m.Land == "" {
+			m.Land = r.Land
+		}
+		if m.Bundesland == "" {
+			m.Bundesland = r.Bundesland
+		}
+		if m.StartDate == nil && r.StartDate != nil {
+			m.StartDate = r.StartDate
+		}
+		if m.Venue == "" {
+			m.Venue = r.Venue
+		}
+		if m.Organizer == "" {
+			m.Organizer = r.Organizer
+		}
+
+		// Website: prefer non-empty from best rank, but only if not social-media.
+		if m.Website == "" && r.Website != "" && !isSocialURL(r.Website) {
+			m.Website = r.Website
+		}
+
+		// EndDate: best-rank non-empty wins; later sources only flag a conflict
+		// when they disagree by at most 2 days. A switch (rather than an early
+		// continue) keeps any logic added below this point reachable.
+		switch {
+		case r.EndDate == nil:
+			// Nothing to contribute.
+		case m.EndDate == nil:
+			m.EndDate = r.EndDate
+		case !sameDay(m.EndDate, r.EndDate):
+			diff := r.EndDate.Sub(*m.EndDate)
+			if diff < 0 {
+				diff = -diff
+			}
+			if diff <= 48*time.Hour {
+				m.Hinweis = appendHinweis(m.Hinweis, "date_conflict")
+			}
+			// differences > 2 days: events were likely different in the first place;
+			// merge key already collapsed them, but we don't tag noise as a conflict.
+		}
+	}
+
+	// Fallback: if no non-social website, take best-rank social URL.
+	if m.Website == "" {
+		for _, r := range raws {
+			if r.Website != "" {
+				m.Website = r.Website
+				break
+			}
+		}
+	}
+
+	m.Quellen = sortedKeys(quellenSet)
+	m.Sources = sortedKeys(sourceSet)
+	return m
+}
+
+// rankOf returns the rank of a source name; unknown sources rank last (999).
+func rankOf(name string) int {
+	if r, ok := sourceRank[name]; ok {
+		return r
+	}
+	return 999
+}
+
+// isSocialURL reports whether u points at one of socialDomains. Only the host
+// is inspected, and it must equal a listed domain or be a subdomain of it, so
+// "https://www.facebook.com/x" matches while "https://netflix.com" (which
+// merely contains "x.com") and "https://a.de/?ref=facebook.com" do not.
+func isSocialURL(u string) bool {
+	host := urlHost(u)
+	for _, domain := range socialDomains {
+		if host == domain || strings.HasSuffix(host, "."+domain) {
+			return true
+		}
+	}
+	return false
+}
+
+// urlHost extracts the lower-cased host from a URL-like string. It tolerates a
+// missing scheme ("facebook.com/page") and drops userinfo, port, path, query
+// and fragment. String slicing is used instead of net/url so that scheme-less
+// values are handled the same way as full URLs.
+func urlHost(u string) string {
+	h := strings.ToLower(strings.TrimSpace(u))
+	// Treat "://" as a scheme separator only when it precedes the path/query;
+	// otherwise it belongs to an embedded URL such as "?next=https://...".
+	if i := strings.Index(h, "://"); i >= 0 && !strings.ContainsAny(h[:i], "/?#") {
+		h = h[i+3:]
+	}
+	h = strings.TrimPrefix(h, "//")
+	if i := strings.IndexAny(h, "/?#"); i >= 0 {
+		h = h[:i]
+	}
+	if i := strings.LastIndex(h, "@"); i >= 0 {
+		h = h[i+1:]
+	}
+	if i := strings.LastIndex(h, ":"); i >= 0 {
+		h = h[:i]
+	}
+	return strings.TrimSuffix(h, ".")
+}
+
+// sameDay reports whether a and b share year, month and day, each read in its
+// own location. Both pointers must be non-nil.
+func sameDay(a, b *time.Time) bool {
+	return a.Year() == b.Year() && a.Month() == b.Month() && a.Day() == b.Day()
+}
+
+// sortedKeys returns the keys of m in ascending order.
+func sortedKeys(m map[string]bool) []string {
+	out := make([]string, 0, len(m))
+	for k := range m {
+		out = append(out, k)
+	}
+	sort.Strings(out)
+	return out
+}
+
+// appendHinweis appends the note add to cur, separated by "; ", unless cur
+// already contains add.
+func appendHinweis(cur, add string) string {
+	if cur == "" {
+		return add
+	}
+	if strings.Contains(cur, add) {
+		return cur
+	}
+	return cur + "; " + add
+}
diff --git a/backend/internal/domain/discovery/crawler/merger_test.go b/backend/internal/domain/discovery/crawler/merger_test.go
new file mode 100644
index 0000000..759b76c
--- /dev/null
+++ b/backend/internal/domain/discovery/crawler/merger_test.go
@@ -0,0 +1,143 @@
+package crawler
+
+import (
+	"testing"
+	"time"
+)
+
+// mkTime parses a YYYY-MM-DD date and returns a pointer to it, failing the
+// test on malformed input.
+func mkTime(t *testing.T, s string) *time.Time {
+	t.Helper()
+	tm, err := time.Parse("2006-01-02", s)
+	if err != nil {
+		t.Fatal(err)
+	}
+	return &tm
+}
+
+// TestMergeSingleSourcePassthrough: a lone RawEvent keeps its name, city and source URL.
+func TestMergeSingleSourcePassthrough(t *testing.T) {
+	raws := []RawEvent{
+		{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-03")},
+	}
+	merged := Merge(raws)
+	if len(merged) != 1 {
+		t.Fatalf("len = %d; want 1", len(merged))
+	}
+	if merged[0].Name != "X" || merged[0].City != "Y" {
+		t.Errorf("unexpected shape: %+v", merged[0])
+	}
+	if len(merged[0].Quellen) != 1 || merged[0].Quellen[0] != "https://a/" {
+		t.Errorf("Quellen = %v", merged[0].Quellen)
+	}
+}
+
+// TestMergeTwoSourcesByRank: lower-ranked sources fill gaps left by rank 1.
+func TestMergeTwoSourcesByRank(t *testing.T) {
+	raws := []RawEvent{
+		// marktkalendarium: rich organizer + website
+		{SourceName: "marktkalendarium", SourceURL: "https://mk/", Name: "Mittelaltermarkt X", City: "Dresden", PLZ: "01067", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-03"), Website: "https://organizer.de", Organizer: "Verein Y"},
+		// mittelaltermarkt_online (rank 1): adds detail URL and venue
+		{SourceName: "mittelaltermarkt_online", SourceURL: "https://mo/", DetailURL: "https://mo/e/1", Name: "Mittelaltermarkt X", City: "Dresden", PLZ: "01067", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-03"), Venue: "Stallhof", Land: "Deutschland"},
+	}
+	merged := Merge(raws)
+	if len(merged) != 1 {
+		t.Fatalf("len = %d; want 1", len(merged))
+	}
+	m := merged[0]
+	if m.Organizer != "Verein Y" {
+		t.Errorf("Organizer = %q; want 'Verein Y'", m.Organizer)
+	}
+	if m.Venue != "Stallhof" {
+		t.Errorf("Venue = %q; want 'Stallhof'", m.Venue)
+	}
+	if m.Website != "https://organizer.de" {
+		t.Errorf("Website = %q; want 'https://organizer.de'", m.Website)
+	}
+	if len(m.Quellen) != 3 { // SourceURL + DetailURL from rank-1 + SourceURL from rank-2
+		t.Errorf("Quellen = %v", m.Quellen)
+	}
+}
+
+// TestMergeDateConflictHinweis: end dates two days apart are tagged as a conflict.
+func TestMergeDateConflictHinweis(t *testing.T) {
+	raws := []RawEvent{
+		{SourceName: "mittelaltermarkt_online", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-03")},
+		{SourceName: "marktkalendarium", SourceURL: "https://b/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-05")},
+	}
+	merged := Merge(raws)
+	if len(merged) != 1 {
+		t.Fatalf("len = %d", len(merged))
+	}
+	if !containsSubstr(merged[0].Hinweis, "date_conflict") {
+		t.Errorf("Hinweis = %q; want date_conflict note", merged[0].Hinweis)
+	}
+	// Winning EndDate comes from rank-1 source.
+	if merged[0].EndDate.Day() != 3 {
+		t.Errorf("EndDate day = %d; want 3 (rank-1 wins)", merged[0].EndDate.Day())
+	}
+}
+
+// TestMergeSocialURLFilter: a real website beats a better-ranked social URL.
+func TestMergeSocialURLFilter(t *testing.T) {
+	raws := []RawEvent{
+		{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), Website: "https://facebook.com/event/1"},
+		{SourceName: "mittelalterkalender", SourceURL: "https://b/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), Website: "https://realsite.de"},
+	}
+	merged := Merge(raws)
+	if len(merged) != 1 {
+		t.Fatalf("len = %d; want 1", len(merged))
+	}
+	if merged[0].Website != "https://realsite.de" {
+		t.Errorf("Website = %q; want realsite (facebook filtered)", merged[0].Website)
+	}
+}
+
+// TestMergeLongestNameWins: the longest name is kept regardless of source rank.
+func TestMergeLongestNameWins(t *testing.T) {
+	raws := []RawEvent{
+		{SourceName: "mittelalterkalender", SourceURL: "https://a/", Name: "Mittelaltermarkt", City: "Dresden", StartDate: mkTime(t, "2026-05-01")},
+		{SourceName: "marktkalendarium", SourceURL: "https://b/", Name: "Mittelaltermarkt zu Dresden im Stallhof", City: "Dresden", StartDate: mkTime(t, "2026-05-01")},
+	}
+	merged := Merge(raws)
+	if len(merged) != 1 {
+		t.Fatalf("len = %d; want 1", len(merged))
+	}
+	if merged[0].Name != "Mittelaltermarkt zu Dresden im Stallhof" {
+		t.Errorf("Name = %q; want longest", merged[0].Name)
+	}
+}
+
+// TestIsSocialURLMatchesHostOnly guards against substring false positives
+// such as "netflix.com" being classified as "x.com".
+func TestIsSocialURLMatchesHostOnly(t *testing.T) {
+	cases := []struct {
+		url  string
+		want bool
+	}{
+		{"https://facebook.com/event/1", true},
+		{"https://www.instagram.com/markt", true},
+		{"HTTP://X.com/markt", true},
+		{"twitter.com/markt", true},
+		{"https://netflix.com/title/1", false},
+		{"https://ritterbox.com", false},
+		{"https://markt.de/?ref=facebook.com", false},
+		{"https://facebook.com.markt.de/event", false},
+	}
+	for _, c := range cases {
+		if got := isSocialURL(c.url); got != c.want {
+			t.Errorf("isSocialURL(%q) = %v; want %v", c.url, got, c.want)
+		}
+	}
+}
+
+// containsSubstr reports whether sub occurs within s.
+func containsSubstr(s, sub string) bool {
+	for i := 0; i+len(sub) <= len(s); i++ {
+		if s[i:i+len(sub)] == sub {
+			return true
+		}
+	}
+	return false
+}