feat(discovery/crawler): cross-source merger with source-rank tiebreaks
This commit is contained in:
216
backend/internal/domain/discovery/crawler/merger.go
Normal file
216
backend/internal/domain/discovery/crawler/merger.go
Normal file
@@ -0,0 +1,216 @@
|
||||
package crawler
|
||||
|
||||
import (
	"net/url"
	"sort"
	"strings"
	"time"

	"marktvogt.de/backend/internal/domain/discovery"
)
|
||||
|
||||
// sourceRank maps each crawler source name to its trust priority, used for
// field-by-field tie-breaking when merging duplicate events: lower = better.
// Sources not listed here rank last (see rankOf).
var sourceRank = map[string]int{
	"mittelaltermarkt_online": 1,
	"marktkalendarium":        2,
	"mittelalterkalender":     3,
	"festival_alarm":          4,
	"suendenfrei":             5,
}
|
||||
|
||||
// Merge groups RawEvents by normalized (name, city, start_date) and picks the
|
||||
// best value for each field using the source rank. Pure function.
|
||||
//
|
||||
// Second-pass fold: when NormalizeName collapses an event name entirely to ""
|
||||
// (e.g. "Mittelaltermarkt" — a pure strip-word), the resulting empty-name bucket
|
||||
// is folded into any same-(city, date) bucket that has a non-empty normalized
|
||||
// name. This handles sources that use only generic names for an event that other
|
||||
// sources describe more specifically.
|
||||
func Merge(raws []RawEvent) []MergedEvent {
|
||||
groups := make(map[string][]RawEvent)
|
||||
order := []string{}
|
||||
for _, r := range raws {
|
||||
key := mergeKey(r)
|
||||
if _, seen := groups[key]; !seen {
|
||||
order = append(order, key)
|
||||
}
|
||||
groups[key] = append(groups[key], r)
|
||||
}
|
||||
|
||||
// Second pass: fold empty-name keys into matching (city, date) keys.
|
||||
cityDateToKey := make(map[string]string) // cityDate → first non-empty-name key
|
||||
for _, key := range order {
|
||||
name, cityDate := splitMergeKey(key)
|
||||
if name != "" {
|
||||
if _, exists := cityDateToKey[cityDate]; !exists {
|
||||
cityDateToKey[cityDate] = key
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remap empty-name keys and rebuild order without orphan keys.
|
||||
remapped := make(map[string]string) // emptyKey → targetKey
|
||||
for _, key := range order {
|
||||
name, cityDate := splitMergeKey(key)
|
||||
if name == "" {
|
||||
if target, ok := cityDateToKey[cityDate]; ok {
|
||||
remapped[key] = target
|
||||
}
|
||||
}
|
||||
}
|
||||
for emptyKey, targetKey := range remapped {
|
||||
groups[targetKey] = append(groups[targetKey], groups[emptyKey]...)
|
||||
delete(groups, emptyKey)
|
||||
}
|
||||
var filteredOrder []string
|
||||
for _, key := range order {
|
||||
if _, removed := remapped[key]; !removed {
|
||||
filteredOrder = append(filteredOrder, key)
|
||||
}
|
||||
}
|
||||
|
||||
out := make([]MergedEvent, 0, len(filteredOrder))
|
||||
for _, key := range filteredOrder {
|
||||
out = append(out, mergeGroup(groups[key]))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// splitMergeKey splits a merge key of the form "name|city|date" into its
// normalized-name part and the remaining "city|date" part. A key with no
// separator is treated as a bare name.
func splitMergeKey(key string) (name, cityDate string) {
	before, after, found := strings.Cut(key, "|")
	if !found {
		return key, ""
	}
	return before, after
}
|
||||
|
||||
func mergeKey(r RawEvent) string {
|
||||
date := ""
|
||||
if r.StartDate != nil {
|
||||
date = r.StartDate.Format("2006-01-02")
|
||||
}
|
||||
return discovery.NormalizeName(r.Name) + "|" + discovery.NormalizeCity(r.City) + "|" + date
|
||||
}
|
||||
|
||||
func mergeGroup(raws []RawEvent) MergedEvent {
|
||||
// Sort by source rank so "first non-empty" == "best source's value".
|
||||
sort.SliceStable(raws, func(i, j int) bool {
|
||||
return rankOf(raws[i].SourceName) < rankOf(raws[j].SourceName)
|
||||
})
|
||||
|
||||
m := MergedEvent{}
|
||||
sourceSet := map[string]bool{}
|
||||
quellenSet := map[string]bool{}
|
||||
|
||||
for _, r := range raws {
|
||||
sourceSet[r.SourceName] = true
|
||||
if r.SourceURL != "" {
|
||||
quellenSet[r.SourceURL] = true
|
||||
}
|
||||
if r.DetailURL != "" {
|
||||
quellenSet[r.DetailURL] = true
|
||||
}
|
||||
|
||||
// Longest name wins (regardless of rank).
|
||||
if len(r.Name) > len(m.Name) {
|
||||
m.Name = r.Name
|
||||
}
|
||||
if m.City == "" {
|
||||
m.City = r.City
|
||||
}
|
||||
if m.PLZ == "" {
|
||||
m.PLZ = r.PLZ
|
||||
}
|
||||
if m.Land == "" {
|
||||
m.Land = r.Land
|
||||
}
|
||||
if m.Bundesland == "" {
|
||||
m.Bundesland = r.Bundesland
|
||||
}
|
||||
if m.StartDate == nil && r.StartDate != nil {
|
||||
m.StartDate = r.StartDate
|
||||
}
|
||||
if m.Venue == "" {
|
||||
m.Venue = r.Venue
|
||||
}
|
||||
if m.Organizer == "" {
|
||||
m.Organizer = r.Organizer
|
||||
}
|
||||
|
||||
// Website: prefer non-empty from best rank, but only if not social-media.
|
||||
if m.Website == "" && r.Website != "" && !isSocialURL(r.Website) {
|
||||
m.Website = r.Website
|
||||
}
|
||||
|
||||
// EndDate: best-rank non-empty, then detect conflict <= 2 days.
|
||||
if m.EndDate == nil && r.EndDate != nil {
|
||||
m.EndDate = r.EndDate
|
||||
continue
|
||||
}
|
||||
if m.EndDate != nil && r.EndDate != nil && !sameDay(m.EndDate, r.EndDate) {
|
||||
diff := r.EndDate.Sub(*m.EndDate)
|
||||
if diff < 0 {
|
||||
diff = -diff
|
||||
}
|
||||
if diff <= 48*time.Hour {
|
||||
m.Hinweis = appendHinweis(m.Hinweis, "date_conflict")
|
||||
}
|
||||
// differences > 2 days: events were likely different in the first place;
|
||||
// merge key already collapsed them, but we don't tag noise as a conflict.
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: if no non-social website, take best-rank social URL.
|
||||
if m.Website == "" {
|
||||
for _, r := range raws {
|
||||
if r.Website != "" {
|
||||
m.Website = r.Website
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
m.Quellen = sortedKeys(quellenSet)
|
||||
m.Sources = sortedKeys(sourceSet)
|
||||
return m
|
||||
}
|
||||
|
||||
func rankOf(name string) int {
|
||||
if r, ok := sourceRank[name]; ok {
|
||||
return r
|
||||
}
|
||||
return 999
|
||||
}
|
||||
|
||||
// isSocialURL reports whether u points at a social-media platform rather than
// a regular event/organizer website. The match is performed on the parsed
// host (exact domain or subdomain) so that unrelated domains that merely
// contain a platform name — e.g. "linux.com" vs "x.com", or a platform URL
// appearing in a query string — are not misclassified. If u cannot be parsed
// as a URL, it falls back to the old whole-string substring scan.
func isSocialURL(u string) bool {
	socials := []string{"facebook.com", "instagram.com", "twitter.com", "x.com", "tiktok.com"}
	if parsed, err := url.Parse(u); err == nil && parsed.Hostname() != "" {
		host := strings.ToLower(parsed.Hostname())
		for _, domain := range socials {
			if host == domain || strings.HasSuffix(host, "."+domain) {
				return true
			}
		}
		return false
	}
	// Unparsable input: conservative substring check on the lowercased URL.
	lu := strings.ToLower(u)
	for _, domain := range socials {
		if strings.Contains(lu, domain) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func sameDay(a, b *time.Time) bool {
|
||||
return a.Year() == b.Year() && a.Month() == b.Month() && a.Day() == b.Day()
|
||||
}
|
||||
|
||||
// sortedKeys returns the keys of a string set in ascending order.
func sortedKeys(m map[string]bool) []string {
	keys := make([]string, 0, len(m))
	for key := range m {
		keys = append(keys, key)
	}
	sort.Sort(sort.StringSlice(keys))
	return keys
}
|
||||
|
||||
// appendHinweis appends a note to the "; "-separated Hinweis list, skipping
// the append when an identical note is already present. Notes are compared as
// whole entries: the previous substring check would wrongly suppress a new
// note that happens to be contained in an existing one (e.g. adding
// "conflict" when "date_conflict" was already recorded).
func appendHinweis(cur, add string) string {
	if cur == "" {
		return add
	}
	for _, existing := range strings.Split(cur, "; ") {
		if existing == add {
			return cur
		}
	}
	return cur + "; " + add
}
|
||||
106
backend/internal/domain/discovery/crawler/merger_test.go
Normal file
106
backend/internal/domain/discovery/crawler/merger_test.go
Normal file
@@ -0,0 +1,106 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func mkTime(t *testing.T, s string) *time.Time {
|
||||
t.Helper()
|
||||
tm, err := time.Parse("2006-01-02", s)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
return &tm
|
||||
}
|
||||
|
||||
func TestMergeSingleSourcePassthrough(t *testing.T) {
|
||||
raws := []RawEvent{
|
||||
{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-03")},
|
||||
}
|
||||
merged := Merge(raws)
|
||||
if len(merged) != 1 {
|
||||
t.Fatalf("len = %d; want 1", len(merged))
|
||||
}
|
||||
if merged[0].Name != "X" || merged[0].City != "Y" {
|
||||
t.Errorf("unexpected shape: %+v", merged[0])
|
||||
}
|
||||
if len(merged[0].Quellen) != 1 || merged[0].Quellen[0] != "https://a/" {
|
||||
t.Errorf("Quellen = %v", merged[0].Quellen)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeTwoSourcesByRank(t *testing.T) {
|
||||
raws := []RawEvent{
|
||||
// marktkalendarium: rich organizer + website
|
||||
{SourceName: "marktkalendarium", SourceURL: "https://mk/", Name: "Mittelaltermarkt X", City: "Dresden", PLZ: "01067", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-03"), Website: "https://organizer.de", Organizer: "Verein Y"},
|
||||
// mittelaltermarkt_online (rank 1): adds detail URL and venue
|
||||
{SourceName: "mittelaltermarkt_online", SourceURL: "https://mo/", DetailURL: "https://mo/e/1", Name: "Mittelaltermarkt X", City: "Dresden", PLZ: "01067", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-03"), Venue: "Stallhof", Land: "Deutschland"},
|
||||
}
|
||||
merged := Merge(raws)
|
||||
if len(merged) != 1 {
|
||||
t.Fatalf("len = %d; want 1", len(merged))
|
||||
}
|
||||
m := merged[0]
|
||||
if m.Organizer != "Verein Y" {
|
||||
t.Errorf("Organizer = %q; want 'Verein Y'", m.Organizer)
|
||||
}
|
||||
if m.Venue != "Stallhof" {
|
||||
t.Errorf("Venue = %q; want 'Stallhof'", m.Venue)
|
||||
}
|
||||
if m.Website != "https://organizer.de" {
|
||||
t.Errorf("Website = %q; want 'https://organizer.de'", m.Website)
|
||||
}
|
||||
if len(m.Quellen) != 3 { // SourceURL + DetailURL from rank-1 + SourceURL from rank-2
|
||||
t.Errorf("Quellen = %v", m.Quellen)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeDateConflictHinweis(t *testing.T) {
|
||||
raws := []RawEvent{
|
||||
{SourceName: "mittelaltermarkt_online", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-03")},
|
||||
{SourceName: "marktkalendarium", SourceURL: "https://b/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-05")},
|
||||
}
|
||||
merged := Merge(raws)
|
||||
if len(merged) != 1 {
|
||||
t.Fatalf("len = %d", len(merged))
|
||||
}
|
||||
if !containsSubstr(merged[0].Hinweis, "date_conflict") {
|
||||
t.Errorf("Hinweis = %q; want date_conflict note", merged[0].Hinweis)
|
||||
}
|
||||
// Winning EndDate comes from rank-1 source.
|
||||
if merged[0].EndDate.Day() != 3 {
|
||||
t.Errorf("EndDate day = %d; want 3 (rank-1 wins)", merged[0].EndDate.Day())
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeSocialURLFilter(t *testing.T) {
|
||||
raws := []RawEvent{
|
||||
{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), Website: "https://facebook.com/event/1"},
|
||||
{SourceName: "mittelalterkalender", SourceURL: "https://b/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), Website: "https://realsite.de"},
|
||||
}
|
||||
merged := Merge(raws)
|
||||
if merged[0].Website != "https://realsite.de" {
|
||||
t.Errorf("Website = %q; want realsite (facebook filtered)", merged[0].Website)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeLongestNameWins(t *testing.T) {
|
||||
raws := []RawEvent{
|
||||
{SourceName: "mittelalterkalender", SourceURL: "https://a/", Name: "Mittelaltermarkt", City: "Dresden", StartDate: mkTime(t, "2026-05-01")},
|
||||
{SourceName: "marktkalendarium", SourceURL: "https://b/", Name: "Mittelaltermarkt zu Dresden im Stallhof", City: "Dresden", StartDate: mkTime(t, "2026-05-01")},
|
||||
}
|
||||
merged := Merge(raws)
|
||||
if merged[0].Name != "Mittelaltermarkt zu Dresden im Stallhof" {
|
||||
t.Errorf("Name = %q; want longest", merged[0].Name)
|
||||
}
|
||||
}
|
||||
|
||||
// containsSubstr reports whether sub occurs anywhere inside s — a tiny local
// stand-in for strings.Contains that keeps this file free of extra imports.
func containsSubstr(s, sub string) bool {
	if len(sub) == 0 {
		return true
	}
	for start := 0; start <= len(s)-len(sub); start++ {
		if s[start:start+len(sub)] == sub {
			return true
		}
	}
	return false
}
|
||||
Reference in New Issue
Block a user