feat(discovery/crawler): cross-source merger with source-rank tiebreaks

This commit is contained in:
2026-04-18 13:40:21 +02:00
parent 3aed982e1c
commit c013f6bc54
2 changed files with 322 additions and 0 deletions

View File

@@ -0,0 +1,216 @@
package crawler
import (
"sort"
"strings"
"time"
"marktvogt.de/backend/internal/domain/discovery"
)
// sourceRank assigns every known crawler source a priority: the lower the
// number, the better the source. mergeGroup sorts a group's events by this rank
// (looked up through rankOf) so that, field by field, the first non-empty value
// it encounters belongs to the best-ranked source. Source names that are missing
// from this map are ranked last by rankOf.
var sourceRank = map[string]int{
"mittelaltermarkt_online": 1,
"marktkalendarium": 2,
"mittelalterkalender": 3,
"festival_alarm": 4,
"suendenfrei": 5,
}
// Merge collapses RawEvents that describe the same event into one MergedEvent
// each. Events are bucketed by their merge key (normalized name, normalized
// city, start date) and every bucket is reduced by mergeGroup, which resolves
// the individual fields by source rank. The result follows the order in which
// the surviving keys first appeared in raws; raws itself is not modified.
//
// Nameless buckets: when discovery.NormalizeName reduces a name to "" (the
// original author's example is "Mittelaltermarkt", a purely generic name), the
// bucket has no name part in its key. Such a bucket is folded into the first
// bucket seen for the same (city, date) that does have a non-empty normalized
// name, so a source using only a generic name still merges with sources that
// describe the event more specifically. A nameless bucket without such a
// partner is kept as its own event.
func Merge(raws []RawEvent) []MergedEvent {
	// Pass 1: bucket by merge key and remember the first-seen key order.
	buckets := make(map[string][]RawEvent)
	var keys []string
	for _, raw := range raws {
		k := mergeKey(raw)
		if _, known := buckets[k]; !known {
			keys = append(keys, k)
		}
		buckets[k] = append(buckets[k], raw)
	}

	// Pass 2: for each "city|date" remainder, note the first key with a name.
	namedFor := make(map[string]string)
	for _, k := range keys {
		name, rest := splitMergeKey(k)
		if name == "" {
			continue
		}
		if _, taken := namedFor[rest]; !taken {
			namedFor[rest] = k
		}
	}

	// Pass 3: fold each nameless bucket into its named partner and keep every
	// other key. A remainder has at most one nameless key and a fold target is
	// never nameless itself, so folding while walking the keys is safe.
	var kept []string
	for _, k := range keys {
		name, rest := splitMergeKey(k)
		if name == "" {
			if target, found := namedFor[rest]; found {
				buckets[target] = append(buckets[target], buckets[k]...)
				delete(buckets, k)
				continue
			}
		}
		kept = append(kept, k)
	}

	out := make([]MergedEvent, 0, len(kept))
	for _, k := range kept {
		out = append(out, mergeGroup(buckets[k]))
	}
	return out
}
// splitMergeKey divides a merge key of the form "name|city|date" at its first
// "|": the text before it is the normalized name, the text after it is the
// "city|date" remainder. A key without any "|" is returned whole as the name,
// with an empty remainder.
func splitMergeKey(key string) (name, cityDate string) {
	name, cityDate, _ = strings.Cut(key, "|")
	return name, cityDate
}
// mergeKey builds the grouping key "name|city|date" for r. Name and city are
// passed through discovery.NormalizeName and discovery.NormalizeCity; the date
// is the start date formatted as YYYY-MM-DD, or empty when r has no start date.
func mergeKey(r RawEvent) string {
	var day string
	if start := r.StartDate; start != nil {
		day = start.Format("2006-01-02")
	}
	parts := []string{
		discovery.NormalizeName(r.Name),
		discovery.NormalizeCity(r.City),
		day,
	}
	return strings.Join(parts, "|")
}
// mergeGroup reduces all RawEvents that share one merge key to a single
// MergedEvent.
//
// The slice is first stably sorted in place by source rank (see rankOf), so
// walking it front to back visits the best source first. Field rules:
//   - Name: the longest name (measured in bytes) wins regardless of rank; on
//     equal length the better-ranked source keeps it.
//   - City, PLZ, Land, Bundesland, Venue, Organizer, StartDate: first non-empty
//     value in rank order.
//   - Website: first non-empty URL in rank order that is not a social-media
//     link; if every URL is social, the best-ranked one is used instead.
//   - EndDate: first non-nil value in rank order. A later source whose EndDate
//     falls on a different calendar day but within 48 hours of it adds the
//     "date_conflict" note to Hinweis; larger gaps are deliberately not flagged.
//   - Sources and Quellen: sorted, de-duplicated source names and
//     source/detail URLs of all contributing events.
func mergeGroup(raws []RawEvent) MergedEvent {
	// Best source first; the stable sort keeps input order among equal ranks.
	sort.SliceStable(raws, func(a, b int) bool {
		return rankOf(raws[a].SourceName) < rankOf(raws[b].SourceName)
	})

	var (
		merged      MergedEvent
		anyWebsite  string // best-ranked website of any kind, social or not
		seenSources = make(map[string]bool)
		seenURLs    = make(map[string]bool)
	)
	for _, ev := range raws {
		seenSources[ev.SourceName] = true
		for _, link := range []string{ev.SourceURL, ev.DetailURL} {
			if link != "" {
				seenURLs[link] = true
			}
		}

		// Strict ">" means a tie in length is kept by the better-ranked source.
		if len(ev.Name) > len(merged.Name) {
			merged.Name = ev.Name
		}
		if merged.City == "" {
			merged.City = ev.City
		}
		if merged.PLZ == "" {
			merged.PLZ = ev.PLZ
		}
		if merged.Land == "" {
			merged.Land = ev.Land
		}
		if merged.Bundesland == "" {
			merged.Bundesland = ev.Bundesland
		}
		if merged.StartDate == nil && ev.StartDate != nil {
			merged.StartDate = ev.StartDate
		}
		if merged.Venue == "" {
			merged.Venue = ev.Venue
		}
		if merged.Organizer == "" {
			merged.Organizer = ev.Organizer
		}

		if ev.Website != "" {
			// Remember the first website of any kind for the fallback below.
			if anyWebsite == "" {
				anyWebsite = ev.Website
			}
			if merged.Website == "" && !isSocialURL(ev.Website) {
				merged.Website = ev.Website
			}
		}

		switch {
		case ev.EndDate == nil:
			// This source has no opinion on the end date.
		case merged.EndDate == nil:
			merged.EndDate = ev.EndDate
		case !sameDay(merged.EndDate, ev.EndDate):
			gap := ev.EndDate.Sub(*merged.EndDate)
			if gap < 0 {
				gap = -gap
			}
			// Gaps above two days suggest the sources describe different
			// events that the merge key happened to collapse; tagging those
			// would only add noise.
			if gap <= 48*time.Hour {
				merged.Hinweis = appendHinweis(merged.Hinweis, "date_conflict")
			}
		}
	}

	// No non-social website anywhere: settle for the best-ranked social one.
	if merged.Website == "" {
		merged.Website = anyWebsite
	}
	merged.Quellen = sortedKeys(seenURLs)
	merged.Sources = sortedKeys(seenSources)
	return merged
}
// rankOf returns the priority of the source called name as listed in
// sourceRank (lower is better). Sources that are not listed get 999, which
// sorts them after every known source.
func rankOf(name string) int {
	rank, known := sourceRank[name]
	if !known {
		return 999
	}
	return rank
}
// isSocialURL reports whether u points at a social-media site (Facebook,
// Instagram, Twitter/X or TikTok).
//
// Only the host part of u is inspected, and it must either equal one of the
// known domains or be a subdomain of it ("www.facebook.com", "m.facebook.com").
// Matching the domain as a plain substring of the whole URL would misclassify
// unrelated sites whose host merely ends in the same letters ("netflix.com"
// contains "x.com") as well as URLs that mention a social domain only in their
// path or query. The comparison is case-insensitive, and URLs without a scheme
// ("facebook.com/page") are handled too.
func isSocialURL(u string) bool {
	host := strings.ToLower(strings.TrimSpace(u))

	// Drop the scheme. A "://" that shows up after a path, query or fragment
	// delimiter belongs to an embedded URL, not to a scheme.
	if i := strings.Index(host, "://"); i >= 0 && !strings.ContainsAny(host[:i], "/?#") {
		host = host[i+len("://"):]
	} else {
		host = strings.TrimPrefix(host, "//")
	}
	// Keep only the authority, then strip userinfo and port from it.
	if i := strings.IndexAny(host, "/?#"); i >= 0 {
		host = host[:i]
	}
	if i := strings.LastIndex(host, "@"); i >= 0 {
		host = host[i+1:]
	}
	if i := strings.LastIndex(host, ":"); i >= 0 {
		host = host[:i]
	}
	host = strings.TrimSuffix(host, ".")

	for _, domain := range []string{"facebook.com", "instagram.com", "twitter.com", "x.com", "tiktok.com"} {
		if host == domain || strings.HasSuffix(host, "."+domain) {
			return true
		}
	}
	return false
}
// sameDay reports whether a and b carry the same calendar date (year, month
// and day), each read in its own time zone. Both pointers must be non-nil.
func sameDay(a, b *time.Time) bool {
	ay, am, ad := a.Date()
	by, bm, bd := b.Date()
	return ay == by && am == bm && ad == bd
}
// sortedKeys returns every key of m in ascending order, whatever value the key
// maps to. An empty or nil map yields an empty, non-nil slice.
func sortedKeys(m map[string]bool) []string {
	keys := make([]string, len(m))
	next := 0
	for key := range m {
		keys[next] = key
		next++
	}
	sort.Strings(keys)
	return keys
}
// appendHinweis adds the note add to cur, a list of notes separated by "; ",
// and returns the new list. The list is returned unchanged when add is empty
// or already present.
//
// Presence is checked against whole, separator-bounded notes. A plain substring
// test would wrongly treat "date_conflict" as already present in a list that
// only holds "end_date_conflict" and silently drop the new note.
func appendHinweis(cur, add string) string {
	const sep = "; "
	switch {
	case add == "":
		return cur
	case cur == "":
		return add
	}
	// Padding both sides with the separator makes every note, including the
	// first and the last, match only as a complete list entry.
	if strings.Contains(sep+cur+sep, sep+add+sep) {
		return cur
	}
	return cur + sep + add
}

View File

@@ -0,0 +1,106 @@
package crawler
import (
"testing"
"time"
)
// mkTime parses s as a YYYY-MM-DD date and returns a pointer to the resulting
// time. A string that does not parse aborts the calling test.
func mkTime(t *testing.T, s string) *time.Time {
	t.Helper()
	const layout = "2006-01-02"
	day := new(time.Time)
	var err error
	if *day, err = time.Parse(layout, s); err != nil {
		t.Fatal(err)
	}
	return day
}
// TestMergeSingleSourcePassthrough checks that a lone RawEvent comes out of
// Merge as exactly one MergedEvent carrying its name, its city and its source
// URL as the only Quellen entry.
func TestMergeSingleSourcePassthrough(t *testing.T) {
	only := RawEvent{
		SourceName: "marktkalendarium",
		SourceURL:  "https://a/",
		Name:       "X",
		City:       "Y",
		StartDate:  mkTime(t, "2026-05-01"),
		EndDate:    mkTime(t, "2026-05-03"),
	}

	got := Merge([]RawEvent{only})
	if n := len(got); n != 1 {
		t.Fatalf("len = %d; want 1", n)
	}

	ev := got[0]
	if ev.Name != "X" || ev.City != "Y" {
		t.Errorf("unexpected shape: %+v", ev)
	}
	if q := ev.Quellen; len(q) != 1 || q[0] != "https://a/" {
		t.Errorf("Quellen = %v", q)
	}
}
// TestMergeTwoSourcesByRank feeds Merge the same event from two sources that
// each know different fields and checks that the merged event combines them:
// organizer and website from one source, venue from the other, and the source
// and detail URLs of both in Quellen.
func TestMergeTwoSourcesByRank(t *testing.T) {
	// marktkalendarium (rank 2): rich organizer + website.
	rich := RawEvent{
		SourceName: "marktkalendarium",
		SourceURL:  "https://mk/",
		Name:       "Mittelaltermarkt X",
		City:       "Dresden",
		PLZ:        "01067",
		StartDate:  mkTime(t, "2026-05-01"),
		EndDate:    mkTime(t, "2026-05-03"),
		Website:    "https://organizer.de",
		Organizer:  "Verein Y",
	}
	// mittelaltermarkt_online (rank 1): adds detail URL and venue.
	top := RawEvent{
		SourceName: "mittelaltermarkt_online",
		SourceURL:  "https://mo/",
		DetailURL:  "https://mo/e/1",
		Name:       "Mittelaltermarkt X",
		City:       "Dresden",
		PLZ:        "01067",
		StartDate:  mkTime(t, "2026-05-01"),
		EndDate:    mkTime(t, "2026-05-03"),
		Venue:      "Stallhof",
		Land:       "Deutschland",
	}

	got := Merge([]RawEvent{rich, top})
	if n := len(got); n != 1 {
		t.Fatalf("len = %d; want 1", n)
	}

	ev := got[0]
	if ev.Organizer != "Verein Y" {
		t.Errorf("Organizer = %q; want 'Verein Y'", ev.Organizer)
	}
	if ev.Venue != "Stallhof" {
		t.Errorf("Venue = %q; want 'Stallhof'", ev.Venue)
	}
	if ev.Website != "https://organizer.de" {
		t.Errorf("Website = %q; want 'https://organizer.de'", ev.Website)
	}
	// SourceURL + DetailURL from rank 1, plus SourceURL from rank 2.
	if len(ev.Quellen) != 3 {
		t.Errorf("Quellen = %v", ev.Quellen)
	}
}
// TestMergeDateConflictHinweis merges one event reported by two sources whose
// end dates lie two days apart. The merged event must carry the
// "date_conflict" note and keep the end date of the better-ranked source.
func TestMergeDateConflictHinweis(t *testing.T) {
	raws := []RawEvent{
		{SourceName: "mittelaltermarkt_online", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-03")},
		{SourceName: "marktkalendarium", SourceURL: "https://b/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), EndDate: mkTime(t, "2026-05-05")},
	}
	merged := Merge(raws)
	if len(merged) != 1 {
		t.Fatalf("len = %d", len(merged))
	}
	got := merged[0]
	if !containsSubstr(got.Hinweis, "date_conflict") {
		t.Errorf("Hinweis = %q; want date_conflict note", got.Hinweis)
	}
	// Guard the dereference below so that a dropped EndDate is reported as a
	// readable failure instead of a nil-pointer panic.
	if got.EndDate == nil {
		t.Fatal("EndDate = nil; want 2026-05-03 (rank-1 wins)")
	}
	// Winning EndDate comes from the rank-1 source.
	if got.EndDate.Day() != 3 {
		t.Errorf("EndDate day = %d; want 3 (rank-1 wins)", got.EndDate.Day())
	}
}
// TestMergeSocialURLFilter checks that a real website from a lower-ranked
// source is preferred over a Facebook link from a better-ranked one.
func TestMergeSocialURLFilter(t *testing.T) {
	raws := []RawEvent{
		{SourceName: "marktkalendarium", SourceURL: "https://a/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), Website: "https://facebook.com/event/1"},
		{SourceName: "mittelalterkalender", SourceURL: "https://b/", Name: "X", City: "Y", StartDate: mkTime(t, "2026-05-01"), Website: "https://realsite.de"},
	}
	merged := Merge(raws)
	// Fail with a message rather than an index-out-of-range panic if the two
	// events were not merged into exactly one.
	if len(merged) != 1 {
		t.Fatalf("len = %d; want 1", len(merged))
	}
	if merged[0].Website != "https://realsite.de" {
		t.Errorf("Website = %q; want realsite (facebook filtered)", merged[0].Website)
	}
}
// TestMergeLongestNameWins merges a generically named event with a more
// specifically named one for the same city and date and checks that the
// longer name ends up on the merged event.
func TestMergeLongestNameWins(t *testing.T) {
	raws := []RawEvent{
		{SourceName: "mittelalterkalender", SourceURL: "https://a/", Name: "Mittelaltermarkt", City: "Dresden", StartDate: mkTime(t, "2026-05-01")},
		{SourceName: "marktkalendarium", SourceURL: "https://b/", Name: "Mittelaltermarkt zu Dresden im Stallhof", City: "Dresden", StartDate: mkTime(t, "2026-05-01")},
	}
	merged := Merge(raws)
	// Fail with a message rather than an index-out-of-range panic if the two
	// events were not merged into exactly one.
	if len(merged) != 1 {
		t.Fatalf("len = %d; want 1", len(merged))
	}
	if merged[0].Name != "Mittelaltermarkt zu Dresden im Stallhof" {
		t.Errorf("Name = %q; want longest", merged[0].Name)
	}
}
// containsSubstr reports whether sub occurs anywhere in s. The empty string is
// contained in every string.
func containsSubstr(s, sub string) bool {
	width := len(sub)
	// last is the final start offset at which sub could still fit; it is
	// negative when sub is longer than s, so the loop body never runs.
	last := len(s) - width
	for start := 0; start <= last; start++ {
		if s[start:start+width] == sub {
			return true
		}
	}
	return false
}