From 91c058105e88ac552f070a57bef47d9d753f5a52 Mon Sep 17 00:00:00 2001
From: vikingowl
Date: Sat, 18 Apr 2026 12:24:49 +0200
Subject: [PATCH] feat(discovery/crawler): mittelalterkalender.info parser

---
 .../discovery/crawler/mittelalterkalender.go  | 127 ++++++++++++++++++
 .../crawler/mittelalterkalender_test.go       |  38 ++++++
 2 files changed, 165 insertions(+)
 create mode 100644 backend/internal/domain/discovery/crawler/mittelalterkalender.go
 create mode 100644 backend/internal/domain/discovery/crawler/mittelalterkalender_test.go

diff --git a/backend/internal/domain/discovery/crawler/mittelalterkalender.go b/backend/internal/domain/discovery/crawler/mittelalterkalender.go
new file mode 100644
index 0000000..9ce64e7
--- /dev/null
+++ b/backend/internal/domain/discovery/crawler/mittelalterkalender.go
@@ -0,0 +1,127 @@
+package crawler
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// MittelalterkalenderSource scrapes www.mittelalterkalender.info. Page has
+// twelve monthly <table>s; each has columns: Beginn | Ende | Titel | PLZ | Ort |
+// [Details link].
+type MittelalterkalenderSource struct {
+	fetcher *Fetcher
+	urls    []string
+}
+
+func NewMittelalterkalender(f *Fetcher, urls []string) *MittelalterkalenderSource {
+	return &MittelalterkalenderSource{fetcher: f, urls: urls}
+}
+
+func (s *MittelalterkalenderSource) Name() string { return "mittelalterkalender" }
+
+func (s *MittelalterkalenderSource) Fetch(ctx context.Context) ([]RawEvent, error) {
+	var all []RawEvent
+	for i, url := range s.urls {
+		if i > 0 {
+			if err := sleepCtx(ctx, 2*time.Second); err != nil {
+				return all, err
+			}
+		}
+		body, err := s.fetcher.Get(ctx, url, "")
+		if err != nil {
+			return all, fmt.Errorf("mittelalterkalender %s: %w", url, err)
+		}
+		events, err := parseMittelalterkalender(body, url)
+		if err != nil {
+			return all, fmt.Errorf("mittelalterkalender parse %s: %w", url, err)
+		}
+		all = append(all, events...)
+	}
+	return all, nil
+}
+
+func parseMittelalterkalender(data []byte, sourceURL string) ([]RawEvent, error) {
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
+	if err != nil {
+		return nil, err
+	}
+	var events []RawEvent
+	doc.Find("table tr").Each(func(_ int, tr *goquery.Selection) {
+		cells := tr.Find("td")
+		if cells.Length() < 5 {
+			return
+		}
+		// First cell contains start date followed by " bis " span; extract just the date.
+		beginnText := strings.TrimSpace(cells.Eq(0).Text())
+		// Remove the "bis" suffix (cell text is "DD.MM.YYYY bis")
+		if idx := strings.Index(beginnText, " bis"); idx > 0 {
+			beginnText = beginnText[:idx]
+		}
+		beginn := strings.TrimSpace(beginnText)
+		ende := strings.TrimSpace(cells.Eq(1).Text())
+		titel := strings.TrimSpace(cells.Eq(2).Text())
+		plz := strings.TrimSpace(cells.Eq(3).Text())
+		ort := strings.TrimSpace(cells.Eq(4).Text())
+
+		if titel == "" || beginn == "" {
+			return
+		}
+		start := parseDEDate(beginn)
+		if start == nil {
+			return
+		}
+		end := parseDEDate(ende)
+
+		detailURL := ""
+		if cells.Length() >= 6 {
+			href, ok := cells.Eq(5).Find("a").First().Attr("href")
+			if ok {
+				detailURL = resolveURL(sourceURL, strings.TrimSpace(href))
+			}
+		}
+
+		events = append(events, RawEvent{
+			SourceName: "mittelalterkalender",
+			SourceURL:  sourceURL,
+			DetailURL:  detailURL,
+			Name:       titel,
+			City:       ort,
+			PLZ:        plz,
+			Land:       InferLand(plz),
+			StartDate:  start,
+			EndDate:    end,
+		})
+	})
+	return events, nil
+}
+
+// resolveURL joins a relative href against the source URL. Leaves absolute
+// URLs untouched; empty input returns empty.
+func resolveURL(source, href string) string {
+	if href == "" {
+		return ""
+	}
+	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
+		return href
+	}
+	if strings.HasPrefix(href, "/") {
+		// Strip path from source, keep scheme://host.
+		// Simple impl — source is always a full URL from our config.
+		end := strings.Index(source[len("https://"):], "/")
+		if end < 0 {
+			return source + href
+		}
+		return source[:len("https://")+end] + href
+	}
+	// Relative to current dir — drop filename from source.
+	lastSlash := strings.LastIndex(source, "/")
+	if lastSlash < 0 {
+		return source + "/" + href
+	}
+	return source[:lastSlash+1] + href
+}
diff --git a/backend/internal/domain/discovery/crawler/mittelalterkalender_test.go b/backend/internal/domain/discovery/crawler/mittelalterkalender_test.go
new file mode 100644
index 0000000..0b0b926
--- /dev/null
+++ b/backend/internal/domain/discovery/crawler/mittelalterkalender_test.go
@@ -0,0 +1,38 @@
+package crawler
+
+import (
+	"os"
+	"testing"
+)
+
+func TestMittelalterkalenderParse(t *testing.T) {
+	data, err := os.ReadFile("testdata/mittelalterkalender.html")
+	if err != nil {
+		t.Fatal(err)
+	}
+	events, err := parseMittelalterkalender(data, "https://www.mittelalterkalender.info/mittelaltermarkt/mittelalterfeste-2026-nach-datum.php")
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	t.Logf("Parsed %d events", len(events))
+	if len(events) < 10 {
+		t.Fatalf("got %d events; expected at least 10", len(events))
+	}
+	e := events[0]
+	if e.SourceName != "mittelalterkalender" {
+		t.Errorf("SourceName = %q", e.SourceName)
+	}
+	if e.Name == "" {
+		t.Error("Name empty")
+	}
+	if e.City == "" {
+		t.Error("City empty")
+	}
+	if e.StartDate == nil {
+		t.Error("StartDate nil")
+	}
+	// Land inferred from PLZ via InferLand.
+	if e.Land == "" && e.PLZ != "" {
+		t.Errorf("Land empty but PLZ=%q", e.PLZ)
+	}
+}