feat(discovery/crawler): mittelalterkalender.info parser

This commit is contained in:
2026-04-18 12:24:49 +02:00
parent e6ec97c09d
commit 91c058105e
2 changed files with 165 additions and 0 deletions

View File

@@ -0,0 +1,127 @@
package crawler
import (
	"bytes"
	"context"
	"fmt"
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)
// MittelalterkalenderSource scrapes www.mittelalterkalender.info. Page has
// twelve monthly <table>s; each has columns: Beginn | Ende | Titel | PLZ | Ort |
// [Details link].
//
// Values are created with NewMittelalterkalender; Fetch walks the configured
// URLs and turns every table row it can parse into a RawEvent.
type MittelalterkalenderSource struct {
// fetcher performs the page downloads (its Get method is called once per URL).
fetcher *Fetcher
// urls lists the calendar pages to scrape, visited in slice order.
urls []string
}
// NewMittelalterkalender builds a source that downloads the given urls
// through f. Both arguments are stored as passed; the urls slice is not
// copied.
func NewMittelalterkalender(f *Fetcher, urls []string) *MittelalterkalenderSource {
	src := new(MittelalterkalenderSource)
	src.fetcher = f
	src.urls = urls
	return src
}
// Name returns the fixed identifier of this source, "mittelalterkalender".
// The same string is written to RawEvent.SourceName by the parser.
func (s *MittelalterkalenderSource) Name() string {
	return "mittelalterkalender"
}
// Fetch downloads every configured URL in order, parses each page with
// parseMittelalterkalender and returns the concatenated events.
//
// Between two consecutive requests it waits two seconds via sleepCtx (no wait
// before the first URL). Processing stops at the first failure: the events
// gathered from the pages handled so far are returned together with the
// error. Download and parse errors are wrapped with the offending URL; an
// error from sleepCtx is returned unwrapped.
func (s *MittelalterkalenderSource) Fetch(ctx context.Context) ([]RawEvent, error) {
	var collected []RawEvent

	for idx, pageURL := range s.urls {
		// Every request except the very first is preceded by a 2s pause.
		if idx != 0 {
			if waitErr := sleepCtx(ctx, 2*time.Second); waitErr != nil {
				return collected, waitErr
			}
		}

		html, getErr := s.fetcher.Get(ctx, pageURL, "")
		if getErr != nil {
			return collected, fmt.Errorf("mittelalterkalender %s: %w", pageURL, getErr)
		}

		pageEvents, parseErr := parseMittelalterkalender(html, pageURL)
		if parseErr != nil {
			return collected, fmt.Errorf("mittelalterkalender parse %s: %w", pageURL, parseErr)
		}

		collected = append(collected, pageEvents...)
	}

	return collected, nil
}
// parseMittelalterkalender turns the HTML of one calendar page into events.
//
// Every <tr> below a <table> that holds at least five <td> cells is treated
// as a candidate row with the cell order Beginn, Ende, Titel, PLZ, Ort and an
// optional sixth cell carrying the details link. A row is dropped when its
// title or start text is empty, or when parseDEDate cannot read the start
// date. The end date is passed through as whatever parseDEDate returns.
//
// sourceURL is recorded on each event and used as the base for resolving the
// details link. An error is returned only if the HTML document cannot be
// built.
func parseMittelalterkalender(data []byte, sourceURL string) ([]RawEvent, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
	if err != nil {
		return nil, err
	}

	var events []RawEvent
	doc.Find("table tr").Each(func(_ int, row *goquery.Selection) {
		tds := row.Find("td")
		numCells := tds.Length()
		if numCells < 5 {
			return
		}

		// text yields the whitespace-trimmed text content of column col.
		text := func(col int) string {
			return strings.TrimSpace(tds.Eq(col).Text())
		}

		// The start cell reads "DD.MM.YYYY bis"; keep only the part in
		// front of " bis" when that part is non-empty.
		beginn := text(0)
		if datePart, _, found := strings.Cut(beginn, " bis"); found && datePart != "" {
			beginn = strings.TrimSpace(datePart)
		}
		ende := text(1)
		titel := text(2)
		plz := text(3)
		ort := text(4)

		if beginn == "" || titel == "" {
			return
		}

		start := parseDEDate(beginn)
		if start == nil {
			return
		}
		end := parseDEDate(ende)

		// The details link is optional: the sixth cell may be missing, and
		// its first <a> may lack an href.
		var detailURL string
		if numCells >= 6 {
			if href, ok := tds.Eq(5).Find("a").First().Attr("href"); ok {
				detailURL = resolveURL(sourceURL, strings.TrimSpace(href))
			}
		}

		ev := RawEvent{
			SourceName: "mittelalterkalender",
			SourceURL:  sourceURL,
			DetailURL:  detailURL,
			Name:       titel,
			City:       ort,
			PLZ:        plz,
			Land:       InferLand(plz),
			StartDate:  start,
			EndDate:    end,
		}
		events = append(events, ev)
	})

	return events, nil
}
// resolveURL resolves href against the page URL source and returns the
// resulting absolute URL.
//
//   - An empty href yields "".
//   - An href that already starts with "http://" or "https://" is returned
//     unchanged.
//   - Anything else (root-relative "/x", directory-relative "x" or "../x",
//     protocol-relative "//host/x", query-only "?a=b") is resolved with
//     net/url, which follows RFC 3986 reference resolution. This keeps the
//     scheme and host of source regardless of scheme length, ignores slashes
//     inside the query string of source, and collapses "." / ".." segments.
//
// If source or href cannot be parsed as a URL, href is returned as-is so the
// caller keeps the raw link instead of a mangled one.
func resolveURL(source, href string) string {
	if href == "" {
		return ""
	}
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}

	base, err := url.Parse(source)
	if err != nil {
		return href
	}
	ref, err := url.Parse(href)
	if err != nil {
		return href
	}
	return base.ResolveReference(ref).String()
}

View File

@@ -0,0 +1,38 @@
package crawler
import (
"os"
"testing"
)
// TestMittelalterkalenderParse runs the parser over the saved HTML fixture
// and checks that at least ten events come out and that the first one has
// the expected source name, a name, a city, a start date and — when a PLZ is
// present — an inferred Land.
func TestMittelalterkalenderParse(t *testing.T) {
	const (
		fixturePath = "testdata/mittelalterkalender.html"
		pageURL     = "https://www.mittelalterkalender.info/mittelaltermarkt/mittelalterfeste-2026-nach-datum.php"
	)

	html, readErr := os.ReadFile(fixturePath)
	if readErr != nil {
		t.Fatal(readErr)
	}

	events, parseErr := parseMittelalterkalender(html, pageURL)
	if parseErr != nil {
		t.Fatalf("parse: %v", parseErr)
	}

	count := len(events)
	t.Logf("Parsed %d events", count)
	if count < 10 {
		t.Fatalf("got %d events; expected at least 10", count)
	}

	// Spot-check the first event only.
	first := events[0]
	if got := first.SourceName; got != "mittelalterkalender" {
		t.Errorf("SourceName = %q", got)
	}
	if first.Name == "" {
		t.Error("Name empty")
	}
	if first.City == "" {
		t.Error("City empty")
	}
	if first.StartDate == nil {
		t.Error("StartDate nil")
	}
	// A non-empty PLZ must have produced a Land via InferLand.
	if first.PLZ != "" && first.Land == "" {
		t.Errorf("Land empty but PLZ=%q", first.PLZ)
	}
}