feat(discovery/crawler): mittelalterkalender.info parser
This commit is contained in:
127
backend/internal/domain/discovery/crawler/mittelalterkalender.go
Normal file
127
backend/internal/domain/discovery/crawler/mittelalterkalender.go
Normal file
@@ -0,0 +1,127 @@
|
||||
package crawler
|
||||
|
||||
import (
	"bytes"
	"context"
	"fmt"
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)
|
||||
|
||||
// MittelalterkalenderSource scrapes www.mittelalterkalender.info. Page has
// twelve monthly <table>s; each has columns: Beginn | Ende | Titel | PLZ | Ort |
// [Details link].
type MittelalterkalenderSource struct {
	fetcher *Fetcher // shared HTTP client wrapper used for all page downloads
	urls    []string // listing-page URLs; fetched in order with a delay between requests
}
|
||||
|
||||
func NewMittelalterkalender(f *Fetcher, urls []string) *MittelalterkalenderSource {
|
||||
return &MittelalterkalenderSource{fetcher: f, urls: urls}
|
||||
}
|
||||
|
||||
// Name returns the stable source identifier used to tag events scraped by this source.
func (s *MittelalterkalenderSource) Name() string { return "mittelalterkalender" }
|
||||
|
||||
func (s *MittelalterkalenderSource) Fetch(ctx context.Context) ([]RawEvent, error) {
|
||||
var all []RawEvent
|
||||
for i, url := range s.urls {
|
||||
if i > 0 {
|
||||
if err := sleepCtx(ctx, 2*time.Second); err != nil {
|
||||
return all, err
|
||||
}
|
||||
}
|
||||
body, err := s.fetcher.Get(ctx, url, "")
|
||||
if err != nil {
|
||||
return all, fmt.Errorf("mittelalterkalender %s: %w", url, err)
|
||||
}
|
||||
events, err := parseMittelalterkalender(body, url)
|
||||
if err != nil {
|
||||
return all, fmt.Errorf("mittelalterkalender parse %s: %w", url, err)
|
||||
}
|
||||
all = append(all, events...)
|
||||
}
|
||||
return all, nil
|
||||
}
|
||||
|
||||
func parseMittelalterkalender(data []byte, sourceURL string) ([]RawEvent, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var events []RawEvent
|
||||
doc.Find("table tr").Each(func(_ int, tr *goquery.Selection) {
|
||||
cells := tr.Find("td")
|
||||
if cells.Length() < 5 {
|
||||
return
|
||||
}
|
||||
// First cell contains start date followed by " bis " span; extract just the date.
|
||||
beginnText := strings.TrimSpace(cells.Eq(0).Text())
|
||||
// Remove the "bis" suffix (cell text is "DD.MM.YYYY bis")
|
||||
if idx := strings.Index(beginnText, " bis"); idx > 0 {
|
||||
beginnText = beginnText[:idx]
|
||||
}
|
||||
beginn := strings.TrimSpace(beginnText)
|
||||
ende := strings.TrimSpace(cells.Eq(1).Text())
|
||||
titel := strings.TrimSpace(cells.Eq(2).Text())
|
||||
plz := strings.TrimSpace(cells.Eq(3).Text())
|
||||
ort := strings.TrimSpace(cells.Eq(4).Text())
|
||||
|
||||
if titel == "" || beginn == "" {
|
||||
return
|
||||
}
|
||||
start := parseDEDate(beginn)
|
||||
if start == nil {
|
||||
return
|
||||
}
|
||||
end := parseDEDate(ende)
|
||||
|
||||
detailURL := ""
|
||||
if cells.Length() >= 6 {
|
||||
href, ok := cells.Eq(5).Find("a").First().Attr("href")
|
||||
if ok {
|
||||
detailURL = resolveURL(sourceURL, strings.TrimSpace(href))
|
||||
}
|
||||
}
|
||||
|
||||
events = append(events, RawEvent{
|
||||
SourceName: "mittelalterkalender",
|
||||
SourceURL: sourceURL,
|
||||
DetailURL: detailURL,
|
||||
Name: titel,
|
||||
City: ort,
|
||||
PLZ: plz,
|
||||
Land: InferLand(plz),
|
||||
StartDate: start,
|
||||
EndDate: end,
|
||||
})
|
||||
})
|
||||
return events, nil
|
||||
}
|
||||
|
||||
// resolveURL joins a relative href against the source URL. Absolute URLs are
// returned untouched; empty input returns empty. Resolution is delegated to
// net/url so root-relative ("/path") and directory-relative ("file.php")
// hrefs are both handled per RFC 3986.
//
// The previous hand-rolled implementation sliced the source with a
// hard-coded len("https://"), which mangled the host for http:// sources.
func resolveURL(source, href string) string {
	if href == "" {
		return ""
	}
	// Fast path kept for byte-for-byte compatibility: absolute hrefs pass
	// through without re-serialization.
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}
	base, err := url.Parse(source)
	if err != nil {
		// Unparseable base: best effort, return the href as-is.
		return href
	}
	ref, err := url.Parse(href)
	if err != nil {
		return href
	}
	return base.ResolveReference(ref).String()
}
|
||||
@@ -0,0 +1,38 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMittelalterkalenderParse(t *testing.T) {
|
||||
data, err := os.ReadFile("testdata/mittelalterkalender.html")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
events, err := parseMittelalterkalender(data, "https://www.mittelalterkalender.info/mittelaltermarkt/mittelalterfeste-2026-nach-datum.php")
|
||||
if err != nil {
|
||||
t.Fatalf("parse: %v", err)
|
||||
}
|
||||
t.Logf("Parsed %d events", len(events))
|
||||
if len(events) < 10 {
|
||||
t.Fatalf("got %d events; expected at least 10", len(events))
|
||||
}
|
||||
e := events[0]
|
||||
if e.SourceName != "mittelalterkalender" {
|
||||
t.Errorf("SourceName = %q", e.SourceName)
|
||||
}
|
||||
if e.Name == "" {
|
||||
t.Error("Name empty")
|
||||
}
|
||||
if e.City == "" {
|
||||
t.Error("City empty")
|
||||
}
|
||||
if e.StartDate == nil {
|
||||
t.Error("StartDate nil")
|
||||
}
|
||||
// Land inferred from PLZ via InferLand.
|
||||
if e.Land == "" && e.PLZ != "" {
|
||||
t.Errorf("Land empty but PLZ=%q", e.PLZ)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user