feat(discovery/crawler): log unparseable suendenfrei entries at INFO

This commit is contained in:
2026-04-18 13:33:51 +02:00
parent 2163621415
commit 3aed982e1c

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"context"
"fmt"
"log/slog"
"regexp"
"strconv"
"strings"
@@ -14,12 +15,20 @@ import (
// SuendenfreiSource scrapes www.suendenfrei.tv/veranstaltungen. Events are
// <h3><a> with free-form text. A regex parses "<date-range> <name> [in <PLZ> <city>]".
// Unparseable entries are logged at INFO and skipped; Ship 2's local LLM is
// the long-term rescue for prose that doesn't fit the regex.
type SuendenfreiSource struct {
	fetcher *Fetcher // performs the HTTP retrieval — presumably shared across sources; confirm ownership at the caller
	baseURL string   // listing URL to crawl (NewSuendenfrei stores it verbatim; no normalization visible here)
}
// truncateForLog shortens s to at most n bytes for log output, appending
// "..." when anything was cut. n counts bytes, not runes (len semantics kept
// from the original). The cut point is backed off to a UTF-8 rune boundary so
// multi-byte characters (umlauts in German event headers) are never split,
// which would otherwise put invalid UTF-8 into the log stream. A negative n
// is clamped to 0 rather than panicking on the slice.
func truncateForLog(s string, n int) string {
	if len(s) <= n {
		return s
	}
	if n < 0 {
		n = 0
	}
	// Back off until s[:n] ends on a complete rune: s[n] must be the first
	// byte of a rune (not a 0b10xxxxxx continuation byte).
	for n > 0 && !utf8.RuneStart(s[n]) {
		n--
	}
	return s[:n] + "..."
}
// NewSuendenfrei constructs a SuendenfreiSource that retrieves pages through
// f and crawls the listing at baseURL. Both arguments are stored as given.
func NewSuendenfrei(f *Fetcher, baseURL string) *SuendenfreiSource {
	src := &SuendenfreiSource{
		fetcher: f,
		baseURL: baseURL,
	}
	return src
}
@@ -82,6 +91,14 @@ func parseSuendenfreiPage(data []byte, sourceURL string) ([]RawEvent, bool) {
}
parsed, ok := parseSuendenfreiHeader(text)
if !ok {
// Unparseable entries are real — footer links, layout anchors,
// or event headers in a prose shape our regex doesn't cover.
// Log at INFO so operators can grep the count post-run without
// introducing a counter we'd have to thread through CrawlSummary.
slog.Info("suendenfrei: unparseable h3 anchor; skipping",
"source_url", sourceURL,
"text", truncateForLog(text, 120),
)
return
}
href, _ := a.Attr("href")