From 3aed982e1c3f484e2365f5adbbe0d6a9e95bc8d0 Mon Sep 17 00:00:00 2001 From: vikingowl Date: Sat, 18 Apr 2026 13:33:51 +0200 Subject: [PATCH] feat(discovery/crawler): log unparseable suendenfrei entries at INFO --- .../domain/discovery/crawler/suendenfrei.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/backend/internal/domain/discovery/crawler/suendenfrei.go b/backend/internal/domain/discovery/crawler/suendenfrei.go index 0cfc93c..c960d2f 100644 --- a/backend/internal/domain/discovery/crawler/suendenfrei.go +++ b/backend/internal/domain/discovery/crawler/suendenfrei.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "fmt" + "log/slog" "regexp" "strconv" "strings" @@ -14,12 +15,20 @@ import ( // SuendenfreiSource scrapes www.suendenfrei.tv/veranstaltungen. Events are //

with free-form text. A regex parses " [in ]".
-// Unparseable entries are silently skipped.
+// Unparseable entries are logged at INFO and skipped; Ship 2's local LLM is
+// the long-term rescue for prose that doesn't fit the regex.
 type SuendenfreiSource struct {
 	fetcher *Fetcher
 	baseURL string
 }
 
+func truncateForLog(s string, n int) string {
+	if len(s) <= n {
+		return s
+	}
+	return strings.ToValidUTF8(s[:n], "") + "..." // byte cut can split a multi-byte rune; drop the fragment
+}
+
 func NewSuendenfrei(f *Fetcher, baseURL string) *SuendenfreiSource {
 	return &SuendenfreiSource{fetcher: f, baseURL: baseURL}
 }
@@ -82,6 +91,14 @@ func parseSuendenfreiPage(data []byte, sourceURL string) ([]RawEvent, bool) {
 		}
 		parsed, ok := parseSuendenfreiHeader(text)
 		if !ok {
+			// Unparseable entries are real — footer links, layout anchors,
+			// or event headers in a prose shape our regex doesn't cover.
+			// Log at INFO so operators can grep the count post-run without
+			// introducing a counter we'd have to thread through CrawlSummary.
+			slog.Info("suendenfrei: unparseable h3 anchor; skipping",
+				"source_url", sourceURL,
+				"text", truncateForLog(text, 120),
+			)
 			return
 		}
 		href, _ := a.Attr("href")