feat(discovery/crawler): log unparseable suendenfrei entries at INFO

This commit is contained in:
2026-04-18 13:33:51 +02:00
parent 2163621415
commit 3aed982e1c

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"context"
"fmt"
"log/slog"
"regexp"
"strconv"
"strings"
@@ -14,12 +15,20 @@ import (
// SuendenfreiSource scrapes www.suendenfrei.tv/veranstaltungen. Events are
// <h3><a> with free-form text. A regex parses "<date-range> <name> [in <PLZ> <city>]".
// Unparseable entries are logged at INFO and skipped; Ship 2's local LLM is
// the long-term rescue for prose that doesn't fit the regex.
type SuendenfreiSource struct {
	fetcher *Fetcher // performs the HTTP retrieval — presumably shared across sources; confirm ownership at the caller
	baseURL string   // listing URL to crawl (NewSuendenfrei stores it verbatim; no normalization visible here)
}
// truncateForLog shortens s to at most n bytes for log output, appending
// "..." when anything was cut. n counts bytes, not runes (len semantics kept
// from the original). The cut point is backed off to a UTF-8 rune boundary so
// multi-byte characters (umlauts in German event headers) are never split,
// which would otherwise put invalid UTF-8 into the log stream. A negative n
// is clamped to 0 rather than panicking on the slice.
func truncateForLog(s string, n int) string {
	if len(s) <= n {
		return s
	}
	if n < 0 {
		n = 0
	}
	// Back off until s[:n] ends on a complete rune: s[n] must be the first
	// byte of a rune (not a 0b10xxxxxx continuation byte).
	for n > 0 && !utf8.RuneStart(s[n]) {
		n--
	}
	return s[:n] + "..."
}
// NewSuendenfrei constructs a SuendenfreiSource that retrieves pages through
// f and crawls the listing at baseURL. Both arguments are stored as given.
func NewSuendenfrei(f *Fetcher, baseURL string) *SuendenfreiSource {
	src := &SuendenfreiSource{
		fetcher: f,
		baseURL: baseURL,
	}
	return src
}
@@ -82,6 +91,14 @@ func parseSuendenfreiPage(data []byte, sourceURL string) ([]RawEvent, bool) {
}
parsed, ok := parseSuendenfreiHeader(text)
if !ok {
// Unparseable entries are real — footer links, layout anchors,
// or event headers in a prose shape our regex doesn't cover.
// Log at INFO so operators can grep the count post-run without
// introducing a counter we'd have to thread through CrawlSummary.
slog.Info("suendenfrei: unparseable h3 anchor; skipping",
"source_url", sourceURL,
"text", truncateForLog(text, 120),
)
return
}
href, _ := a.Attr("href")