feat(discovery/crawler): log unparseable suendenfrei entries at INFO
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
||||
	"bytes"
	"context"
	"fmt"
	"log/slog"
	"regexp"
	"strconv"
	"strings"
	"unicode/utf8"
|
||||
@@ -14,12 +15,20 @@ import (
|
||||
|
||||
// SuendenfreiSource scrapes www.suendenfrei.tv/veranstaltungen. Events are
|
||||
// <h3><a> with free-form text. A regex parses "<date-range> <name> [in <PLZ> <city>]".
|
||||
// Not every header matches that shape.
|
||||
// Unparseable entries are logged at INFO and skipped; Ship 2's local LLM is
|
||||
// the long-term rescue for prose that doesn't fit the regex.
|
||||
type SuendenfreiSource struct {
|
||||
fetcher *Fetcher
|
||||
baseURL string
|
||||
}
|
||||
|
||||
// truncateForLog shortens s to at most n bytes for log output, appending
// "..." when truncation occurred. The cut position is backed up to a UTF-8
// rune boundary so the result is always valid UTF-8 — the scraped event
// text is German prose and routinely contains multi-byte characters
// (umlauts, ß), which a plain byte slice s[:n] could split in half.
func truncateForLog(s string, n int) string {
	if len(s) <= n {
		return s
	}
	// Step back over UTF-8 continuation bytes so we never cut a rune in two.
	for n > 0 && !utf8.RuneStart(s[n]) {
		n--
	}
	return s[:n] + "..."
}
|
||||
|
||||
func NewSuendenfrei(f *Fetcher, baseURL string) *SuendenfreiSource {
|
||||
return &SuendenfreiSource{fetcher: f, baseURL: baseURL}
|
||||
}
|
||||
@@ -82,6 +91,14 @@ func parseSuendenfreiPage(data []byte, sourceURL string) ([]RawEvent, bool) {
|
||||
}
|
||||
parsed, ok := parseSuendenfreiHeader(text)
|
||||
if !ok {
|
||||
// Unparseable entries are real — footer links, layout anchors,
|
||||
// or event headers in a prose shape our regex doesn't cover.
|
||||
// Log at INFO so operators can grep the count post-run without
|
||||
// introducing a counter we'd have to thread through CrawlSummary.
|
||||
slog.Info("suendenfrei: unparseable h3 anchor; skipping",
|
||||
"source_url", sourceURL,
|
||||
"text", truncateForLog(text, 120),
|
||||
)
|
||||
return
|
||||
}
|
||||
href, _ := a.Attr("href")
|
||||
|
||||
Reference in New Issue
Block a user