Completes the manual two-pass enrichment flow: the crawl-enrich-all
button (MR 3) fills deterministic fields across the queue; this MR
adds a per-row "AI" button that scrapes the row's quellen URLs and
asks Mistral to fill category, opening_hours, description.
Flow per click (a Go sketch follows the list):
1. Load row, compute CacheKey(name_normalized, stadt, year).
2. Cache hit -> skip LLM, merge cached payload onto current
crawl-enrich base, persist, return.
3. Miss -> scrape up to 5 quellen URLs via pkg/scrape (goquery
text extraction, 4000-char truncation), concatenate into labeled
blocks, call ai.Client.Pass2 with JSON response format.
4. Parse response into Enrichment{category, opening_hours,
description}, stamp provenance=llm + model + token counts.
5. Cache the raw LLM payload (not the merged one) under the tuple
key with DefaultCacheTTL=30d, so later re-crawls can layer new
crawl-enrich bases on the same cached answer.
6. Merge(crawl, llm) -- crawl fields survive. Persist via
SetEnrichment(status=done). Return merged to the operator.
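Roughly, as a sketch; store/cache accessors, scrapeSources and llm.Enrich are
illustrative names, not the committed API, while CacheKey, Merge,
SetEnrichment, DefaultCacheTTL and the crawl-fields-win rule are the real ones:

func (s *Service) RunLLMEnrichOne(ctx context.Context, id string) (Enrichment, error) {
	row, err := s.store.GetQueueRow(ctx, id) // hypothetical accessor
	if err != nil {
		return Enrichment{}, err
	}
	key := CacheKey(row.NameNormalized, row.Stadt, row.Year)

	payload, hit := s.cache.Get(key) // raw LLM payload from an earlier run, if any
	if !hit {
		blocks, err := s.scrapeSources(ctx, row.Quellen) // up to 5 quellen URLs, labeled blocks
		if err != nil {
			return Enrichment{}, err // includes ErrNoScrapedContent
		}
		// Enrich wraps ai.Client.Pass2 (JSON response format) and stamps
		// provenance=llm plus model and token counts on the parsed fields.
		payload, err = s.llm.Enrich(ctx, row, blocks)
		if err != nil {
			return Enrichment{}, err
		}
		// Cache the raw LLM answer, not the merge, so a later re-crawl can
		// layer a fresh crawl-enrich base on the same cached payload.
		s.cache.Set(key, payload, DefaultCacheTTL)
	}

	merged := Merge(row.CrawlEnrichment, payload) // crawl fields survive
	if err := s.store.SetEnrichment(ctx, id, merged, "done"); err != nil {
		return Enrichment{}, err
	}
	return merged, nil
}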
ErrNoScrapedContent fails fast when zero URLs return usable text;
LLMs without grounding hallucinate, and a 400-style operator error is
better than inventing details. Individual scrape failures don't halt
the flow as long as at least one source succeeds.
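The scrape-and-label step behind that guard, sketched; the helper name, block
labels and error text are illustrative, the 5-URL cap and tolerate-individual-
failures behavior are as described. Scraper stands for anything with
Fetch(ctx, url) (string, error); pkg/scrape's Client fits.

var ErrNoScrapedContent = errors.New("no scraped content from any quelle URL")

func concatSources(ctx context.Context, sc Scraper, urls []string) (string, error) {
	if len(urls) > 5 {
		urls = urls[:5] // URL cap
	}
	var blocks []string
	for i, u := range urls {
		text, err := sc.Fetch(ctx, u)
		if err != nil || strings.TrimSpace(text) == "" {
			continue // a single bad source doesn't halt the flow
		}
		blocks = append(blocks, fmt.Sprintf("=== Quelle %d: %s ===\n%s", i+1, u, text))
	}
	if len(blocks) == 0 {
		return "", ErrNoScrapedContent // never call the LLM without grounding
	}
	return strings.Join(blocks, "\n\n"), nil
}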
pkg/scrape (new, reusable)
- Client.Fetch: HTTP GET, strip script/style/nav/footer/aside via
goquery, gather body text, collapse whitespace, truncate.
DefaultTimeout=10s, DefaultMaxChars=4000. User-Agent configurable.
- Tests cover noise stripping, whitespace collapsing, truncation,
body-less fragments.
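Minimal caller, for reference; the import path and user-agent string are
placeholders, and the full package source is appended at the end of this
description.

package main

import (
	"context"
	"fmt"
	"log"

	"example.invalid/internal/scrape" // placeholder import path
)

func main() {
	c := scrape.New("markt-admin/1.0 (+https://example.invalid/contact)") // UA is illustrative

	ctx, cancel := context.WithTimeout(context.Background(), scrape.DefaultTimeout)
	defer cancel()

	text, err := c.Fetch(ctx, "https://example.com/weihnachtsmarkt")
	if err != nil {
		log.Fatalf("fetch: %v", err)
	}
	fmt.Println(text) // visible text, whitespace-collapsed, at most 4000 chars
}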
enrich.MistralLLMEnricher
- Takes ai.Client + Scraper (both injectable; tests use stubs).
- Prompt: English system instructions asking for JSON-only output
with category/opening_hours/description in German. User prompt
includes markt identifiers, already-filled fields (so the LLM
doesn't waste tokens re-deriving them), and scraped blocks.
- Tests: happy path, all-scrapes-fail (-> ErrNoScrapedContent),
partial-scrape-success, empty LLM fields yield no provenance,
URL cap at 5.
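The injection seams, roughly; method names beyond LLMEnricher and
MistralLLMEnricher are assumptions, and the real ai.Client.Pass2 signature and
prompt strings are not reproduced here.

// Illustrative shapes; the committed interfaces may differ.
type Scraper interface {
	Fetch(ctx context.Context, url string) (string, error) // pkg/scrape.Client satisfies this
}

type LLMEnricher interface {
	Enrich(ctx context.Context, m Markt, quellen []string) (Enrichment, error) // method name assumed
}

// MistralLLMEnricher scrapes the quellen, builds the English-instructions /
// German-output prompt described above, and calls ai.Client.Pass2.
type MistralLLMEnricher struct {
	AI      *ai.Client
	Scraper Scraper
}

// Tests swap in stubs so neither HTTP nor Mistral is hit.
type stubScraper struct {
	text string
	err  error
}

func (s stubScraper) Fetch(context.Context, string) (string, error) { return s.text, s.err }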
Service.RunLLMEnrichOne + handler POST
/admin/discovery/queue/:id/enrich (sync, 30s timeout). NewService
gains an llm enrich.LLMEnricher param; routes.go constructs a
MistralLLMEnricher when ai.Client is enabled and falls back to
NoopLLMEnricher otherwise.
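Possible wiring, sketched; buildLLMEnricher, NewMistralLLMEnricher, pathID and
writeJSON are assumptions, not names from this MR.

// routes.go, roughly: pick the LLM enricher based on whether ai.Client is enabled.
func buildLLMEnricher(aiClient *ai.Client, userAgent string) enrich.LLMEnricher {
	if aiClient == nil { // the real "enabled" check may differ
		return enrich.NoopLLMEnricher{}
	}
	return enrich.NewMistralLLMEnricher(aiClient, scrape.New(userAgent)) // assumed constructor
}

// Handler for POST /admin/discovery/queue/:id/enrich, synchronous with a 30s budget.
func (h *Handler) enrichOne(w http.ResponseWriter, r *http.Request) {
	ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
	defer cancel()

	merged, err := h.svc.RunLLMEnrichOne(ctx, pathID(r)) // pathID: however the router exposes :id
	if err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest) // operator-facing, 400-style
		return
	}
	writeJSON(w, merged) // assumed JSON helper
}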
UI: a per-row AI button next to Similar. Pending state is tracked per
row via a Set<string>; the button is disabled while the request is in
flight and shows an "AI..." label. Success invalidates the page; the
row's expanded view picks up the new
category/opening_hours/description fields with llm provenance tags.
An inline error message appears on the row if the enrich action
fails.
151 lines, 4.7 KiB, Go
// Package scrape fetches a web page and extracts its visible text for LLM
// context. Not intended for structured data extraction — use goquery directly
// when you want specific fields.
//
// The extraction strategy is deliberately simple: drop scripts/styles/nav/
// footer, walk the remaining body text, collapse runs of whitespace, truncate.
// Good enough for feeding a market-event page into a prompt; bad for
// anything that depends on document structure.
package scrape

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// DefaultTimeout caps individual HTTP fetches.
const DefaultTimeout = 10 * time.Second

// DefaultMaxChars bounds the extracted text length. LLM prompts have a token
// budget; cutting here keeps the prompt deterministic and prevents a single
// huge page from dominating a multi-URL context. 4000 chars ≈ 1000 tokens
// assuming German-ish content.
const DefaultMaxChars = 4000

// Client wraps an *http.Client + output bound. Zero-value is usable — Fetch
// falls back to a default client with DefaultTimeout and to DefaultMaxChars.
type Client struct {
	HTTP     *http.Client
	MaxChars int
	// UserAgent, if set, overrides the default. Leaving empty lets net/http
	// use its default — which some servers block; callers that scrape a lot
	// of third-party pages should set a descriptive string.
	UserAgent string
}

// New constructs a Client with sane defaults.
func New(userAgent string) *Client {
	return &Client{
		HTTP: &http.Client{
			Timeout: DefaultTimeout,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				if len(via) >= 5 {
					return http.ErrUseLastResponse
				}
				return nil
			},
		},
		MaxChars:  DefaultMaxChars,
		UserAgent: userAgent,
	}
}

// Fetch retrieves the URL and returns the visible text, truncated to MaxChars.
// Non-2xx responses and HTML parse failures return an error — caller decides
// whether to continue with other URLs or fail the whole operation.
func (c *Client) Fetch(ctx context.Context, url string) (string, error) {
	client := c.HTTP
	if client == nil {
		client = &http.Client{Timeout: DefaultTimeout}
	}
	maxChars := c.MaxChars
	if maxChars <= 0 {
		maxChars = DefaultMaxChars
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", fmt.Errorf("new request %s: %w", url, err)
	}
	if c.UserAgent != "" {
		req.Header.Set("User-Agent", c.UserAgent)
	}
	req.Header.Set("Accept", "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept-Language", "de-DE,de;q=0.9,en;q=0.8")

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("fetch %s: %w", url, err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return "", fmt.Errorf("fetch %s: status %d", url, resp.StatusCode)
	}

	// Cap the body read at a generous multiple of maxChars so a misbehaving
	// server streaming gigabytes can't OOM us. 10x is headroom for whitespace
	// stripping to still produce maxChars of useful text.
	body, err := io.ReadAll(io.LimitReader(resp.Body, int64(maxChars)*10))
	if err != nil {
		return "", fmt.Errorf("read body %s: %w", url, err)
	}

	return extractText(body, maxChars)
}

// extractText walks a parsed HTML body, drops noise nodes, gathers visible
// text, collapses whitespace, and truncates. Kept as a package-level helper
// so tests can exercise the HTML→text path without a live HTTP server.
func extractText(htmlBytes []byte, maxChars int) (string, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlBytes))
	if err != nil {
		return "", fmt.Errorf("parse html: %w", err)
	}

	// Remove noise before walking — script/style carry code, nav/footer/aside
	// carry boilerplate that pollutes the LLM context.
	doc.Find("script, style, nav, footer, aside, noscript, iframe").Remove()

	text := strings.TrimSpace(doc.Find("body").Text())
	if text == "" {
		// Some sites don't use <body> explicitly (fragments, malformed docs);
		// fall back to document-level text.
		text = strings.TrimSpace(doc.Text())
	}
	text = collapseWhitespace(text)
	if len(text) > maxChars {
		text = text[:maxChars]
	}
	return text, nil
}

// collapseWhitespace replaces every run of whitespace with a single space.
// Preserves word boundaries but drops the layout that HTML text extraction
// leaves behind (tabs, long runs of newlines, non-breaking spaces).
func collapseWhitespace(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	inSpace := false
	for _, r := range s {
		if r == ' ' || r == '\t' || r == '\n' || r == '\r' || r == '\u00a0' {
			if !inSpace {
				b.WriteByte(' ')
				inSpace = true
			}
			continue
		}
		b.WriteRune(r)
		inSpace = false
	}
	return strings.TrimSpace(b.String())
}
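A test sketch in the spirit of the cases listed earlier (noise stripping plus
whitespace collapsing); the real test file is the source of truth, and the
test name here is illustrative.

package scrape

import "testing"

func TestExtractTextStripsNoise(t *testing.T) {
	html := []byte(`<html><head><script>var x = 1;</script></head>
<body><nav>Menu</nav>
<p>Glühwein   und
	Stollen</p>
<footer>Impressum</footer></body></html>`)

	got, err := extractText(html, 100)
	if err != nil {
		t.Fatal(err)
	}
	if want := "Glühwein und Stollen"; got != want {
		t.Fatalf("extractText = %q, want %q", got, want)
	}
}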