marktvogt.de/backend/internal/pkg/scrape/scrape.go

// Package scrape fetches a web page and extracts its visible text for LLM
// context. Not intended for structured data extraction — use goquery directly
// when you want specific fields.
//
// The extraction strategy is deliberately simple: drop scripts/styles/nav/
// footer, walk the remaining body text, collapse runs of whitespace, truncate.
// Good enough for feeding a market-event page into a prompt; bad for
// anything that depends on document structure.
package scrape

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"

	"marktvogt.de/backend/internal/pkg/safehttp"
)

// DefaultTimeout caps individual HTTP fetches.
const DefaultTimeout = 10 * time.Second

// DefaultMaxChars bounds the extracted text length. LLM prompts have a token
// budget; cutting here keeps the prompt deterministic and prevents a single
// huge page from dominating a multi-URL context. 4000 chars ≈ 1000 tokens
// assuming German-ish content.
const DefaultMaxChars = 4000

// Client wraps an *http.Client + output bound. Zero-value is usable — it
// falls back to http.DefaultClient behavior with DefaultTimeout applied via
// the per-request context.
type Client struct {
	HTTP     *http.Client
	MaxChars int
	// UserAgent, if set, overrides the default. Leaving empty lets net/http
	// use its default — which some servers block; callers that scrape a lot
	// of third-party pages should set a descriptive string.
	UserAgent string
}

// New constructs a Client with sane defaults. The HTTP transport is built by
// safehttp so the scraper cannot dial RFC1918, loopback, link-local, or
// cloud-metadata IPs even when redirects point at them (audit C6).
func New(userAgent string) *Client {
	return &Client{
		HTTP: safehttp.NewClient(safehttp.Config{
			Timeout:      DefaultTimeout,
			MaxRedirects: 5,
		}),
		MaxChars:  DefaultMaxChars,
		UserAgent: userAgent,
	}
}

// NewForTesting returns a scraper that DOES allow private/loopback addresses,
// for integration tests that use httptest.Server on 127.0.0.1. Never use this
// in production code paths — production must always go through New().
func NewForTesting(userAgent string) *Client {
	return &Client{
		HTTP: safehttp.NewClient(safehttp.Config{
			Timeout:               DefaultTimeout,
			MaxRedirects:          5,
			AllowPrivateAddresses: true,
		}),
		MaxChars:  DefaultMaxChars,
		UserAgent: userAgent,
	}
}

// Fetch retrieves the URL and returns the visible text, truncated to MaxChars.
// Non-2xx responses and HTML parse failures return an error — caller decides
// whether to continue with other URLs or fail the whole operation.
func (c *Client) Fetch(ctx context.Context, url string) (string, error) {
	client := c.HTTP
	if client == nil {
		client = &http.Client{Timeout: DefaultTimeout}
	}
	maxChars := c.MaxChars
	if maxChars <= 0 {
		maxChars = DefaultMaxChars
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", fmt.Errorf("new request %s: %w", url, err)
	}
	if c.UserAgent != "" {
		req.Header.Set("User-Agent", c.UserAgent)
	}
	req.Header.Set("Accept", "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept-Language", "de-DE,de;q=0.9,en;q=0.8")

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("fetch %s: %w", url, err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return "", fmt.Errorf("fetch %s: status %d", url, resp.StatusCode)
	}

	// Cap the body read at a generous multiple of maxChars so a misbehaving
	// server streaming gigabytes can't OOM us. 10x is headroom for whitespace
	// stripping to still produce maxChars of useful text.
	body, err := io.ReadAll(io.LimitReader(resp.Body, int64(maxChars)*10))
	if err != nil {
		return "", fmt.Errorf("read body %s: %w", url, err)
	}

	return extractText(body, maxChars)
}

// extractText walks a parsed HTML body, drops noise nodes, gathers visible
// text, collapses whitespace, and truncates. Exported as a package-level
// helper so tests can exercise the HTML→text path without a live HTTP server.
func extractText(htmlBytes []byte, maxChars int) (string, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlBytes))
	if err != nil {
		return "", fmt.Errorf("parse html: %w", err)
	}

	// Remove noise before walking — script/style carry code, nav/footer/aside
	// carry boilerplate that pollutes the LLM context.
	doc.Find("script, style, nav, footer, aside, noscript, iframe").Remove()

	text := strings.TrimSpace(doc.Find("body").Text())
	if text == "" {
		// Some sites don't use <body> explicitly (fragments, malformed docs);
		// fall back to document-level text.
		text = strings.TrimSpace(doc.Text())
	}
	text = collapseWhitespace(text)
	if len(text) > maxChars {
		text = text[:maxChars]
	}
	return text, nil
}

// collapseWhitespace replaces every run of whitespace with a single space.
// Preserves word boundaries but drops the layout that HTML text extraction
// leaves behind (tabs, long runs of newlines, non-breaking spaces).
func collapseWhitespace(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	inSpace := false
	for _, r := range s {
		if r == ' ' || r == '\t' || r == '\n' || r == '\r' || r == ' ' {
			if !inSpace {
				b.WriteByte(' ')
				inSpace = true
			}
			continue
		}
		b.WriteRune(r)
		inSpace = false
	}
	return strings.TrimSpace(b.String())
}