Implements the remediation pass described in planning/19-security-audit-2026-04-30.md. All Critical findings and the Wave 1-4 High findings are closed; PoC tests added; full backend test suite green; helm chart lints clean. Wave 1 - Auth & identity - C1 OAuth state nonce: PutOAuthState / ConsumeOAuthState (valkey, GETDEL single-use, 15min TTL); Callback rejects missing/forged/cross- provider state before token exchange. - C2 OAuth identity linking: refuse silent linking to existing user unless info.EmailVerified is true. fetchGitHubUser now consults the /user/emails endpoint for the verified flag (no more hardcoded true); fetchFacebookUser sets EmailVerified=false (FB exposes no per-email verification flag). - H1 Magic-link verify: replaced Get + MarkUsed with a single atomic UPDATE...RETURNING (ConsumeMagicLink) - TOCTOU-free. - H2 TOTP code replay: MarkTOTPCodeConsumed (valkey SET NX, 120s TTL) prevents replay of a successfully validated code; fails closed on transient store errors. - H3 Backup-code orphan: DisableTOTP now also wipes totp_backup_codes. Wave 2 - Middleware & network - C3 CORS/CSRF regex anchoring: NewCORSConfig wraps each pattern with \A...\z so substring spoofing of origins is impossible. - H4 ClientIP: server reads APP_TRUSTED_PROXIES; gin SetTrustedProxies is called explicitly (empty default = no proxy trust). - H11 Body limit + DisallowUnknownFields: BodyLimitBytes middleware (1 MiB default) wraps every request; validate.BindJSON now uses a json.Decoder with DisallowUnknownFields and rejects trailing tokens; 413 envelope on body-limit overflow. - H16 NetworkPolicy: backend.networkPolicy.enabled defaults to true; new web-networkpolicy.yaml restricts web pod ingress to nginx-gateway and egress to backend service + DNS + 443. Wave 3 - Encryption at rest - C4 TOTP secrets: CreateTOTPSecret writes encrypted secret_v2; GetTOTPSecret prefers v2 with legacy fallback. 
- C5 OAuth tokens: migration 000033 adds *_v2 columns; CreateOAuthAccount and UpdateOAuthTokens write encrypted; GetOAuthAccount reads v2 with legacy fallback. - M1 Domain separation: crypto.DeriveKeyFor(secret, purpose) replaces single-purpose DeriveKey; settings, totp, oauth each use a distinct HKDF-derived subkey. DeriveKey kept as back-compat alias for settings. Wave 4 - Input & AI safety - C6 SSRF: new pkg/safehttp refuses to dial RFC1918, loopback, link- local, ULA, multicast, unspecified, or cloud-metadata IPs; scheme allowlist (http/https). Wired into pkg/scrape, discovery LinkChecker, and imageURLReachable. NewForTesting opt-in for httptest. - H13 PromptGuard German + Unicode: NFKC + Cf-class strip pre-pass closes zero-width and full-width-homoglyph bypasses; new German rules for ignoriere/missachte/vergiss/role-escalation/prompt-exfil/verbatim; Gemma-style and pipe-delimited chat-template tokens covered; source-fence rule prevents '=== Quelle:' splice in scraped text. - H14 BudgetGate: new ai.BudgetGate interface; UsageRepo.CheckBudget reads today's SUM(estimated_cost_usd) (10s cache) and refuses calls when AI_DAILY_CAP_USD is exceeded; GeminiProvider.Chat checks the gate before contacting Gemini. OAuth routes remain disabled in server/routes.go, so C1/C2 are not actively reachable today; fixes ensure correctness when re-enabled.
165 lines · 5.3 KiB · Go
// Package scrape fetches a web page and extracts its visible text for LLM
// context. Not intended for structured data extraction — use goquery directly
// when you want specific fields.
//
// The extraction strategy is deliberately simple: drop scripts/styles/nav/
// footer, walk the remaining body text, collapse runs of whitespace, truncate.
// Good enough for feeding a market-event page into a prompt; bad for
// anything that depends on document structure.
package scrape

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
	"unicode"
	"unicode/utf8"

	"github.com/PuerkitoBio/goquery"

	"marktvogt.de/backend/internal/pkg/safehttp"
)

// DefaultTimeout caps individual HTTP fetches.
|
||
const DefaultTimeout = 10 * time.Second
|
||
|
||
// DefaultMaxChars bounds the extracted text length. LLM prompts have a token
|
||
// budget; cutting here keeps the prompt deterministic and prevents a single
|
||
// huge page from dominating a multi-URL context. 4000 chars ≈ 1000 tokens
|
||
// assuming German-ish content.
|
||
const DefaultMaxChars = 4000
|
||
|
||
// Client wraps an *http.Client plus an output bound. The zero value is
// usable — it falls back to http.DefaultClient behavior with DefaultTimeout
// applied via the per-request context.
type Client struct {
	// HTTP performs the requests. Nil is allowed: Fetch then builds a plain
	// client with DefaultTimeout on the fly.
	HTTP *http.Client

	// MaxChars bounds the extracted text; values <= 0 fall back to
	// DefaultMaxChars inside Fetch.
	MaxChars int

	// UserAgent, if set, overrides the default. Leaving empty lets net/http
	// use its default — which some servers block; callers that scrape a lot
	// of third-party pages should set a descriptive string.
	UserAgent string
}

// New constructs a Client with sane defaults. The HTTP transport is built by
|
||
// safehttp so the scraper cannot dial RFC1918, loopback, link-local, or
|
||
// cloud-metadata IPs even when redirects point at them (audit C6).
|
||
func New(userAgent string) *Client {
|
||
return &Client{
|
||
HTTP: safehttp.NewClient(safehttp.Config{
|
||
Timeout: DefaultTimeout,
|
||
MaxRedirects: 5,
|
||
}),
|
||
MaxChars: DefaultMaxChars,
|
||
UserAgent: userAgent,
|
||
}
|
||
}
|
||
|
||
// NewForTesting returns a scraper that DOES allow private/loopback addresses,
|
||
// for integration tests that use httptest.Server on 127.0.0.1. Never use this
|
||
// in production code paths — production must always go through New().
|
||
func NewForTesting(userAgent string) *Client {
|
||
return &Client{
|
||
HTTP: safehttp.NewClient(safehttp.Config{
|
||
Timeout: DefaultTimeout,
|
||
MaxRedirects: 5,
|
||
AllowPrivateAddresses: true,
|
||
}),
|
||
MaxChars: DefaultMaxChars,
|
||
UserAgent: userAgent,
|
||
}
|
||
}
|
||
|
||
// Fetch retrieves the URL and returns the visible text, truncated to MaxChars.
|
||
// Non-2xx responses and HTML parse failures return an error — caller decides
|
||
// whether to continue with other URLs or fail the whole operation.
|
||
func (c *Client) Fetch(ctx context.Context, url string) (string, error) {
|
||
client := c.HTTP
|
||
if client == nil {
|
||
client = &http.Client{Timeout: DefaultTimeout}
|
||
}
|
||
maxChars := c.MaxChars
|
||
if maxChars <= 0 {
|
||
maxChars = DefaultMaxChars
|
||
}
|
||
|
||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||
if err != nil {
|
||
return "", fmt.Errorf("new request %s: %w", url, err)
|
||
}
|
||
if c.UserAgent != "" {
|
||
req.Header.Set("User-Agent", c.UserAgent)
|
||
}
|
||
req.Header.Set("Accept", "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8")
|
||
req.Header.Set("Accept-Language", "de-DE,de;q=0.9,en;q=0.8")
|
||
|
||
resp, err := client.Do(req)
|
||
if err != nil {
|
||
return "", fmt.Errorf("fetch %s: %w", url, err)
|
||
}
|
||
defer func() { _ = resp.Body.Close() }()
|
||
|
||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||
return "", fmt.Errorf("fetch %s: status %d", url, resp.StatusCode)
|
||
}
|
||
|
||
// Cap the body read at a generous multiple of maxChars so a misbehaving
|
||
// server streaming gigabytes can't OOM us. 10x is headroom for whitespace
|
||
// stripping to still produce maxChars of useful text.
|
||
body, err := io.ReadAll(io.LimitReader(resp.Body, int64(maxChars)*10))
|
||
if err != nil {
|
||
return "", fmt.Errorf("read body %s: %w", url, err)
|
||
}
|
||
|
||
return extractText(body, maxChars)
|
||
}
|
||
|
||
// extractText walks a parsed HTML body, drops noise nodes, gathers visible
|
||
// text, collapses whitespace, and truncates. Exported as a package-level
|
||
// helper so tests can exercise the HTML→text path without a live HTTP server.
|
||
func extractText(htmlBytes []byte, maxChars int) (string, error) {
|
||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlBytes))
|
||
if err != nil {
|
||
return "", fmt.Errorf("parse html: %w", err)
|
||
}
|
||
|
||
// Remove noise before walking — script/style carry code, nav/footer/aside
|
||
// carry boilerplate that pollutes the LLM context.
|
||
doc.Find("script, style, nav, footer, aside, noscript, iframe").Remove()
|
||
|
||
text := strings.TrimSpace(doc.Find("body").Text())
|
||
if text == "" {
|
||
// Some sites don't use <body> explicitly (fragments, malformed docs);
|
||
// fall back to document-level text.
|
||
text = strings.TrimSpace(doc.Text())
|
||
}
|
||
text = collapseWhitespace(text)
|
||
if len(text) > maxChars {
|
||
text = text[:maxChars]
|
||
}
|
||
return text, nil
|
||
}
|
||
|
||
// collapseWhitespace replaces every run of Unicode whitespace with a single
// ASCII space. Preserves word boundaries but drops the layout that HTML text
// extraction leaves behind (tabs, long runs of newlines, non-breaking and
// other exotic spaces).
func collapseWhitespace(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	inSpace := false
	for _, r := range s {
		// unicode.IsSpace covers the ASCII set plus NBSP, thin/ideographic
		// spaces, and line/paragraph separators — all of which appear in
		// scraped HTML; the previous hand-rolled set let them leak through.
		if unicode.IsSpace(r) {
			if !inSpace {
				b.WriteByte(' ')
				inSpace = true
			}
			continue
		}
		b.WriteRune(r)
		inSpace = false
	}
	return strings.TrimSpace(b.String())
}