vessel/backend/internal/api/fetcher.go
vikingowl c2214aef96 feat(tools): graceful truncation for fetch_url with LLM retry hints
- Remove curl --max-filesize to avoid hard failures on large pages
- Add Truncated/OriginalSize fields to FetchResult for all fetch methods
- Return truncation info in proxy response (truncated, originalSize, returnedSize)
- Add timeout parameter to fetch_url tool (default 30s, max 120s)
- Increase default maxLength from 5KB to 50KB, allow up to 2MB
- Include _hint in response guiding LLM to retry with larger maxLength

Instead of failing when content exceeds limits, the tool now returns
truncated content with guidance for the LLM to request more if needed.
2026-01-02 19:14:35 +01:00
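A rough sketch of how the proxy side described above might assemble its response (the handler shape and hint wording are illustrative; only the field names truncated, originalSize, returnedSize, and _hint come from this commit):

	resp := map[string]any{
		"content":      result.Content,
		"truncated":    result.Truncated,
		"originalSize": result.OriginalSize,
		"returnedSize": len(result.Content),
	}
	if result.Truncated {
		// Guide the LLM to retry with a larger maxLength (wording assumed)
		resp["_hint"] = fmt.Sprintf(
			"content truncated from %d to %d bytes; retry with a larger maxLength to see more",
			result.OriginalSize, len(result.Content))
	}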

package api
import (
"bytes"
"context"
"fmt"
"io"
"log"
"net/http"
"net/http/cookiejar"
"os/exec"
"regexp"
"strings"
"sync"
"time"
"github.com/chromedp/chromedp"
)
// FetchMethod represents the method used to fetch URLs
type FetchMethod string
const (
FetchMethodCurl FetchMethod = "curl"
FetchMethodWget FetchMethod = "wget"
FetchMethodChrome FetchMethod = "chrome"
FetchMethodNative FetchMethod = "native"
)
// FetchResult contains the result of a URL fetch
type FetchResult struct {
Content string
ContentType string
FinalURL string
StatusCode int
Method FetchMethod
Truncated bool // True if content was truncated due to MaxLength
OriginalSize int // Original size before truncation (0 if not truncated)
}
// FetchOptions configures the fetch behavior
type FetchOptions struct {
MaxLength int
Timeout time.Duration
UserAgent string
Headers map[string]string
FollowRedirects bool
// ForceHeadless forces using headless browser even if curl succeeds
ForceHeadless bool
// WaitForSelector waits for a specific CSS selector before capturing content
WaitForSelector string
// WaitTime is additional time to wait for JS to render (default 2s for headless)
WaitTime time.Duration
}
// DefaultFetchOptions returns sensible defaults
func DefaultFetchOptions() FetchOptions {
return FetchOptions{
MaxLength: 500000, // 500KB
Timeout: 30 * time.Second,
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
Headers: make(map[string]string),
FollowRedirects: true,
WaitTime: 2 * time.Second,
}
}
// Fetcher provides URL fetching with multiple backend support
type Fetcher struct {
curlPath string
wgetPath string
wgetIsBusyBox bool // BusyBox wget has limited options
chromePath string
httpClient *http.Client
method FetchMethod
hasChrome bool
mu sync.RWMutex
// chromedp allocator context (reused for efficiency)
allocCtx context.Context
allocCancel context.CancelFunc
}
var (
globalFetcher *Fetcher
fetcherOnce sync.Once
)
// GetFetcher returns the singleton Fetcher instance
func GetFetcher() *Fetcher {
fetcherOnce.Do(func() {
globalFetcher = NewFetcher()
})
return globalFetcher
}
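// Example (illustrative, not part of this file): typical callers grab the
// singleton and override only the defaults they care about.
//
//	f := GetFetcher()
//	opts := DefaultFetchOptions()
//	opts.MaxLength = 50 * 1024 // cap the returned content
//	opts.Timeout = 60 * time.Second
//	res, err := f.Fetch(context.Background(), "https://example.com", opts)
//	if err == nil && res.Truncated {
//	    log.Printf("got %d of %d bytes", len(res.Content), res.OriginalSize)
//	}
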
// NewFetcher creates a new Fetcher, detecting available tools
func NewFetcher() *Fetcher {
f := &Fetcher{}
f.detectTools()
f.initHTTPClient()
f.initChromeDp()
return f
}
// detectTools checks which external tools are available
func (f *Fetcher) detectTools() {
f.mu.Lock()
defer f.mu.Unlock()
// Check for curl
if path, err := exec.LookPath("curl"); err == nil {
f.curlPath = path
f.method = FetchMethodCurl
}
// Check for wget
if path, err := exec.LookPath("wget"); err == nil {
f.wgetPath = path
// Check if it's BusyBox wget (has limited options)
versionCmd := exec.Command(path, "--version")
versionOut, _ := versionCmd.CombinedOutput()
f.wgetIsBusyBox = strings.Contains(string(versionOut), "BusyBox")
if f.wgetIsBusyBox {
log.Printf("[Fetcher] Found BusyBox wget (limited options)")
}
if f.method == "" {
f.method = FetchMethodWget
}
}
// Check for Chrome/Chromium (for headless browser support)
chromePaths := []string{
"google-chrome",
"google-chrome-stable",
"chromium",
"chromium-browser",
"/usr/bin/google-chrome",
"/usr/bin/chromium",
"/usr/bin/chromium-browser",
"/snap/bin/chromium",
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
}
for _, p := range chromePaths {
if path, err := exec.LookPath(p); err == nil {
f.chromePath = path
f.hasChrome = true
log.Printf("[Fetcher] Found Chrome at: %s", path)
break
}
}
// Fall back to native if nothing else available
if f.method == "" {
f.method = FetchMethodNative
}
}
// initHTTPClient sets up the native Go HTTP client with cookie support
func (f *Fetcher) initHTTPClient() {
jar, _ := cookiejar.New(nil)
f.httpClient = &http.Client{
Jar: jar,
Timeout: 30 * time.Second,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 10 {
return fmt.Errorf("too many redirects")
}
return nil
},
}
}
// initChromeDp initializes the chromedp allocator if Chrome is available
func (f *Fetcher) initChromeDp() {
if !f.hasChrome {
return
}
// Create a persistent allocator context for reuse
opts := append(chromedp.DefaultExecAllocatorOptions[:],
chromedp.Flag("headless", true),
chromedp.Flag("disable-gpu", true),
chromedp.Flag("no-sandbox", true),
chromedp.Flag("disable-dev-shm-usage", true),
chromedp.Flag("disable-extensions", true),
chromedp.Flag("disable-background-networking", true),
chromedp.Flag("disable-sync", true),
chromedp.Flag("disable-translate", true),
chromedp.Flag("mute-audio", true),
chromedp.Flag("hide-scrollbars", true),
chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"),
)
if f.chromePath != "" {
opts = append(opts, chromedp.ExecPath(f.chromePath))
}
f.allocCtx, f.allocCancel = chromedp.NewExecAllocator(context.Background(), opts...)
log.Printf("[Fetcher] Chrome headless browser initialized")
}
// Close cleans up resources
func (f *Fetcher) Close() {
if f.allocCancel != nil {
f.allocCancel()
}
}
// Method returns the current primary fetch method being used
func (f *Fetcher) Method() FetchMethod {
f.mu.RLock()
defer f.mu.RUnlock()
return f.method
}
// HasChrome returns whether headless Chrome is available
func (f *Fetcher) HasChrome() bool {
f.mu.RLock()
defer f.mu.RUnlock()
return f.hasChrome
}
// Fetch fetches a URL using the best available method
// For most sites, uses curl/wget. Falls back to headless browser for JS-heavy sites.
func (f *Fetcher) Fetch(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
// If force headless is set and Chrome is available, use it directly
if opts.ForceHeadless && f.hasChrome {
return f.fetchWithChrome(ctx, url, opts)
}
// Try fast methods first
result, err := f.fetchFast(ctx, url, opts)
if err != nil {
return nil, err
}
// Check if content looks like a JS-rendered page that needs headless browser
if f.hasChrome && f.isJSRenderedPage(result.Content) {
log.Printf("[Fetcher] Content appears to be JS-rendered, trying headless browser for: %s", url)
headlessResult, headlessErr := f.fetchWithChrome(ctx, url, opts)
if headlessErr == nil && len(headlessResult.Content) > len(result.Content) {
return headlessResult, nil
}
// If headless failed or got less content, return original
if headlessErr != nil {
log.Printf("[Fetcher] Headless browser failed: %v, using original content", headlessErr)
}
}
return result, nil
}
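// Illustrative only: forcing the headless path for a known JS-heavy page
// (the URL and selector below are placeholders).
//
//	opts := DefaultFetchOptions()
//	opts.ForceHeadless = true
//	opts.WaitForSelector = "#main-content"
//	res, err := GetFetcher().Fetch(ctx, "https://spa.example.com/docs", opts)
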
// fetchFast tries curl, wget, or native HTTP in order
func (f *Fetcher) fetchFast(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
f.mu.RLock()
curlPath := f.curlPath
wgetPath := f.wgetPath
method := f.method
f.mu.RUnlock()
switch method {
case FetchMethodCurl:
return f.fetchWithCurl(ctx, url, curlPath, opts)
case FetchMethodWget:
return f.fetchWithWget(ctx, url, wgetPath, opts)
default:
return f.fetchNative(ctx, url, opts)
}
}
// isJSRenderedPage reports whether the fetched HTML looks like a client-side
// app shell whose real content has not been rendered yet (i.e. it needs JavaScript)
func (f *Fetcher) isJSRenderedPage(content string) bool {
// Very short content usually means the page still needs JavaScript to render
if len(strings.TrimSpace(content)) < 500 {
return true
}
// Common patterns indicating JS-only rendering
jsPatterns := []string{
`<div id="root"></div>`,
`<div id="app"></div>`,
`<div id="__next"></div>`,
`<div id="__nuxt"></div>`,
`noscript`,
`"Loading..."`,
`"loading..."`,
`window.__INITIAL_STATE__`,
`window.__NUXT__`,
`window.__NEXT_DATA__`,
}
contentLower := strings.ToLower(content)
for _, pattern := range jsPatterns {
if strings.Contains(contentLower, strings.ToLower(pattern)) {
// Found JS pattern, but also check if there's substantial content
// Extract text content (very rough)
textContent := stripHTMLTags(content)
if len(strings.TrimSpace(textContent)) < 1000 {
return true
}
}
}
// Check for common documentation sites that need JS
jsHeavySites := []string{
"docs.rs",
"reactjs.org",
"vuejs.org",
"angular.io",
"nextjs.org",
"vercel.com",
"netlify.com",
}
for _, site := range jsHeavySites {
if strings.Contains(content, site) {
textContent := stripHTMLTags(content)
if len(strings.TrimSpace(textContent)) < 2000 {
return true
}
}
}
return false
}
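// For illustration, a bare SPA shell like the following would be flagged and
// re-fetched with headless Chrome: it matches `<div id="root"></div>` and
// carries almost no text once tags are stripped.
//
//	<!doctype html><html><head><title>App</title></head>
//	<body><div id="root"></div><script src="/bundle.js"></script></body></html>
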
// stripHTMLTags removes HTML tags from content (rough extraction)
func stripHTMLTags(content string) string {
// Remove script and style tags with their content
scriptRe := regexp.MustCompile(`(?is)<script[^>]*>.*?</script>`)
content = scriptRe.ReplaceAllString(content, "")
styleRe := regexp.MustCompile(`(?is)<style[^>]*>.*?</style>`)
content = styleRe.ReplaceAllString(content, "")
// Remove all remaining tags
tagRe := regexp.MustCompile(`<[^>]*>`)
content = tagRe.ReplaceAllString(content, " ")
// Collapse whitespace
spaceRe := regexp.MustCompile(`\s+`)
content = spaceRe.ReplaceAllString(content, " ")
return strings.TrimSpace(content)
}
// fetchWithChrome uses headless Chrome to fetch and render the page
func (f *Fetcher) fetchWithChrome(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
if !f.hasChrome || f.allocCtx == nil {
return nil, fmt.Errorf("headless Chrome not available")
}
// Create a timeout context
timeout := opts.Timeout
if timeout == 0 {
timeout = 30 * time.Second
}
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
// Create a new browser context from the allocator
browserCtx, browserCancel := chromedp.NewContext(f.allocCtx)
defer browserCancel()
var content string
var finalURL string
// Wait time for JS to render
waitTime := opts.WaitTime
if waitTime == 0 {
waitTime = 2 * time.Second
}
// Build the actions
actions := []chromedp.Action{
chromedp.Navigate(url),
}
// Wait for specific selector if provided
if opts.WaitForSelector != "" {
actions = append(actions, chromedp.WaitVisible(opts.WaitForSelector, chromedp.ByQuery))
} else {
// Default: wait for body to be visible and give JS time to render
actions = append(actions,
chromedp.WaitVisible("body", chromedp.ByQuery),
chromedp.Sleep(waitTime),
)
}
// Get the final URL and content
actions = append(actions,
chromedp.Location(&finalURL),
chromedp.OuterHTML("html", &content, chromedp.ByQuery),
)
// Execute
if err := chromedp.Run(browserCtx, actions...); err != nil {
return nil, fmt.Errorf("chromedp failed: %w", err)
}
// Truncate if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: content,
ContentType: "text/html",
FinalURL: finalURL,
StatusCode: 200,
Method: FetchMethodChrome,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// fetchWithCurl uses curl to fetch the URL
func (f *Fetcher) fetchWithCurl(ctx context.Context, url string, curlPath string, opts FetchOptions) (*FetchResult, error) {
args := []string{
"-sS", // Silent but show errors
"-L", // Follow redirects
"--max-time", fmt.Sprintf("%d", int(opts.Timeout.Seconds())),
"-A", opts.UserAgent, // User agent
"-w", "\n---CURL_INFO---\n%{content_type}\n%{url_effective}\n%{http_code}", // Output metadata
"--compressed", // Accept compressed responses
}
// Add custom headers
for key, value := range opts.Headers {
args = append(args, "-H", fmt.Sprintf("%s: %s", key, value))
}
// Add common headers for better compatibility
args = append(args,
"-H", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"-H", "Accept-Language: en-US,en;q=0.5",
"-H", "DNT: 1",
"-H", "Connection: keep-alive",
"-H", "Upgrade-Insecure-Requests: 1",
)
args = append(args, url)
cmd := exec.CommandContext(ctx, curlPath, args...)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
// Check if it's a context cancellation
if ctx.Err() != nil {
return nil, ctx.Err()
}
return nil, fmt.Errorf("curl failed: %s - %s", err.Error(), stderr.String())
}
output := stdout.String()
// Parse the output - content and metadata are separated by ---CURL_INFO---
parts := strings.Split(output, "\n---CURL_INFO---\n")
if len(parts) != 2 {
return nil, fmt.Errorf("unexpected curl output format")
}
content := parts[0]
metaLines := strings.Split(strings.TrimSpace(parts[1]), "\n")
if len(metaLines) < 3 {
return nil, fmt.Errorf("incomplete curl metadata")
}
contentType := metaLines[0]
finalURL := metaLines[1]
statusCode := 200
fmt.Sscanf(metaLines[2], "%d", &statusCode)
// Truncate content if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: content,
ContentType: contentType,
FinalURL: finalURL,
StatusCode: statusCode,
Method: FetchMethodCurl,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// fetchWithWget uses wget to fetch the URL
func (f *Fetcher) fetchWithWget(ctx context.Context, url string, wgetPath string, opts FetchOptions) (*FetchResult, error) {
f.mu.RLock()
isBusyBox := f.wgetIsBusyBox
f.mu.RUnlock()
var args []string
if isBusyBox {
// BusyBox wget has limited options - use short flags only
args = []string{
"-q", // Quiet
"-O", "-", // Output to stdout
"-T", fmt.Sprintf("%d", int(opts.Timeout.Seconds())), // Timeout
"-U", opts.UserAgent, // User agent
}
// BusyBox wget doesn't support custom headers or max-redirect
} else {
// GNU wget supports full options
args = []string{
"-q", // Quiet
"-O", "-", // Output to stdout
"--timeout", fmt.Sprintf("%d", int(opts.Timeout.Seconds())),
"--user-agent", opts.UserAgent,
"--max-redirect", "10", // Follow up to 10 redirects
"--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"--header", "Accept-Language: en-US,en;q=0.5",
}
// Add custom headers (GNU wget only)
for key, value := range opts.Headers {
args = append(args, "--header", fmt.Sprintf("%s: %s", key, value))
}
}
args = append(args, url)
cmd := exec.CommandContext(ctx, wgetPath, args...)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
if ctx.Err() != nil {
return nil, ctx.Err()
}
return nil, fmt.Errorf("wget failed: %s - %s", err.Error(), stderr.String())
}
content := stdout.String()
// Truncate content if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
// wget doesn't easily provide metadata, so we use defaults
return &FetchResult{
Content: content,
ContentType: "text/html", // Assume HTML (wget doesn't easily give us this)
FinalURL: url, // wget doesn't easily give us the final URL
StatusCode: 200,
Method: FetchMethodWget,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// fetchNative uses Go's native http.Client with enhanced capabilities
func (f *Fetcher) fetchNative(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
// Create request with context
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
// Set headers
req.Header.Set("User-Agent", opts.UserAgent)
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
req.Header.Set("Accept-Encoding", "gzip, deflate")
req.Header.Set("DNT", "1")
req.Header.Set("Connection", "keep-alive")
req.Header.Set("Upgrade-Insecure-Requests", "1")
// Add custom headers
for key, value := range opts.Headers {
req.Header.Set(key, value)
}
// Create a client with custom timeout
client := &http.Client{
Jar: f.httpClient.Jar,
Timeout: opts.Timeout,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if !opts.FollowRedirects {
return http.ErrUseLastResponse
}
if len(via) >= 10 {
return fmt.Errorf("too many redirects")
}
return nil
},
}
// Execute request
resp, err := client.Do(req)
if err != nil {
return nil, fmt.Errorf("request failed: %w", err)
}
defer resp.Body.Close()
// Read body with limit + 1 byte to detect truncation
body, err := io.ReadAll(io.LimitReader(resp.Body, int64(opts.MaxLength)+1))
if err != nil {
return nil, fmt.Errorf("failed to read response: %w", err)
}
var truncated bool
var originalSize int
if len(body) > opts.MaxLength {
originalSize = len(body) // lower bound only (MaxLength+1); the full size was never read
body = body[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: string(body),
ContentType: resp.Header.Get("Content-Type"),
FinalURL: resp.Request.URL.String(),
StatusCode: resp.StatusCode,
Method: FetchMethodNative,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// FetchWithHeadless explicitly uses headless browser (for API use)
func (f *Fetcher) FetchWithHeadless(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
if !f.hasChrome {
return nil, fmt.Errorf("headless Chrome not available - Chrome/Chromium not found")
}
return f.fetchWithChrome(ctx, url, opts)
}
// TryFetchWithFallback attempts to fetch using all available methods
func (f *Fetcher) TryFetchWithFallback(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
f.mu.RLock()
curlPath := f.curlPath
wgetPath := f.wgetPath
hasChrome := f.hasChrome
f.mu.RUnlock()
var lastErr error
// Try curl first if available
if curlPath != "" {
result, err := f.fetchWithCurl(ctx, url, curlPath, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("curl: %w", err)
}
// Try wget if available
if wgetPath != "" {
result, err := f.fetchWithWget(ctx, url, wgetPath, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("wget: %w", err)
}
// Try native HTTP
result, err := f.fetchNative(ctx, url, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("native: %w", err)
// Last resort: try headless Chrome
if hasChrome {
result, err := f.fetchWithChrome(ctx, url, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("chrome: %w", err)
}
return nil, fmt.Errorf("all fetch methods failed: %v", lastErr)
}
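// Illustrative retry loop matching the truncation hints described in the commit
// message (the retry count and growth factor are assumptions, not part of this file):
//
//	opts := DefaultFetchOptions()
//	opts.MaxLength = 50 * 1024
//	var res *FetchResult
//	var err error
//	for i := 0; i < 3; i++ {
//	    res, err = GetFetcher().TryFetchWithFallback(ctx, url, opts)
//	    if err != nil || !res.Truncated {
//	        break
//	    }
//	    opts.MaxLength *= 4 // grow the cap and retry, as the _hint suggests
//	}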