- Remove curl --max-filesize to avoid hard failures on large pages - Add Truncated/OriginalSize fields to FetchResult for all fetch methods - Return truncation info in proxy response (truncated, originalSize, returnedSize) - Add timeout parameter to fetch_url tool (default 30s, max 120s) - Increase default maxLength from 5KB to 50KB, allow up to 2MB - Include _hint in response guiding LLM to retry with larger maxLength Instead of failing when content exceeds limits, the tool now returns truncated content with guidance for the LLM to request more if needed.
700 lines
19 KiB
Go
700 lines
19 KiB
Go
package api
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"net/http/cookiejar"
|
|
"os/exec"
|
|
"regexp"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/chromedp/chromedp"
|
|
)
|
|
|
|
// FetchMethod names the backend that is (or was) used to retrieve a URL.
type FetchMethod string

// The fetch backends known to the Fetcher.
const (
	// FetchMethodCurl shells out to the curl binary.
	FetchMethodCurl = FetchMethod("curl")
	// FetchMethodWget shells out to the wget binary.
	FetchMethodWget = FetchMethod("wget")
	// FetchMethodChrome renders the page in headless Chrome.
	FetchMethodChrome = FetchMethod("chrome")
	// FetchMethodNative uses Go's net/http client.
	FetchMethodNative = FetchMethod("native")
)
|
|
|
|
// FetchResult contains the result of a URL fetch
|
|
type FetchResult struct {
|
|
Content string
|
|
ContentType string
|
|
FinalURL string
|
|
StatusCode int
|
|
Method FetchMethod
|
|
Truncated bool // True if content was truncated due to MaxLength
|
|
OriginalSize int // Original size before truncation (0 if not truncated)
|
|
}
|
|
|
|
// FetchOptions tunes how a single fetch is performed. DefaultFetchOptions
// returns a populated value that callers can adjust.
type FetchOptions struct {
	// MaxLength is the maximum number of bytes of content to return; longer
	// content is truncated and flagged in the FetchResult.
	MaxLength int
	// Timeout bounds the duration of the fetch.
	Timeout time.Duration
	// UserAgent is sent as the User-Agent by the curl, wget and native backends.
	UserAgent string
	// Headers holds extra request headers. BusyBox wget cannot send them.
	Headers map[string]string
	// FollowRedirects controls redirect handling in the native backend.
	FollowRedirects bool
	// ForceHeadless forces using headless browser even if curl succeeds
	ForceHeadless bool
	// WaitForSelector waits for a specific CSS selector before capturing content
	WaitForSelector string
	// WaitTime is additional time to wait for JS to render (default 2s for headless)
	WaitTime time.Duration
}
|
|
|
|
// DefaultFetchOptions returns sensible defaults
|
|
func DefaultFetchOptions() FetchOptions {
|
|
return FetchOptions{
|
|
MaxLength: 500000, // 500KB
|
|
Timeout: 30 * time.Second,
|
|
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
Headers: make(map[string]string),
|
|
FollowRedirects: true,
|
|
WaitTime: 2 * time.Second,
|
|
}
|
|
}
|
|
|
|
// Fetcher provides URL fetching with multiple backend support
|
|
type Fetcher struct {
|
|
curlPath string
|
|
wgetPath string
|
|
wgetIsBusyBox bool // BusyBox wget has limited options
|
|
chromePath string
|
|
httpClient *http.Client
|
|
method FetchMethod
|
|
hasChrome bool
|
|
mu sync.RWMutex
|
|
|
|
// chromedp allocator context (reused for efficiency)
|
|
allocCtx context.Context
|
|
allocCancel context.CancelFunc
|
|
}
|
|
|
|
var (
|
|
globalFetcher *Fetcher
|
|
fetcherOnce sync.Once
|
|
)
|
|
|
|
// GetFetcher returns the singleton Fetcher instance
|
|
func GetFetcher() *Fetcher {
|
|
fetcherOnce.Do(func() {
|
|
globalFetcher = NewFetcher()
|
|
})
|
|
return globalFetcher
|
|
}
|
|
|
|
// NewFetcher creates a new Fetcher, detecting available tools
|
|
func NewFetcher() *Fetcher {
|
|
f := &Fetcher{}
|
|
f.detectTools()
|
|
f.initHTTPClient()
|
|
f.initChromeDp()
|
|
return f
|
|
}
|
|
|
|
// detectTools checks which external tools are available
|
|
func (f *Fetcher) detectTools() {
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
|
|
// Check for curl
|
|
if path, err := exec.LookPath("curl"); err == nil {
|
|
f.curlPath = path
|
|
f.method = FetchMethodCurl
|
|
}
|
|
|
|
// Check for wget
|
|
if path, err := exec.LookPath("wget"); err == nil {
|
|
f.wgetPath = path
|
|
// Check if it's BusyBox wget (has limited options)
|
|
versionCmd := exec.Command(path, "--version")
|
|
versionOut, _ := versionCmd.CombinedOutput()
|
|
f.wgetIsBusyBox = strings.Contains(string(versionOut), "BusyBox")
|
|
if f.wgetIsBusyBox {
|
|
log.Printf("[Fetcher] Found BusyBox wget (limited options)")
|
|
}
|
|
if f.method == "" {
|
|
f.method = FetchMethodWget
|
|
}
|
|
}
|
|
|
|
// Check for Chrome/Chromium (for headless browser support)
|
|
chromePaths := []string{
|
|
"google-chrome",
|
|
"google-chrome-stable",
|
|
"chromium",
|
|
"chromium-browser",
|
|
"/usr/bin/google-chrome",
|
|
"/usr/bin/chromium",
|
|
"/usr/bin/chromium-browser",
|
|
"/snap/bin/chromium",
|
|
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
|
}
|
|
|
|
for _, p := range chromePaths {
|
|
if path, err := exec.LookPath(p); err == nil {
|
|
f.chromePath = path
|
|
f.hasChrome = true
|
|
log.Printf("[Fetcher] Found Chrome at: %s", path)
|
|
break
|
|
}
|
|
}
|
|
|
|
// Fall back to native if nothing else available
|
|
if f.method == "" {
|
|
f.method = FetchMethodNative
|
|
}
|
|
}
|
|
|
|
// initHTTPClient sets up the native Go HTTP client with cookie support
|
|
func (f *Fetcher) initHTTPClient() {
|
|
jar, _ := cookiejar.New(nil)
|
|
|
|
f.httpClient = &http.Client{
|
|
Jar: jar,
|
|
Timeout: 30 * time.Second,
|
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
|
if len(via) >= 10 {
|
|
return fmt.Errorf("too many redirects")
|
|
}
|
|
return nil
|
|
},
|
|
}
|
|
}
|
|
|
|
// initChromeDp initializes the chromedp allocator if Chrome is available
|
|
func (f *Fetcher) initChromeDp() {
|
|
if !f.hasChrome {
|
|
return
|
|
}
|
|
|
|
// Create a persistent allocator context for reuse
|
|
opts := append(chromedp.DefaultExecAllocatorOptions[:],
|
|
chromedp.Flag("headless", true),
|
|
chromedp.Flag("disable-gpu", true),
|
|
chromedp.Flag("no-sandbox", true),
|
|
chromedp.Flag("disable-dev-shm-usage", true),
|
|
chromedp.Flag("disable-extensions", true),
|
|
chromedp.Flag("disable-background-networking", true),
|
|
chromedp.Flag("disable-sync", true),
|
|
chromedp.Flag("disable-translate", true),
|
|
chromedp.Flag("mute-audio", true),
|
|
chromedp.Flag("hide-scrollbars", true),
|
|
chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"),
|
|
)
|
|
|
|
if f.chromePath != "" {
|
|
opts = append(opts, chromedp.ExecPath(f.chromePath))
|
|
}
|
|
|
|
f.allocCtx, f.allocCancel = chromedp.NewExecAllocator(context.Background(), opts...)
|
|
log.Printf("[Fetcher] Chrome headless browser initialized")
|
|
}
|
|
|
|
// Close cleans up resources
|
|
func (f *Fetcher) Close() {
|
|
if f.allocCancel != nil {
|
|
f.allocCancel()
|
|
}
|
|
}
|
|
|
|
// Method returns the current primary fetch method being used
|
|
func (f *Fetcher) Method() FetchMethod {
|
|
f.mu.RLock()
|
|
defer f.mu.RUnlock()
|
|
return f.method
|
|
}
|
|
|
|
// HasChrome returns whether headless Chrome is available
|
|
func (f *Fetcher) HasChrome() bool {
|
|
f.mu.RLock()
|
|
defer f.mu.RUnlock()
|
|
return f.hasChrome
|
|
}
|
|
|
|
// Fetch fetches a URL using the best available method
|
|
// For most sites, uses curl/wget. Falls back to headless browser for JS-heavy sites.
|
|
func (f *Fetcher) Fetch(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
|
|
// If force headless is set and Chrome is available, use it directly
|
|
if opts.ForceHeadless && f.hasChrome {
|
|
return f.fetchWithChrome(ctx, url, opts)
|
|
}
|
|
|
|
// Try fast methods first
|
|
result, err := f.fetchFast(ctx, url, opts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Check if content looks like a JS-rendered page that needs headless browser
|
|
if f.hasChrome && f.isJSRenderedPage(result.Content) {
|
|
log.Printf("[Fetcher] Content appears to be JS-rendered, trying headless browser for: %s", url)
|
|
headlessResult, headlessErr := f.fetchWithChrome(ctx, url, opts)
|
|
if headlessErr == nil && len(headlessResult.Content) > len(result.Content) {
|
|
return headlessResult, nil
|
|
}
|
|
// If headless failed or got less content, return original
|
|
if headlessErr != nil {
|
|
log.Printf("[Fetcher] Headless browser failed: %v, using original content", headlessErr)
|
|
}
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// fetchFast tries curl, wget, or native HTTP in order
|
|
func (f *Fetcher) fetchFast(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
|
|
f.mu.RLock()
|
|
curlPath := f.curlPath
|
|
wgetPath := f.wgetPath
|
|
method := f.method
|
|
f.mu.RUnlock()
|
|
|
|
switch method {
|
|
case FetchMethodCurl:
|
|
return f.fetchWithCurl(ctx, url, curlPath, opts)
|
|
case FetchMethodWget:
|
|
return f.fetchWithWget(ctx, url, wgetPath, opts)
|
|
default:
|
|
return f.fetchNative(ctx, url, opts)
|
|
}
|
|
}
|
|
|
|
// isJSRenderedPage checks if the content appears to be a JS-rendered page
|
|
// that hasn't actually rendered its content yet
|
|
func (f *Fetcher) isJSRenderedPage(content string) bool {
|
|
// Too short content often indicates JS rendering needed
|
|
if len(strings.TrimSpace(content)) < 500 {
|
|
return true
|
|
}
|
|
|
|
// Common patterns indicating JS-only rendering
|
|
jsPatterns := []string{
|
|
`<div id="root"></div>`,
|
|
`<div id="app"></div>`,
|
|
`<div id="__next"></div>`,
|
|
`<div id="__nuxt"></div>`,
|
|
`noscript`,
|
|
`"Loading..."`,
|
|
`"loading..."`,
|
|
`window.__INITIAL_STATE__`,
|
|
`window.__NUXT__`,
|
|
`window.__NEXT_DATA__`,
|
|
}
|
|
|
|
contentLower := strings.ToLower(content)
|
|
for _, pattern := range jsPatterns {
|
|
if strings.Contains(contentLower, strings.ToLower(pattern)) {
|
|
// Found JS pattern, but also check if there's substantial content
|
|
// Extract text content (very rough)
|
|
textContent := stripHTMLTags(content)
|
|
if len(strings.TrimSpace(textContent)) < 1000 {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check for common documentation sites that need JS
|
|
jsHeavySites := []string{
|
|
"docs.rs",
|
|
"reactjs.org",
|
|
"vuejs.org",
|
|
"angular.io",
|
|
"nextjs.org",
|
|
"vercel.com",
|
|
"netlify.com",
|
|
}
|
|
|
|
for _, site := range jsHeavySites {
|
|
if strings.Contains(content, site) {
|
|
textContent := stripHTMLTags(content)
|
|
if len(strings.TrimSpace(textContent)) < 2000 {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// Regular expressions used by stripHTMLTags. They are compiled once at
// package initialization rather than on every call.
var (
	// stripScriptRe matches <script> elements together with their content.
	stripScriptRe = regexp.MustCompile(`(?is)<script[^>]*>.*?</script>`)
	// stripStyleRe matches <style> elements together with their content.
	stripStyleRe = regexp.MustCompile(`(?is)<style[^>]*>.*?</style>`)
	// stripTagRe matches any remaining tag.
	stripTagRe = regexp.MustCompile(`<[^>]*>`)
	// stripSpaceRe matches a run of whitespace.
	stripSpaceRe = regexp.MustCompile(`\s+`)
)

// stripHTMLTags returns a rough plain-text rendering of content: script and
// style elements are dropped with their bodies, every other tag is replaced
// by a space, whitespace runs are collapsed to a single space and the result
// is trimmed. It is a heuristic, not an HTML parser.
func stripHTMLTags(content string) string {
	// Remove script and style elements first so their bodies do not count
	// as text.
	content = stripScriptRe.ReplaceAllString(content, "")
	content = stripStyleRe.ReplaceAllString(content, "")

	// Replace remaining tags with a space so adjacent words stay separated.
	content = stripTagRe.ReplaceAllString(content, " ")

	// Collapse whitespace
	content = stripSpaceRe.ReplaceAllString(content, " ")

	return strings.TrimSpace(content)
}
|
|
|
|
// fetchWithChrome uses headless Chrome to fetch and render the page
|
|
func (f *Fetcher) fetchWithChrome(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
|
|
if !f.hasChrome || f.allocCtx == nil {
|
|
return nil, fmt.Errorf("headless Chrome not available")
|
|
}
|
|
|
|
// Create a timeout context
|
|
timeout := opts.Timeout
|
|
if timeout == 0 {
|
|
timeout = 30 * time.Second
|
|
}
|
|
ctx, cancel := context.WithTimeout(ctx, timeout)
|
|
defer cancel()
|
|
|
|
// Create a new browser context from the allocator
|
|
browserCtx, browserCancel := chromedp.NewContext(f.allocCtx)
|
|
defer browserCancel()
|
|
|
|
var content string
|
|
var finalURL string
|
|
|
|
// Wait time for JS to render
|
|
waitTime := opts.WaitTime
|
|
if waitTime == 0 {
|
|
waitTime = 2 * time.Second
|
|
}
|
|
|
|
// Build the actions
|
|
actions := []chromedp.Action{
|
|
chromedp.Navigate(url),
|
|
}
|
|
|
|
// Wait for specific selector if provided
|
|
if opts.WaitForSelector != "" {
|
|
actions = append(actions, chromedp.WaitVisible(opts.WaitForSelector, chromedp.ByQuery))
|
|
} else {
|
|
// Default: wait for body to be visible and give JS time to render
|
|
actions = append(actions,
|
|
chromedp.WaitVisible("body", chromedp.ByQuery),
|
|
chromedp.Sleep(waitTime),
|
|
)
|
|
}
|
|
|
|
// Get the final URL and content
|
|
actions = append(actions,
|
|
chromedp.Location(&finalURL),
|
|
chromedp.OuterHTML("html", &content, chromedp.ByQuery),
|
|
)
|
|
|
|
// Execute
|
|
if err := chromedp.Run(browserCtx, actions...); err != nil {
|
|
return nil, fmt.Errorf("chromedp failed: %w", err)
|
|
}
|
|
|
|
// Truncate if needed
|
|
var truncated bool
|
|
var originalSize int
|
|
if len(content) > opts.MaxLength {
|
|
originalSize = len(content)
|
|
content = content[:opts.MaxLength]
|
|
truncated = true
|
|
}
|
|
|
|
return &FetchResult{
|
|
Content: content,
|
|
ContentType: "text/html",
|
|
FinalURL: finalURL,
|
|
StatusCode: 200,
|
|
Method: FetchMethodChrome,
|
|
Truncated: truncated,
|
|
OriginalSize: originalSize,
|
|
}, nil
|
|
}
|
|
|
|
// fetchWithCurl uses curl to fetch the URL
|
|
func (f *Fetcher) fetchWithCurl(ctx context.Context, url string, curlPath string, opts FetchOptions) (*FetchResult, error) {
|
|
args := []string{
|
|
"-sS", // Silent but show errors
|
|
"-L", // Follow redirects
|
|
"--max-time", fmt.Sprintf("%d", int(opts.Timeout.Seconds())),
|
|
"-A", opts.UserAgent, // User agent
|
|
"-w", "\n---CURL_INFO---\n%{content_type}\n%{url_effective}\n%{http_code}", // Output metadata
|
|
"--compressed", // Accept compressed responses
|
|
}
|
|
|
|
// Add custom headers
|
|
for key, value := range opts.Headers {
|
|
args = append(args, "-H", fmt.Sprintf("%s: %s", key, value))
|
|
}
|
|
|
|
// Add common headers for better compatibility
|
|
args = append(args,
|
|
"-H", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
|
"-H", "Accept-Language: en-US,en;q=0.5",
|
|
"-H", "DNT: 1",
|
|
"-H", "Connection: keep-alive",
|
|
"-H", "Upgrade-Insecure-Requests: 1",
|
|
)
|
|
|
|
args = append(args, url)
|
|
|
|
cmd := exec.CommandContext(ctx, curlPath, args...)
|
|
var stdout, stderr bytes.Buffer
|
|
cmd.Stdout = &stdout
|
|
cmd.Stderr = &stderr
|
|
|
|
if err := cmd.Run(); err != nil {
|
|
// Check if it's a context cancellation
|
|
if ctx.Err() != nil {
|
|
return nil, ctx.Err()
|
|
}
|
|
return nil, fmt.Errorf("curl failed: %s - %s", err.Error(), stderr.String())
|
|
}
|
|
|
|
output := stdout.String()
|
|
|
|
// Parse the output - content and metadata are separated by ---CURL_INFO---
|
|
parts := strings.Split(output, "\n---CURL_INFO---\n")
|
|
if len(parts) != 2 {
|
|
return nil, fmt.Errorf("unexpected curl output format")
|
|
}
|
|
|
|
content := parts[0]
|
|
metaLines := strings.Split(strings.TrimSpace(parts[1]), "\n")
|
|
|
|
if len(metaLines) < 3 {
|
|
return nil, fmt.Errorf("incomplete curl metadata")
|
|
}
|
|
|
|
contentType := metaLines[0]
|
|
finalURL := metaLines[1]
|
|
statusCode := 200
|
|
fmt.Sscanf(metaLines[2], "%d", &statusCode)
|
|
|
|
// Truncate content if needed
|
|
var truncated bool
|
|
var originalSize int
|
|
if len(content) > opts.MaxLength {
|
|
originalSize = len(content)
|
|
content = content[:opts.MaxLength]
|
|
truncated = true
|
|
}
|
|
|
|
return &FetchResult{
|
|
Content: content,
|
|
ContentType: contentType,
|
|
FinalURL: finalURL,
|
|
StatusCode: statusCode,
|
|
Method: FetchMethodCurl,
|
|
Truncated: truncated,
|
|
OriginalSize: originalSize,
|
|
}, nil
|
|
}
|
|
|
|
// fetchWithWget uses wget to fetch the URL
|
|
func (f *Fetcher) fetchWithWget(ctx context.Context, url string, wgetPath string, opts FetchOptions) (*FetchResult, error) {
|
|
f.mu.RLock()
|
|
isBusyBox := f.wgetIsBusyBox
|
|
f.mu.RUnlock()
|
|
|
|
var args []string
|
|
|
|
if isBusyBox {
|
|
// BusyBox wget has limited options - use short flags only
|
|
args = []string{
|
|
"-q", // Quiet
|
|
"-O", "-", // Output to stdout
|
|
"-T", fmt.Sprintf("%d", int(opts.Timeout.Seconds())), // Timeout
|
|
"-U", opts.UserAgent, // User agent
|
|
}
|
|
// BusyBox wget doesn't support custom headers or max-redirect
|
|
} else {
|
|
// GNU wget supports full options
|
|
args = []string{
|
|
"-q", // Quiet
|
|
"-O", "-", // Output to stdout
|
|
"--timeout", fmt.Sprintf("%d", int(opts.Timeout.Seconds())),
|
|
"--user-agent", opts.UserAgent,
|
|
"--max-redirect", "10", // Follow up to 10 redirects
|
|
"--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
"--header", "Accept-Language: en-US,en;q=0.5",
|
|
}
|
|
|
|
// Add custom headers (GNU wget only)
|
|
for key, value := range opts.Headers {
|
|
args = append(args, "--header", fmt.Sprintf("%s: %s", key, value))
|
|
}
|
|
}
|
|
|
|
args = append(args, url)
|
|
|
|
cmd := exec.CommandContext(ctx, wgetPath, args...)
|
|
var stdout, stderr bytes.Buffer
|
|
cmd.Stdout = &stdout
|
|
cmd.Stderr = &stderr
|
|
|
|
if err := cmd.Run(); err != nil {
|
|
if ctx.Err() != nil {
|
|
return nil, ctx.Err()
|
|
}
|
|
return nil, fmt.Errorf("wget failed: %s - %s", err.Error(), stderr.String())
|
|
}
|
|
|
|
content := stdout.String()
|
|
|
|
// Truncate content if needed
|
|
var truncated bool
|
|
var originalSize int
|
|
if len(content) > opts.MaxLength {
|
|
originalSize = len(content)
|
|
content = content[:opts.MaxLength]
|
|
truncated = true
|
|
}
|
|
|
|
// wget doesn't easily provide metadata, so we use defaults
|
|
return &FetchResult{
|
|
Content: content,
|
|
ContentType: "text/html", // Assume HTML (wget doesn't easily give us this)
|
|
FinalURL: url, // wget doesn't easily give us the final URL
|
|
StatusCode: 200,
|
|
Method: FetchMethodWget,
|
|
Truncated: truncated,
|
|
OriginalSize: originalSize,
|
|
}, nil
|
|
}
|
|
|
|
// fetchNative uses Go's native http.Client with enhanced capabilities
|
|
func (f *Fetcher) fetchNative(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
|
|
// Create request with context
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
// Set headers
|
|
req.Header.Set("User-Agent", opts.UserAgent)
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
|
|
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
|
|
req.Header.Set("Accept-Encoding", "gzip, deflate")
|
|
req.Header.Set("DNT", "1")
|
|
req.Header.Set("Connection", "keep-alive")
|
|
req.Header.Set("Upgrade-Insecure-Requests", "1")
|
|
|
|
// Add custom headers
|
|
for key, value := range opts.Headers {
|
|
req.Header.Set(key, value)
|
|
}
|
|
|
|
// Create a client with custom timeout
|
|
client := &http.Client{
|
|
Jar: f.httpClient.Jar,
|
|
Timeout: opts.Timeout,
|
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
|
if !opts.FollowRedirects {
|
|
return http.ErrUseLastResponse
|
|
}
|
|
if len(via) >= 10 {
|
|
return fmt.Errorf("too many redirects")
|
|
}
|
|
return nil
|
|
},
|
|
}
|
|
|
|
// Execute request
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Read body with limit + 1 byte to detect truncation
|
|
body, err := io.ReadAll(io.LimitReader(resp.Body, int64(opts.MaxLength)+1))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read response: %w", err)
|
|
}
|
|
|
|
var truncated bool
|
|
var originalSize int
|
|
if len(body) > opts.MaxLength {
|
|
originalSize = len(body) // Note: this is just maxLength+1, not true original
|
|
body = body[:opts.MaxLength]
|
|
truncated = true
|
|
}
|
|
|
|
return &FetchResult{
|
|
Content: string(body),
|
|
ContentType: resp.Header.Get("Content-Type"),
|
|
FinalURL: resp.Request.URL.String(),
|
|
StatusCode: resp.StatusCode,
|
|
Method: FetchMethodNative,
|
|
Truncated: truncated,
|
|
OriginalSize: originalSize,
|
|
}, nil
|
|
}
|
|
|
|
// FetchWithHeadless explicitly uses headless browser (for API use)
|
|
func (f *Fetcher) FetchWithHeadless(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
|
|
if !f.hasChrome {
|
|
return nil, fmt.Errorf("headless Chrome not available - Chrome/Chromium not found")
|
|
}
|
|
return f.fetchWithChrome(ctx, url, opts)
|
|
}
|
|
|
|
// TryFetchWithFallback attempts to fetch using all available methods
|
|
func (f *Fetcher) TryFetchWithFallback(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
|
|
f.mu.RLock()
|
|
curlPath := f.curlPath
|
|
wgetPath := f.wgetPath
|
|
hasChrome := f.hasChrome
|
|
f.mu.RUnlock()
|
|
|
|
var lastErr error
|
|
|
|
// Try curl first if available
|
|
if curlPath != "" {
|
|
result, err := f.fetchWithCurl(ctx, url, curlPath, opts)
|
|
if err == nil {
|
|
return result, nil
|
|
}
|
|
lastErr = fmt.Errorf("curl: %w", err)
|
|
}
|
|
|
|
// Try wget if available
|
|
if wgetPath != "" {
|
|
result, err := f.fetchWithWget(ctx, url, wgetPath, opts)
|
|
if err == nil {
|
|
return result, nil
|
|
}
|
|
lastErr = fmt.Errorf("wget: %w", err)
|
|
}
|
|
|
|
// Try native HTTP
|
|
result, err := f.fetchNative(ctx, url, opts)
|
|
if err == nil {
|
|
return result, nil
|
|
}
|
|
lastErr = fmt.Errorf("native: %w", err)
|
|
|
|
// Last resort: try headless Chrome
|
|
if hasChrome {
|
|
result, err := f.fetchWithChrome(ctx, url, opts)
|
|
if err == nil {
|
|
return result, nil
|
|
}
|
|
lastErr = fmt.Errorf("chrome: %w", err)
|
|
}
|
|
|
|
return nil, fmt.Errorf("all fetch methods failed: %v", lastErr)
|
|
}
|