vessel/backend/internal/api/fetcher.go
vikingowl c2214aef96 feat(tools): graceful truncation for fetch_url with LLM retry hints
- Remove curl --max-filesize to avoid hard failures on large pages
- Add Truncated/OriginalSize fields to FetchResult for all fetch methods
- Return truncation info in proxy response (truncated, originalSize, returnedSize)
- Add timeout parameter to fetch_url tool (default 30s, max 120s)
- Increase default maxLength from 5KB to 50KB, allow up to 2MB
- Include _hint in response guiding LLM to retry with larger maxLength

Instead of failing when content exceeds limits, the tool now returns
truncated content with guidance for the LLM to request more if needed.
2026-01-02 19:14:35 +01:00
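A rough sketch of how the proxy side described above might assemble its response (the handler shape and hint wording are illustrative; only the field names truncated, originalSize, returnedSize, and _hint come from this commit):

	resp := map[string]any{
		"content":      result.Content,
		"truncated":    result.Truncated,
		"originalSize": result.OriginalSize,
		"returnedSize": len(result.Content),
	}
	if result.Truncated {
		// Guide the LLM to retry with a larger maxLength (wording assumed)
		resp["_hint"] = fmt.Sprintf(
			"content truncated from %d to %d bytes; retry with a larger maxLength to see more",
			result.OriginalSize, len(result.Content))
	}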

package api
import (
"bytes"
"context"
"fmt"
"io"
"log"
"net/http"
"net/http/cookiejar"
"os/exec"
"regexp"
"strings"
"sync"
"time"
"github.com/chromedp/chromedp"
)
// FetchMethod represents the method used to fetch URLs
type FetchMethod string
const (
FetchMethodCurl FetchMethod = "curl"
FetchMethodWget FetchMethod = "wget"
FetchMethodChrome FetchMethod = "chrome"
FetchMethodNative FetchMethod = "native"
)
// FetchResult contains the result of a URL fetch
type FetchResult struct {
Content string
ContentType string
FinalURL string
StatusCode int
Method FetchMethod
Truncated bool // True if content was truncated due to MaxLength
OriginalSize int // Original size before truncation (0 if not truncated)
}
// FetchOptions configures the fetch behavior
type FetchOptions struct {
MaxLength int
Timeout time.Duration
UserAgent string
Headers map[string]string
FollowRedirects bool
// ForceHeadless forces using headless browser even if curl succeeds
ForceHeadless bool
// WaitForSelector waits for a specific CSS selector before capturing content
WaitForSelector string
// WaitTime is additional time to wait for JS to render (default 2s for headless)
WaitTime time.Duration
}
// DefaultFetchOptions returns sensible defaults
func DefaultFetchOptions() FetchOptions {
return FetchOptions{
MaxLength: 500000, // 500KB
Timeout: 30 * time.Second,
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
Headers: make(map[string]string),
FollowRedirects: true,
WaitTime: 2 * time.Second,
}
}
// Fetcher provides URL fetching with multiple backend support
type Fetcher struct {
curlPath string
wgetPath string
wgetIsBusyBox bool // BusyBox wget has limited options
chromePath string
httpClient *http.Client
method FetchMethod
hasChrome bool
mu sync.RWMutex
// chromedp allocator context (reused for efficiency)
allocCtx context.Context
allocCancel context.CancelFunc
}
var (
globalFetcher *Fetcher
fetcherOnce sync.Once
)
// GetFetcher returns the singleton Fetcher instance
func GetFetcher() *Fetcher {
fetcherOnce.Do(func() {
globalFetcher = NewFetcher()
})
return globalFetcher
}
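// Example (illustrative, not part of this file): typical callers grab the
// singleton and override only the defaults they care about.
//
//	f := GetFetcher()
//	opts := DefaultFetchOptions()
//	opts.MaxLength = 50 * 1024 // cap the returned content
//	opts.Timeout = 60 * time.Second
//	res, err := f.Fetch(context.Background(), "https://example.com", opts)
//	if err == nil && res.Truncated {
//	    log.Printf("got %d of %d bytes", len(res.Content), res.OriginalSize)
//	}
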
// NewFetcher creates a new Fetcher, detecting available tools
func NewFetcher() *Fetcher {
f := &Fetcher{}
f.detectTools()
f.initHTTPClient()
f.initChromeDp()
return f
}
// detectTools checks which external tools are available
func (f *Fetcher) detectTools() {
f.mu.Lock()
defer f.mu.Unlock()
// Check for curl
if path, err := exec.LookPath("curl"); err == nil {
f.curlPath = path
f.method = FetchMethodCurl
}
// Check for wget
if path, err := exec.LookPath("wget"); err == nil {
f.wgetPath = path
// Check if it's BusyBox wget (has limited options)
versionCmd := exec.Command(path, "--version")
versionOut, _ := versionCmd.CombinedOutput()
f.wgetIsBusyBox = strings.Contains(string(versionOut), "BusyBox")
if f.wgetIsBusyBox {
log.Printf("[Fetcher] Found BusyBox wget (limited options)")
}
if f.method == "" {
f.method = FetchMethodWget
}
}
// Check for Chrome/Chromium (for headless browser support)
chromePaths := []string{
"google-chrome",
"google-chrome-stable",
"chromium",
"chromium-browser",
"/usr/bin/google-chrome",
"/usr/bin/chromium",
"/usr/bin/chromium-browser",
"/snap/bin/chromium",
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
}
for _, p := range chromePaths {
if path, err := exec.LookPath(p); err == nil {
f.chromePath = path
f.hasChrome = true
log.Printf("[Fetcher] Found Chrome at: %s", path)
break
}
}
// Fall back to native if nothing else available
if f.method == "" {
f.method = FetchMethodNative
}
}
// initHTTPClient sets up the native Go HTTP client with cookie support
func (f *Fetcher) initHTTPClient() {
jar, _ := cookiejar.New(nil)
f.httpClient = &http.Client{
Jar: jar,
Timeout: 30 * time.Second,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 10 {
return fmt.Errorf("too many redirects")
}
return nil
},
}
}
// initChromeDp initializes the chromedp allocator if Chrome is available
func (f *Fetcher) initChromeDp() {
if !f.hasChrome {
return
}
// Create a persistent allocator context for reuse
opts := append(chromedp.DefaultExecAllocatorOptions[:],
chromedp.Flag("headless", true),
chromedp.Flag("disable-gpu", true),
chromedp.Flag("no-sandbox", true),
chromedp.Flag("disable-dev-shm-usage", true),
chromedp.Flag("disable-extensions", true),
chromedp.Flag("disable-background-networking", true),
chromedp.Flag("disable-sync", true),
chromedp.Flag("disable-translate", true),
chromedp.Flag("mute-audio", true),
chromedp.Flag("hide-scrollbars", true),
chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"),
)
if f.chromePath != "" {
opts = append(opts, chromedp.ExecPath(f.chromePath))
}
f.allocCtx, f.allocCancel = chromedp.NewExecAllocator(context.Background(), opts...)
log.Printf("[Fetcher] Chrome headless browser initialized")
}
// Close cleans up resources
func (f *Fetcher) Close() {
if f.allocCancel != nil {
f.allocCancel()
}
}
// Method returns the current primary fetch method being used
func (f *Fetcher) Method() FetchMethod {
f.mu.RLock()
defer f.mu.RUnlock()
return f.method
}
// HasChrome returns whether headless Chrome is available
func (f *Fetcher) HasChrome() bool {
f.mu.RLock()
defer f.mu.RUnlock()
return f.hasChrome
}
// Fetch fetches a URL using the best available method
// For most sites, uses curl/wget. Falls back to headless browser for JS-heavy sites.
func (f *Fetcher) Fetch(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
// If force headless is set and Chrome is available, use it directly
if opts.ForceHeadless && f.hasChrome {
return f.fetchWithChrome(ctx, url, opts)
}
// Try fast methods first
result, err := f.fetchFast(ctx, url, opts)
if err != nil {
return nil, err
}
// Check if content looks like a JS-rendered page that needs headless browser
if f.hasChrome && f.isJSRenderedPage(result.Content) {
log.Printf("[Fetcher] Content appears to be JS-rendered, trying headless browser for: %s", url)
headlessResult, headlessErr := f.fetchWithChrome(ctx, url, opts)
if headlessErr == nil && len(headlessResult.Content) > len(result.Content) {
return headlessResult, nil
}
// If headless failed or got less content, return original
if headlessErr != nil {
log.Printf("[Fetcher] Headless browser failed: %v, using original content", headlessErr)
}
}
return result, nil
}
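// Illustrative only: forcing the headless path for a known JS-heavy page
// (the URL and selector below are placeholders).
//
//	opts := DefaultFetchOptions()
//	opts.ForceHeadless = true
//	opts.WaitForSelector = "#main-content"
//	res, err := GetFetcher().Fetch(ctx, "https://spa.example.com/docs", opts)
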
// fetchFast tries curl, wget, or native HTTP in order
func (f *Fetcher) fetchFast(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
f.mu.RLock()
curlPath := f.curlPath
wgetPath := f.wgetPath
method := f.method
f.mu.RUnlock()
switch method {
case FetchMethodCurl:
return f.fetchWithCurl(ctx, url, curlPath, opts)
case FetchMethodWget:
return f.fetchWithWget(ctx, url, wgetPath, opts)
default:
return f.fetchNative(ctx, url, opts)
}
}
// isJSRenderedPage reports whether the fetched HTML looks like a client-side
// app shell whose real content has not been rendered yet (i.e. it needs JavaScript)
func (f *Fetcher) isJSRenderedPage(content string) bool {
// Very short content usually means the page still needs JavaScript to render
if len(strings.TrimSpace(content)) < 500 {
return true
}
// Common patterns indicating JS-only rendering
jsPatterns := []string{
`<div id="root"></div>`,
`<div id="app"></div>`,
`<div id="__next"></div>`,
`<div id="__nuxt"></div>`,
`noscript`,
`"Loading..."`,
`"loading..."`,
`window.__INITIAL_STATE__`,
`window.__NUXT__`,
`window.__NEXT_DATA__`,
}
contentLower := strings.ToLower(content)
for _, pattern := range jsPatterns {
if strings.Contains(contentLower, strings.ToLower(pattern)) {
// Found JS pattern, but also check if there's substantial content
// Extract text content (very rough)
textContent := stripHTMLTags(content)
if len(strings.TrimSpace(textContent)) < 1000 {
return true
}
}
}
// Check for common documentation sites that need JS
jsHeavySites := []string{
"docs.rs",
"reactjs.org",
"vuejs.org",
"angular.io",
"nextjs.org",
"vercel.com",
"netlify.com",
}
for _, site := range jsHeavySites {
if strings.Contains(content, site) {
textContent := stripHTMLTags(content)
if len(strings.TrimSpace(textContent)) < 2000 {
return true
}
}
}
return false
}
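// For illustration, a bare SPA shell like the following would be flagged and
// re-fetched with headless Chrome: it matches `<div id="root"></div>` and
// carries almost no text once tags are stripped.
//
//	<!doctype html><html><head><title>App</title></head>
//	<body><div id="root"></div><script src="/bundle.js"></script></body></html>
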
// stripHTMLTags removes HTML tags from content (rough extraction)
func stripHTMLTags(content string) string {
// Remove script and style tags with their content
scriptRe := regexp.MustCompile(`(?is)<script[^>]*>.*?</script>`)
content = scriptRe.ReplaceAllString(content, "")
styleRe := regexp.MustCompile(`(?is)<style[^>]*>.*?</style>`)
content = styleRe.ReplaceAllString(content, "")
// Remove all remaining tags
tagRe := regexp.MustCompile(`<[^>]*>`)
content = tagRe.ReplaceAllString(content, " ")
// Collapse whitespace
spaceRe := regexp.MustCompile(`\s+`)
content = spaceRe.ReplaceAllString(content, " ")
return strings.TrimSpace(content)
}
// fetchWithChrome uses headless Chrome to fetch and render the page
func (f *Fetcher) fetchWithChrome(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
if !f.hasChrome || f.allocCtx == nil {
return nil, fmt.Errorf("headless Chrome not available")
}
// Create a timeout context
timeout := opts.Timeout
if timeout == 0 {
timeout = 30 * time.Second
}
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
// Create a new browser context from the allocator
browserCtx, browserCancel := chromedp.NewContext(f.allocCtx)
defer browserCancel()
var content string
var finalURL string
// Wait time for JS to render
waitTime := opts.WaitTime
if waitTime == 0 {
waitTime = 2 * time.Second
}
// Build the actions
actions := []chromedp.Action{
chromedp.Navigate(url),
}
// Wait for specific selector if provided
if opts.WaitForSelector != "" {
actions = append(actions, chromedp.WaitVisible(opts.WaitForSelector, chromedp.ByQuery))
} else {
// Default: wait for body to be visible and give JS time to render
actions = append(actions,
chromedp.WaitVisible("body", chromedp.ByQuery),
chromedp.Sleep(waitTime),
)
}
// Get the final URL and content
actions = append(actions,
chromedp.Location(&finalURL),
chromedp.OuterHTML("html", &content, chromedp.ByQuery),
)
// Execute
if err := chromedp.Run(browserCtx, actions...); err != nil {
return nil, fmt.Errorf("chromedp failed: %w", err)
}
// Truncate if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: content,
ContentType: "text/html",
FinalURL: finalURL,
StatusCode: 200,
Method: FetchMethodChrome,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// fetchWithCurl uses curl to fetch the URL
func (f *Fetcher) fetchWithCurl(ctx context.Context, url string, curlPath string, opts FetchOptions) (*FetchResult, error) {
args := []string{
"-sS", // Silent but show errors
"-L", // Follow redirects
"--max-time", fmt.Sprintf("%d", int(opts.Timeout.Seconds())),
"-A", opts.UserAgent, // User agent
"-w", "\n---CURL_INFO---\n%{content_type}\n%{url_effective}\n%{http_code}", // Output metadata
"--compressed", // Accept compressed responses
}
// Add custom headers
for key, value := range opts.Headers {
args = append(args, "-H", fmt.Sprintf("%s: %s", key, value))
}
// Add common headers for better compatibility
args = append(args,
"-H", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"-H", "Accept-Language: en-US,en;q=0.5",
"-H", "DNT: 1",
"-H", "Connection: keep-alive",
"-H", "Upgrade-Insecure-Requests: 1",
)
args = append(args, url)
cmd := exec.CommandContext(ctx, curlPath, args...)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
// Check if it's a context cancellation
if ctx.Err() != nil {
return nil, ctx.Err()
}
return nil, fmt.Errorf("curl failed: %s - %s", err.Error(), stderr.String())
}
output := stdout.String()
// Parse the output - content and metadata are separated by ---CURL_INFO---
parts := strings.Split(output, "\n---CURL_INFO---\n")
if len(parts) != 2 {
return nil, fmt.Errorf("unexpected curl output format")
}
content := parts[0]
metaLines := strings.Split(strings.TrimSpace(parts[1]), "\n")
if len(metaLines) < 3 {
return nil, fmt.Errorf("incomplete curl metadata")
}
contentType := metaLines[0]
finalURL := metaLines[1]
statusCode := 200
fmt.Sscanf(metaLines[2], "%d", &statusCode)
// Truncate content if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: content,
ContentType: contentType,
FinalURL: finalURL,
StatusCode: statusCode,
Method: FetchMethodCurl,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// fetchWithWget uses wget to fetch the URL
func (f *Fetcher) fetchWithWget(ctx context.Context, url string, wgetPath string, opts FetchOptions) (*FetchResult, error) {
f.mu.RLock()
isBusyBox := f.wgetIsBusyBox
f.mu.RUnlock()
var args []string
if isBusyBox {
// BusyBox wget has limited options - use short flags only
args = []string{
"-q", // Quiet
"-O", "-", // Output to stdout
"-T", fmt.Sprintf("%d", int(opts.Timeout.Seconds())), // Timeout
"-U", opts.UserAgent, // User agent
}
// BusyBox wget doesn't support custom headers or max-redirect
} else {
// GNU wget supports full options
args = []string{
"-q", // Quiet
"-O", "-", // Output to stdout
"--timeout", fmt.Sprintf("%d", int(opts.Timeout.Seconds())),
"--user-agent", opts.UserAgent,
"--max-redirect", "10", // Follow up to 10 redirects
"--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"--header", "Accept-Language: en-US,en;q=0.5",
}
// Add custom headers (GNU wget only)
for key, value := range opts.Headers {
args = append(args, "--header", fmt.Sprintf("%s: %s", key, value))
}
}
args = append(args, url)
cmd := exec.CommandContext(ctx, wgetPath, args...)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
if ctx.Err() != nil {
return nil, ctx.Err()
}
return nil, fmt.Errorf("wget failed: %s - %s", err.Error(), stderr.String())
}
content := stdout.String()
// Truncate content if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
// wget doesn't easily provide metadata, so we use defaults
return &FetchResult{
Content: content,
ContentType: "text/html", // Assume HTML (wget doesn't easily give us this)
FinalURL: url, // wget doesn't easily give us the final URL
StatusCode: 200,
Method: FetchMethodWget,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// fetchNative uses Go's native http.Client with enhanced capabilities
func (f *Fetcher) fetchNative(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
// Create request with context
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
// Set headers
req.Header.Set("User-Agent", opts.UserAgent)
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
req.Header.Set("Accept-Encoding", "gzip, deflate")
req.Header.Set("DNT", "1")
req.Header.Set("Connection", "keep-alive")
req.Header.Set("Upgrade-Insecure-Requests", "1")
// Add custom headers
for key, value := range opts.Headers {
req.Header.Set(key, value)
}
// Create a client with custom timeout
client := &http.Client{
Jar: f.httpClient.Jar,
Timeout: opts.Timeout,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if !opts.FollowRedirects {
return http.ErrUseLastResponse
}
if len(via) >= 10 {
return fmt.Errorf("too many redirects")
}
return nil
},
}
// Execute request
resp, err := client.Do(req)
if err != nil {
return nil, fmt.Errorf("request failed: %w", err)
}
defer resp.Body.Close()
// Read body with limit + 1 byte to detect truncation
body, err := io.ReadAll(io.LimitReader(resp.Body, int64(opts.MaxLength)+1))
if err != nil {
return nil, fmt.Errorf("failed to read response: %w", err)
}
var truncated bool
var originalSize int
if len(body) > opts.MaxLength {
originalSize = len(body) // lower bound only (MaxLength+1); the full size was never read
body = body[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: string(body),
ContentType: resp.Header.Get("Content-Type"),
FinalURL: resp.Request.URL.String(),
StatusCode: resp.StatusCode,
Method: FetchMethodNative,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// FetchWithHeadless explicitly uses headless browser (for API use)
func (f *Fetcher) FetchWithHeadless(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
if !f.hasChrome {
return nil, fmt.Errorf("headless Chrome not available - Chrome/Chromium not found")
}
return f.fetchWithChrome(ctx, url, opts)
}
// TryFetchWithFallback attempts to fetch using all available methods
func (f *Fetcher) TryFetchWithFallback(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
f.mu.RLock()
curlPath := f.curlPath
wgetPath := f.wgetPath
hasChrome := f.hasChrome
f.mu.RUnlock()
var lastErr error
// Try curl first if available
if curlPath != "" {
result, err := f.fetchWithCurl(ctx, url, curlPath, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("curl: %w", err)
}
// Try wget if available
if wgetPath != "" {
result, err := f.fetchWithWget(ctx, url, wgetPath, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("wget: %w", err)
}
// Try native HTTP
result, err := f.fetchNative(ctx, url, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("native: %w", err)
// Last resort: try headless Chrome
if hasChrome {
result, err := f.fetchWithChrome(ctx, url, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("chrome: %w", err)
}
return nil, fmt.Errorf("all fetch methods failed: %v", lastErr)
}
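// Illustrative retry loop matching the truncation hints described in the commit
// message (the retry count and growth factor are assumptions, not part of this file):
//
//	opts := DefaultFetchOptions()
//	opts.MaxLength = 50 * 1024
//	var res *FetchResult
//	var err error
//	for i := 0; i < 3; i++ {
//	    res, err = GetFetcher().TryFetchWithFallback(ctx, url, opts)
//	    if err != nil || !res.Truncated {
//	        break
//	    }
//	    opts.MaxLength *= 4 // grow the cap and retry, as the _hint suggests
//	}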