feat(tools): graceful truncation for fetch_url with LLM retry hints
- Remove curl --max-filesize to avoid hard failures on large pages
- Add Truncated/OriginalSize fields to FetchResult for all fetch methods
- Return truncation info in proxy response (truncated, originalSize, returnedSize)
- Add timeout parameter to fetch_url tool (default 30s, max 120s)
- Increase default maxLength from 5KB to 50KB, allow up to 2MB
- Include _hint in response guiding LLM to retry with larger maxLength

Instead of failing when content exceeds limits, the tool now returns
truncated content with guidance for the LLM to request more if needed.
This commit is contained in:
@@ -29,11 +29,13 @@ const (
|
||||
|
||||
// FetchResult contains the result of a URL fetch
|
||||
type FetchResult struct {
|
||||
Content string
|
||||
ContentType string
|
||||
FinalURL string
|
||||
StatusCode int
|
||||
Method FetchMethod
|
||||
Content string
|
||||
ContentType string
|
||||
FinalURL string
|
||||
StatusCode int
|
||||
Method FetchMethod
|
||||
Truncated bool // True if content was truncated due to MaxLength
|
||||
OriginalSize int // Original size before truncation (0 if not truncated)
|
||||
}
|
||||
|
||||
// FetchOptions configures the fetch behavior
|
||||
@@ -401,16 +403,22 @@ func (f *Fetcher) fetchWithChrome(ctx context.Context, url string, opts FetchOpt
|
||||
}
|
||||
|
||||
// Truncate if needed
|
||||
var truncated bool
|
||||
var originalSize int
|
||||
if len(content) > opts.MaxLength {
|
||||
originalSize = len(content)
|
||||
content = content[:opts.MaxLength]
|
||||
truncated = true
|
||||
}
|
||||
|
||||
return &FetchResult{
|
||||
Content: content,
|
||||
ContentType: "text/html",
|
||||
FinalURL: finalURL,
|
||||
StatusCode: 200,
|
||||
Method: FetchMethodChrome,
|
||||
Content: content,
|
||||
ContentType: "text/html",
|
||||
FinalURL: finalURL,
|
||||
StatusCode: 200,
|
||||
Method: FetchMethodChrome,
|
||||
Truncated: truncated,
|
||||
OriginalSize: originalSize,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -420,7 +428,6 @@ func (f *Fetcher) fetchWithCurl(ctx context.Context, url string, curlPath string
|
||||
"-sS", // Silent but show errors
|
||||
"-L", // Follow redirects
|
||||
"--max-time", fmt.Sprintf("%d", int(opts.Timeout.Seconds())),
|
||||
"--max-filesize", fmt.Sprintf("%d", opts.MaxLength),
|
||||
"-A", opts.UserAgent, // User agent
|
||||
"-w", "\n---CURL_INFO---\n%{content_type}\n%{url_effective}\n%{http_code}", // Output metadata
|
||||
"--compressed", // Accept compressed responses
|
||||
@@ -476,16 +483,22 @@ func (f *Fetcher) fetchWithCurl(ctx context.Context, url string, curlPath string
|
||||
fmt.Sscanf(metaLines[2], "%d", &statusCode)
|
||||
|
||||
// Truncate content if needed
|
||||
var truncated bool
|
||||
var originalSize int
|
||||
if len(content) > opts.MaxLength {
|
||||
originalSize = len(content)
|
||||
content = content[:opts.MaxLength]
|
||||
truncated = true
|
||||
}
|
||||
|
||||
return &FetchResult{
|
||||
Content: content,
|
||||
ContentType: contentType,
|
||||
FinalURL: finalURL,
|
||||
StatusCode: statusCode,
|
||||
Method: FetchMethodCurl,
|
||||
Content: content,
|
||||
ContentType: contentType,
|
||||
FinalURL: finalURL,
|
||||
StatusCode: statusCode,
|
||||
Method: FetchMethodCurl,
|
||||
Truncated: truncated,
|
||||
OriginalSize: originalSize,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -541,17 +554,23 @@ func (f *Fetcher) fetchWithWget(ctx context.Context, url string, wgetPath string
|
||||
content := stdout.String()
|
||||
|
||||
// Truncate content if needed
|
||||
var truncated bool
|
||||
var originalSize int
|
||||
if len(content) > opts.MaxLength {
|
||||
originalSize = len(content)
|
||||
content = content[:opts.MaxLength]
|
||||
truncated = true
|
||||
}
|
||||
|
||||
// wget doesn't easily provide metadata, so we use defaults
|
||||
return &FetchResult{
|
||||
Content: content,
|
||||
ContentType: "text/html", // Assume HTML (wget doesn't easily give us this)
|
||||
FinalURL: url, // wget doesn't easily give us the final URL
|
||||
StatusCode: 200,
|
||||
Method: FetchMethodWget,
|
||||
Content: content,
|
||||
ContentType: "text/html", // Assume HTML (wget doesn't easily give us this)
|
||||
FinalURL: url, // wget doesn't easily give us the final URL
|
||||
StatusCode: 200,
|
||||
Method: FetchMethodWget,
|
||||
Truncated: truncated,
|
||||
OriginalSize: originalSize,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -599,18 +618,28 @@ func (f *Fetcher) fetchNative(ctx context.Context, url string, opts FetchOptions
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Read body with limit
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, int64(opts.MaxLength)))
|
||||
// Read body with limit + 1 byte to detect truncation
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, int64(opts.MaxLength)+1))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read response: %w", err)
|
||||
}
|
||||
|
||||
var truncated bool
|
||||
var originalSize int
|
||||
if len(body) > opts.MaxLength {
|
||||
originalSize = len(body) // Note: this is just maxLength+1, not true original
|
||||
body = body[:opts.MaxLength]
|
||||
truncated = true
|
||||
}
|
||||
|
||||
return &FetchResult{
|
||||
Content: string(body),
|
||||
ContentType: resp.Header.Get("Content-Type"),
|
||||
FinalURL: resp.Request.URL.String(),
|
||||
StatusCode: resp.StatusCode,
|
||||
Method: FetchMethodNative,
|
||||
Content: string(body),
|
||||
ContentType: resp.Header.Get("Content-Type"),
|
||||
FinalURL: resp.Request.URL.String(),
|
||||
StatusCode: resp.StatusCode,
|
||||
Method: FetchMethodNative,
|
||||
Truncated: truncated,
|
||||
OriginalSize: originalSize,
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
// URLFetchRequest is the JSON payload accepted by the URL fetch proxy
// endpoint. URL is mandatory; MaxLength and Timeout fall back to server
// defaults when zero or out of range.
type URLFetchRequest struct {
	URL       string `json:"url" binding:"required"` // Target URL to fetch (required)
	MaxLength int    `json:"maxLength"`              // Max bytes to return (0 = server default, capped server-side)
	Timeout   int    `json:"timeout"`                // Timeout in seconds (0 = server default, capped server-side)
}
|
||||
|
||||
// URLFetchProxyHandler returns a handler that fetches URLs for the frontend
|
||||
@@ -42,10 +43,16 @@ func URLFetchProxyHandler() gin.HandlerFunc {
|
||||
|
||||
// Set up fetch options
|
||||
opts := DefaultFetchOptions()
|
||||
opts.Timeout = 30 * time.Second
|
||||
|
||||
// Set max length (default 500KB)
|
||||
if req.MaxLength > 0 && req.MaxLength <= 500000 {
|
||||
// Set timeout (default 30s, max 120s)
|
||||
if req.Timeout > 0 && req.Timeout <= 120 {
|
||||
opts.Timeout = time.Duration(req.Timeout) * time.Second
|
||||
} else {
|
||||
opts.Timeout = 30 * time.Second
|
||||
}
|
||||
|
||||
// Set max length (default 500KB, max 2MB)
|
||||
if req.MaxLength > 0 && req.MaxLength <= 2000000 {
|
||||
opts.MaxLength = req.MaxLength
|
||||
}
|
||||
|
||||
@@ -66,13 +73,22 @@ func URLFetchProxyHandler() gin.HandlerFunc {
|
||||
}
|
||||
|
||||
// Return the content
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
response := gin.H{
|
||||
"content": result.Content,
|
||||
"contentType": result.ContentType,
|
||||
"url": result.FinalURL,
|
||||
"status": result.StatusCode,
|
||||
"fetchMethod": string(result.Method),
|
||||
})
|
||||
}
|
||||
|
||||
// Include truncation info if content was truncated
|
||||
if result.Truncated {
|
||||
response["truncated"] = true
|
||||
response["originalSize"] = result.OriginalSize
|
||||
response["returnedSize"] = len(result.Content)
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, response)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -291,13 +291,14 @@ interface FetchUrlArgs {
|
||||
url: string;
|
||||
extract?: 'text' | 'title' | 'links' | 'all';
|
||||
maxLength?: number;
|
||||
timeout?: number;
|
||||
}
|
||||
|
||||
const fetchUrlDefinition: ToolDefinition = {
|
||||
type: 'function',
|
||||
function: {
|
||||
name: 'fetch_url',
|
||||
description: 'Fetches and reads content from a specific URL. Use after web_search to read full content from a result URL, or when user provides a URL directly.',
|
||||
description: 'Fetches and reads content from a URL. If content is truncated, you can retry with a larger maxLength. Use after web_search to read full content, or when user provides a URL directly.',
|
||||
parameters: {
|
||||
type: 'object',
|
||||
properties: {
|
||||
@@ -312,7 +313,11 @@ const fetchUrlDefinition: ToolDefinition = {
|
||||
},
|
||||
maxLength: {
|
||||
type: 'number',
|
||||
description: 'Max text length (default: 5000)'
|
||||
description: 'Max content length in bytes. Start with 50000, increase to 200000 or 500000 if truncated. Max: 2000000'
|
||||
},
|
||||
timeout: {
|
||||
type: 'number',
|
||||
description: 'Request timeout in seconds (default: 30, max: 120). Increase for slow sites.'
|
||||
}
|
||||
},
|
||||
required: ['url']
|
||||
@@ -320,21 +325,35 @@ const fetchUrlDefinition: ToolDefinition = {
|
||||
}
|
||||
};
|
||||
|
||||
interface ProxyFetchResult {
|
||||
html: string;
|
||||
finalUrl: string;
|
||||
truncated?: boolean;
|
||||
originalSize?: number;
|
||||
returnedSize?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Try to fetch URL via backend proxy first (bypasses CORS), fall back to direct fetch
|
||||
*/
|
||||
async function fetchViaProxy(url: string, maxLength: number): Promise<{ html: string; finalUrl: string } | { error: string }> {
|
||||
async function fetchViaProxy(url: string, maxLength: number, timeout: number): Promise<ProxyFetchResult | { error: string }> {
|
||||
// Try backend proxy first
|
||||
try {
|
||||
const proxyResponse = await fetch('/api/v1/proxy/fetch', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ url, maxLength })
|
||||
body: JSON.stringify({ url, maxLength, timeout })
|
||||
});
|
||||
|
||||
if (proxyResponse.ok) {
|
||||
const data = await proxyResponse.json();
|
||||
return { html: data.content, finalUrl: data.url };
|
||||
return {
|
||||
html: data.content,
|
||||
finalUrl: data.url,
|
||||
truncated: data.truncated,
|
||||
originalSize: data.originalSize,
|
||||
returnedSize: data.returnedSize
|
||||
};
|
||||
}
|
||||
|
||||
// If proxy returns an error, extract it
|
||||
@@ -380,7 +399,7 @@ async function fetchViaProxy(url: string, maxLength: number): Promise<{ html: st
|
||||
}
|
||||
|
||||
const fetchUrlHandler: BuiltinToolHandler<FetchUrlArgs> = async (args) => {
|
||||
const { url, extract = 'text', maxLength = 5000 } = args;
|
||||
const { url, extract = 'text', maxLength = 50000, timeout = 30 } = args;
|
||||
|
||||
try {
|
||||
const parsedUrl = new URL(url);
|
||||
@@ -389,12 +408,12 @@ const fetchUrlHandler: BuiltinToolHandler<FetchUrlArgs> = async (args) => {
|
||||
}
|
||||
|
||||
// Fetch via proxy or direct
|
||||
const result = await fetchViaProxy(url, maxLength);
|
||||
const result = await fetchViaProxy(url, maxLength, timeout);
|
||||
if ('error' in result) {
|
||||
return result;
|
||||
}
|
||||
|
||||
const { html, finalUrl } = result;
|
||||
const { html, finalUrl, truncated, originalSize, returnedSize } = result;
|
||||
|
||||
const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
|
||||
const title = titleMatch ? titleMatch[1].trim() : null;
|
||||
@@ -419,21 +438,34 @@ const fetchUrlHandler: BuiltinToolHandler<FetchUrlArgs> = async (args) => {
|
||||
})).filter(link => link.url && !link.url.startsWith('#'));
|
||||
|
||||
if (extract === 'links') {
|
||||
return links;
|
||||
return truncated
|
||||
? { links, warning: `Content was truncated (${returnedSize ?? maxLength} bytes). Some links may be missing.` }
|
||||
: links;
|
||||
}
|
||||
|
||||
const text = stripHtml(html).substring(0, maxLength);
|
||||
|
||||
// Build response with truncation info
|
||||
const buildResponse = (data: unknown) => {
|
||||
if (!truncated) return data;
|
||||
const suggestedSize = originalSize ? Math.min(originalSize * 2, 2000000) : maxLength * 2;
|
||||
return {
|
||||
...(typeof data === 'object' ? data : { content: data }),
|
||||
_truncated: true,
|
||||
_hint: `Content truncated to ${returnedSize ?? maxLength} bytes. ${originalSize ? `Original was ${originalSize} bytes. ` : ''}Retry with larger maxLength (e.g., ${suggestedSize}) to get full content.`
|
||||
};
|
||||
};
|
||||
|
||||
if (extract === 'text') {
|
||||
return text;
|
||||
return buildResponse(text);
|
||||
}
|
||||
|
||||
return {
|
||||
return buildResponse({
|
||||
title,
|
||||
text,
|
||||
links: links.slice(0, 20),
|
||||
url: finalUrl
|
||||
};
|
||||
});
|
||||
} catch (error) {
|
||||
return { error: `Failed to fetch URL: ${error instanceof Error ? error.message : 'Unknown error'}` };
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user