feat(tools): graceful truncation for fetch_url with LLM retry hints

- Remove curl --max-filesize to avoid hard failures on large pages
- Add Truncated/OriginalSize fields to FetchResult for all fetch methods
- Return truncation info in proxy response (truncated, originalSize, returnedSize)
- Add timeout parameter to fetch_url tool (default 30s, max 120s)
- Increase default maxLength from 5KB to 50KB, allow up to 2MB
- Include _hint in response guiding LLM to retry with larger maxLength

Instead of failing when content exceeds limits, the tool now returns
truncated content with guidance for the LLM to request more if needed.
This commit is contained in:
2026-01-02 19:14:35 +01:00
parent 686c59c6bc
commit c2214aef96
3 changed files with 122 additions and 45 deletions

View File

@@ -29,11 +29,13 @@ const (
// FetchResult contains the result of a URL fetch.
//
// The rendered diff lost its +/- markers and fused the pre-change and
// post-change field lists (duplicate field names, invalid Go); this is the
// post-change struct the diff intends.
type FetchResult struct {
	Content      string      // fetched body, possibly cut down to opts.MaxLength
	ContentType  string      // MIME type reported by the server (or assumed by the backend)
	FinalURL     string      // URL after following redirects
	StatusCode   int         // HTTP status code of the final response
	Method       FetchMethod // which fetch backend (chrome/curl/wget/native) produced this result
	Truncated    bool        // True if content was truncated due to MaxLength
	OriginalSize int         // Original size before truncation (0 if not truncated)
}
// FetchOptions configures the fetch behavior
@@ -401,16 +403,22 @@ func (f *Fetcher) fetchWithChrome(ctx context.Context, url string, opts FetchOpt
}
// Truncate if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: content,
ContentType: "text/html",
FinalURL: finalURL,
StatusCode: 200,
Method: FetchMethodChrome,
Content: content,
ContentType: "text/html",
FinalURL: finalURL,
StatusCode: 200,
Method: FetchMethodChrome,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
@@ -420,7 +428,6 @@ func (f *Fetcher) fetchWithCurl(ctx context.Context, url string, curlPath string
"-sS", // Silent but show errors
"-L", // Follow redirects
"--max-time", fmt.Sprintf("%d", int(opts.Timeout.Seconds())),
"--max-filesize", fmt.Sprintf("%d", opts.MaxLength),
"-A", opts.UserAgent, // User agent
"-w", "\n---CURL_INFO---\n%{content_type}\n%{url_effective}\n%{http_code}", // Output metadata
"--compressed", // Accept compressed responses
@@ -476,16 +483,22 @@ func (f *Fetcher) fetchWithCurl(ctx context.Context, url string, curlPath string
fmt.Sscanf(metaLines[2], "%d", &statusCode)
// Truncate content if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: content,
ContentType: contentType,
FinalURL: finalURL,
StatusCode: statusCode,
Method: FetchMethodCurl,
Content: content,
ContentType: contentType,
FinalURL: finalURL,
StatusCode: statusCode,
Method: FetchMethodCurl,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
@@ -541,17 +554,23 @@ func (f *Fetcher) fetchWithWget(ctx context.Context, url string, wgetPath string
content := stdout.String()
// Truncate content if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
// wget doesn't easily provide metadata, so we use defaults
return &FetchResult{
Content: content,
ContentType: "text/html", // Assume HTML (wget doesn't easily give us this)
FinalURL: url, // wget doesn't easily give us the final URL
StatusCode: 200,
Method: FetchMethodWget,
Content: content,
ContentType: "text/html", // Assume HTML (wget doesn't easily give us this)
FinalURL: url, // wget doesn't easily give us the final URL
StatusCode: 200,
Method: FetchMethodWget,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
@@ -599,18 +618,28 @@ func (f *Fetcher) fetchNative(ctx context.Context, url string, opts FetchOptions
}
defer resp.Body.Close()
// Read body with limit
body, err := io.ReadAll(io.LimitReader(resp.Body, int64(opts.MaxLength)))
// Read body with limit + 1 byte to detect truncation
body, err := io.ReadAll(io.LimitReader(resp.Body, int64(opts.MaxLength)+1))
if err != nil {
return nil, fmt.Errorf("failed to read response: %w", err)
}
var truncated bool
var originalSize int
if len(body) > opts.MaxLength {
originalSize = len(body) // Note: this is just maxLength+1, not true original
body = body[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: string(body),
ContentType: resp.Header.Get("Content-Type"),
FinalURL: resp.Request.URL.String(),
StatusCode: resp.StatusCode,
Method: FetchMethodNative,
Content: string(body),
ContentType: resp.Header.Get("Content-Type"),
FinalURL: resp.Request.URL.String(),
StatusCode: resp.StatusCode,
Method: FetchMethodNative,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}

View File

@@ -12,6 +12,7 @@ import (
// URLFetchRequest is the JSON body accepted by the URL-fetch proxy endpoint.
// Only URL is required; MaxLength and Timeout fall back to handler defaults
// when zero or out of range (per the handler: maxLength capped at 2MB,
// timeout defaulting to 30s and capped at 120s).
type URLFetchRequest struct {
URL string `json:"url" binding:"required"` // target URL to fetch (required)
MaxLength int `json:"maxLength"` // max content length in bytes; 0 means use default
Timeout int `json:"timeout"` // Timeout in seconds
}
// URLFetchProxyHandler returns a handler that fetches URLs for the frontend
@@ -42,10 +43,16 @@ func URLFetchProxyHandler() gin.HandlerFunc {
// Set up fetch options
opts := DefaultFetchOptions()
opts.Timeout = 30 * time.Second
// Set max length (default 500KB)
if req.MaxLength > 0 && req.MaxLength <= 500000 {
// Set timeout (default 30s, max 120s)
if req.Timeout > 0 && req.Timeout <= 120 {
opts.Timeout = time.Duration(req.Timeout) * time.Second
} else {
opts.Timeout = 30 * time.Second
}
// Set max length (default 500KB, max 2MB)
if req.MaxLength > 0 && req.MaxLength <= 2000000 {
opts.MaxLength = req.MaxLength
}
@@ -66,13 +73,22 @@ func URLFetchProxyHandler() gin.HandlerFunc {
}
// Return the content
c.JSON(http.StatusOK, gin.H{
response := gin.H{
"content": result.Content,
"contentType": result.ContentType,
"url": result.FinalURL,
"status": result.StatusCode,
"fetchMethod": string(result.Method),
})
}
// Include truncation info if content was truncated
if result.Truncated {
response["truncated"] = true
response["originalSize"] = result.OriginalSize
response["returnedSize"] = len(result.Content)
}
c.JSON(http.StatusOK, response)
}
}

View File

@@ -291,13 +291,14 @@ interface FetchUrlArgs {
url: string;
extract?: 'text' | 'title' | 'links' | 'all';
maxLength?: number;
timeout?: number;
}
const fetchUrlDefinition: ToolDefinition = {
type: 'function',
function: {
name: 'fetch_url',
description: 'Fetches and reads content from a specific URL. Use after web_search to read full content from a result URL, or when user provides a URL directly.',
description: 'Fetches and reads content from a URL. If content is truncated, you can retry with a larger maxLength. Use after web_search to read full content, or when user provides a URL directly.',
parameters: {
type: 'object',
properties: {
@@ -312,7 +313,11 @@ const fetchUrlDefinition: ToolDefinition = {
},
maxLength: {
type: 'number',
description: 'Max text length (default: 5000)'
description: 'Max content length in bytes. Start with 50000, increase to 200000 or 500000 if truncated. Max: 2000000'
},
timeout: {
type: 'number',
description: 'Request timeout in seconds (default: 30, max: 120). Increase for slow sites.'
}
},
required: ['url']
@@ -320,21 +325,35 @@ const fetchUrlDefinition: ToolDefinition = {
}
};
/**
 * Successful result from fetchViaProxy.
 *
 * The truncation fields mirror the backend proxy response and are only
 * populated when the server truncated the content (they are absent —
 * undefined — otherwise, since the proxy omits them for full responses).
 */
interface ProxyFetchResult {
html: string; // fetched content (HTML or text), possibly truncated
finalUrl: string; // URL after redirects, as reported by the proxy
truncated?: boolean; // true when the proxy cut the content at maxLength
originalSize?: number; // size in bytes before truncation, when known
returnedSize?: number; // size in bytes of the content actually returned
}
/**
* Try to fetch URL via backend proxy first (bypasses CORS), fall back to direct fetch
*/
async function fetchViaProxy(url: string, maxLength: number): Promise<{ html: string; finalUrl: string } | { error: string }> {
async function fetchViaProxy(url: string, maxLength: number, timeout: number): Promise<ProxyFetchResult | { error: string }> {
// Try backend proxy first
try {
const proxyResponse = await fetch('/api/v1/proxy/fetch', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ url, maxLength })
body: JSON.stringify({ url, maxLength, timeout })
});
if (proxyResponse.ok) {
const data = await proxyResponse.json();
return { html: data.content, finalUrl: data.url };
return {
html: data.content,
finalUrl: data.url,
truncated: data.truncated,
originalSize: data.originalSize,
returnedSize: data.returnedSize
};
}
// If proxy returns an error, extract it
@@ -380,7 +399,7 @@ async function fetchViaProxy(url: string, maxLength: number): Promise<{ html: st
}
const fetchUrlHandler: BuiltinToolHandler<FetchUrlArgs> = async (args) => {
const { url, extract = 'text', maxLength = 5000 } = args;
const { url, extract = 'text', maxLength = 50000, timeout = 30 } = args;
try {
const parsedUrl = new URL(url);
@@ -389,12 +408,12 @@ const fetchUrlHandler: BuiltinToolHandler<FetchUrlArgs> = async (args) => {
}
// Fetch via proxy or direct
const result = await fetchViaProxy(url, maxLength);
const result = await fetchViaProxy(url, maxLength, timeout);
if ('error' in result) {
return result;
}
const { html, finalUrl } = result;
const { html, finalUrl, truncated, originalSize, returnedSize } = result;
const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
const title = titleMatch ? titleMatch[1].trim() : null;
@@ -419,21 +438,34 @@ const fetchUrlHandler: BuiltinToolHandler<FetchUrlArgs> = async (args) => {
})).filter(link => link.url && !link.url.startsWith('#'));
if (extract === 'links') {
return links;
return truncated
? { links, warning: `Content was truncated (${returnedSize ?? maxLength} bytes). Some links may be missing.` }
: links;
}
const text = stripHtml(html).substring(0, maxLength);
// Build response with truncation info
const buildResponse = (data: unknown) => {
if (!truncated) return data;
const suggestedSize = originalSize ? Math.min(originalSize * 2, 2000000) : maxLength * 2;
return {
...(typeof data === 'object' ? data : { content: data }),
_truncated: true,
_hint: `Content truncated to ${returnedSize ?? maxLength} bytes. ${originalSize ? `Original was ${originalSize} bytes. ` : ''}Retry with larger maxLength (e.g., ${suggestedSize}) to get full content.`
};
};
if (extract === 'text') {
return text;
return buildResponse(text);
}
return {
return buildResponse({
title,
text,
links: links.slice(0, 20),
url: finalUrl
};
});
} catch (error) {
return { error: `Failed to fetch URL: ${error instanceof Error ? error.message : 'Unknown error'}` };
}