Files
vessel/frontend/src/lib/utils/file-processor.ts
vikingowl 26b58fbd50 feat: improve file attachment handling with processing indicator
- Add "Processing X files..." indicator in chat while handling attachments
- Indicator transitions to "Analyzing X files..." for large files needing LLM summarization
- Reuse streaming message for seamless transition to LLM response
- Add FileAnalyzer service for large file summarization with 10s timeout
- Skip analysis for borderline files (within 20% of 8K threshold)
- Read up to 50KB from original file for analysis (not just truncated content)
- Remove base64 blobs from JSON before analysis to reduce prompt size
- Add AttachmentDisplay component for showing file badges on messages
- Persist attachments to IndexedDB with message references
- Add chat state methods: setStreamContent, removeMessage
- Clean up debug logging
2026-01-04 00:35:33 +01:00

443 lines
11 KiB
TypeScript

/**
* File processor utility
* Handles reading, processing, and extracting content from files
* Supports images, text files, and PDFs
*/
import type {
AttachmentType,
FileAttachment,
ProcessFileOutcome
} from '$lib/types/attachment.js';
import {
isImageMimeType,
isTextMimeType,
isPdfMimeType,
isTextExtension,
MAX_IMAGE_SIZE,
MAX_TEXT_SIZE,
MAX_PDF_SIZE,
MAX_IMAGE_DIMENSION,
MAX_EXTRACTED_CONTENT
} from '$lib/types/attachment.js';
// ============================================================================
// File Type Detection
// ============================================================================
/**
 * Classify a file as an image, PDF, or text attachment.
 *
 * The lowercased MIME type is checked first (image, then PDF, then text).
 * Only when none of those match is the filename consulted, and only to
 * recognise text files.
 *
 * @param file - The file to classify
 * @returns The attachment type, or null if the file is unsupported
 */
export function detectFileType(file: File): AttachmentType | null {
	const mime = file.type.toLowerCase();

	if (isImageMimeType(mime)) return 'image';
	if (isPdfMimeType(mime)) return 'pdf';

	// Text is recognised by MIME type first; the filename check is the fallback
	const looksLikeText = isTextMimeType(mime) || isTextExtension(file.name);
	return looksLikeText ? 'text' : null;
}
// ============================================================================
// Content Truncation
// ============================================================================
/**
 * Outcome of limiting a piece of text to a maximum length
 */
interface TruncateResult {
	/** The text to use, shortened if it exceeded the limit */
	content: string;
	/** Length of the text before any shortening (string length) */
	originalLength: number;
	/** True when the text was shortened */
	truncated: boolean;
}
/**
 * Truncate content to a maximum length, preferring a natural boundary.
 *
 * If the content fits, it is returned unchanged. Otherwise the cut is placed at the
 * last newline within the final 500 characters before the limit, else at the last
 * space in that window, else exactly at the limit. A hard cut never splits a UTF-16
 * surrogate pair, so the result does not end in a broken character. A notice stating
 * the total and shown sizes is appended to truncated content.
 *
 * Lengths are string lengths (UTF-16 code units); the notice renders them with
 * formatFileSize.
 *
 * @param content - The text to limit
 * @param maxLength - Maximum number of characters to keep (defaults to MAX_EXTRACTED_CONTENT)
 * @returns The (possibly shortened) content, a truncated flag, and the original length
 */
function truncateContent(content: string, maxLength: number = MAX_EXTRACTED_CONTENT): TruncateResult {
	const originalLength = content.length;
	if (originalLength <= maxLength) {
		return { content, truncated: false, originalLength };
	}

	// Only accept a natural break that lies within the last 500 characters before the limit
	const searchStart = Math.max(0, maxLength - 500);
	let cutPoint = maxLength;

	const lastNewline = content.lastIndexOf('\n', maxLength);
	if (lastNewline > searchStart) {
		cutPoint = lastNewline;
	} else {
		const lastSpace = content.lastIndexOf(' ', maxLength);
		if (lastSpace > searchStart) {
			cutPoint = lastSpace;
		}
	}

	// A hard cut may land between the two halves of a surrogate pair (e.g. an emoji).
	// Back up one code unit so the kept text does not end with a lone high surrogate.
	const unitBefore = content.charCodeAt(cutPoint - 1);
	const unitAfter = content.charCodeAt(cutPoint);
	const splitsSurrogatePair =
		unitBefore >= 0xd800 && unitBefore <= 0xdbff && unitAfter >= 0xdc00 && unitAfter <= 0xdfff;
	if (splitsSurrogatePair) {
		cutPoint -= 1;
	}

	const truncatedContent = content.slice(0, cutPoint) +
		`\n\n[... content truncated: ${formatFileSize(originalLength)} total, showing first ${formatFileSize(cutPoint)} ...]`;

	return {
		content: truncatedContent,
		truncated: true,
		originalLength
	};
}
// ============================================================================
// Text File Processing
// ============================================================================
/**
 * Read a file as text and limit its length.
 *
 * @param file - The text file to read
 * @returns The file's text (possibly truncated) together with truncation info
 * @throws Error if the file is larger than MAX_TEXT_SIZE or cannot be read
 */
export async function readTextFile(file: File): Promise<TruncateResult> {
	const tooLarge = file.size > MAX_TEXT_SIZE;
	if (tooLarge) {
		throw new Error(`File too large. Maximum size is ${MAX_TEXT_SIZE / 1024 / 1024}MB`);
	}

	// FileReader is callback-based, so bridge it into a promise for the raw text
	const rawContent = await new Promise<string>((resolve, reject) => {
		const reader = new FileReader();
		reader.onerror = () => reject(new Error('Failed to read file'));
		reader.onload = () => resolve(reader.result as string);
		reader.readAsText(file);
	});

	return truncateContent(rawContent);
}
// ============================================================================
// Image Processing
// ============================================================================
/**
 * Process an image file: downscale if needed, re-encode as JPEG, and return base64.
 *
 * The image is decoded via an object URL, scaled so that neither side exceeds
 * MAX_IMAGE_DIMENSION (aspect ratio preserved), drawn onto a white background and
 * exported as a JPEG data URL at quality 0.85.
 *
 * @param file - The image file to process
 * @returns `base64`: the JPEG payload without the `data:` prefix;
 *          `previewUrl`: the full data URL
 * @throws Error if the file exceeds five times MAX_IMAGE_SIZE, cannot be decoded,
 *         has no usable dimensions, or a 2D canvas context is unavailable
 */
export async function processImage(file: File): Promise<{ base64: string; previewUrl: string }> {
	// Allow a larger initial size than MAX_IMAGE_SIZE because the image is compressed below
	const maxInputSize = MAX_IMAGE_SIZE * 5;
	if (file.size > maxInputSize) {
		throw new Error(`Image too large. Maximum size is ${maxInputSize / 1024 / 1024}MB`);
	}

	return new Promise((resolve, reject) => {
		const img = new Image();
		const objectUrl = URL.createObjectURL(file);

		img.onload = () => {
			// The image is decoded at this point, so the temporary URL can be released
			URL.revokeObjectURL(objectUrl);

			let { width, height } = img;

			// A zero-sized canvas serialises to the bogus URL "data:,", so fail clearly instead
			if (width <= 0 || height <= 0) {
				reject(new Error('Image has no usable dimensions'));
				return;
			}

			if (width > MAX_IMAGE_DIMENSION || height > MAX_IMAGE_DIMENSION) {
				const ratio = Math.min(MAX_IMAGE_DIMENSION / width, MAX_IMAGE_DIMENSION / height);
				// Clamp to 1px so extreme aspect ratios cannot round a side down to 0
				width = Math.max(1, Math.round(width * ratio));
				height = Math.max(1, Math.round(height * ratio));
			}

			const canvas = document.createElement('canvas');
			canvas.width = width;
			canvas.height = height;

			const ctx = canvas.getContext('2d');
			if (!ctx) {
				reject(new Error('Failed to create canvas context'));
				return;
			}

			// JPEG has no alpha channel: transparent pixels would otherwise be exported
			// as black, so paint a white background before drawing the image
			ctx.fillStyle = '#ffffff';
			ctx.fillRect(0, 0, width, height);
			ctx.drawImage(img, 0, 0, width, height);

			// Get as JPEG for compression (better than PNG for most cases)
			const quality = 0.85;
			const dataUrl = canvas.toDataURL('image/jpeg', quality);

			// Extract base64 without the data: prefix (Ollama requirement)
			const base64 = dataUrl.replace(/^data:image\/\w+;base64,/, '');

			resolve({
				base64,
				previewUrl: dataUrl
			});
		};

		img.onerror = () => {
			URL.revokeObjectURL(objectUrl);
			reject(new Error('Failed to load image'));
		};

		img.src = objectUrl;
	});
}
// ============================================================================
// PDF Processing
// ============================================================================
// PDF.js is loaded dynamically when needed. This cache is only filled once the
// library's worker has been configured, so a non-null value is always ready to use.
let pdfjsLib: typeof import('pdfjs-dist') | null = null;

/**
 * Load PDF.js dynamically and configure its worker.
 *
 * The worker script is taken from the locally served `/pdf.worker.min.mjs` when a
 * HEAD request for it succeeds, otherwise from the cdnjs build matching the loaded
 * library version. The configured library is cached for later calls.
 *
 * The cache is assigned only after `workerSrc` has been set. A call that starts while
 * an earlier one is still probing for the worker therefore repeats the setup instead
 * of receiving a library without a worker source.
 *
 * @returns The configured PDF.js module
 * @throws Error if PDF.js cannot be loaded or configured
 */
async function loadPdfJs(): Promise<typeof import('pdfjs-dist')> {
	if (pdfjsLib) return pdfjsLib;

	try {
		const lib = await import('pdfjs-dist');

		// Use locally bundled worker (copied to static/ during build)
		// Falls back to CDN if local worker isn't available
		const localWorkerPath = '/pdf.worker.min.mjs';
		const cdnWorkerPath = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${lib.version}/pdf.worker.min.mjs`;

		let workerSrc = cdnWorkerPath;
		try {
			const response = await fetch(localWorkerPath, { method: 'HEAD' });
			if (response.ok) {
				workerSrc = localWorkerPath;
			}
		} catch {
			// The probe itself failed (e.g. network error); keep the CDN worker
		}
		lib.GlobalWorkerOptions.workerSrc = workerSrc;

		// Publish only now that the library is fully configured
		pdfjsLib = lib;
		return lib;
	} catch {
		throw new Error('PDF.js library not available. Install with: npm install pdfjs-dist');
	}
}
/**
 * Extract text from a PDF page by page, with per-page error handling and a content limit.
 *
 * Pages are read in order until the collected text reaches MAX_EXTRACTED_CONTENT.
 * A page that throws, or that yields no text items, is recorded and skipped instead of
 * aborting the whole extraction. Notes about skipped pages and early stopping are
 * prepended to the text, and the combined result is passed through truncateContent.
 * The PDF document is destroyed once extraction finishes, whether or not it succeeded.
 *
 * @param file - The PDF file to read
 * @returns The extracted (possibly truncated) text together with truncation info
 * @throws Error if the file exceeds MAX_PDF_SIZE; failures from loading PDF.js or
 *         opening the document propagate to the caller
 */
export async function extractPdfText(file: File): Promise<TruncateResult> {
	if (file.size > MAX_PDF_SIZE) {
		throw new Error(`PDF too large. Maximum size is ${MAX_PDF_SIZE / 1024 / 1024}MB`);
	}

	const pdfjs = await loadPdfJs();
	const arrayBuffer = await file.arrayBuffer();
	const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;

	try {
		const textParts: string[] = [];
		const failedPages: number[] = [];
		let totalChars = 0;
		// Number of the last page visited before the content limit was hit;
		// stays null when every page was visited
		let stoppedAfterPage: number | null = null;

		for (let i = 1; i <= pdf.numPages; i++) {
			// Stop if we've already collected enough content
			if (totalChars >= MAX_EXTRACTED_CONTENT) {
				// Use the real page number: counting collected text parts would be
				// wrong whenever earlier pages failed or contained no text
				stoppedAfterPage = i - 1;
				break;
			}

			try {
				const page = await pdf.getPage(i);
				const textContent = await page.getTextContent();

				// Null check for textContent.items
				if (!textContent?.items) {
					console.warn(`PDF page ${i}: No text content items`);
					failedPages.push(i);
					continue;
				}

				// Keep only items that carry a string, then join them into one line of text
				const pageText = textContent.items
					.filter((item): item is import('pdfjs-dist/types/src/display/api').TextItem =>
						'str' in item && typeof item.str === 'string'
					)
					.map((item) => item.str)
					.join(' ')
					.trim();

				if (pageText) {
					textParts.push(pageText);
					totalChars += pageText.length;
				}
			} catch (pageError) {
				console.warn(`PDF page ${i} extraction failed:`, pageError);
				failedPages.push(i);
				// Continue with other pages instead of failing entirely
			}
		}

		let rawContent = textParts.join('\n\n');

		// Add metadata about extraction issues
		const metadata: string[] = [];
		if (failedPages.length > 0) {
			metadata.push(`[Note: Failed to extract pages: ${failedPages.join(', ')}]`);
		}
		if (stoppedAfterPage !== null) {
			metadata.push(`[Note: Extraction stopped at page ${stoppedAfterPage} of ${pdf.numPages} due to content limit]`);
		}
		if (metadata.length > 0) {
			rawContent = metadata.join('\n') + '\n\n' + rawContent;
		}

		return truncateContent(rawContent);
	} finally {
		// Release the document's resources; a cleanup failure must not
		// replace the extraction result or the original error
		try {
			await pdf.destroy();
		} catch (destroyError) {
			console.warn('PDF cleanup failed:', destroyError);
		}
	}
}
// ============================================================================
// Main Processing Function
// ============================================================================
/**
 * Generate a unique id for an attachment.
 *
 * crypto.randomUUID() is only exposed in secure contexts (HTTPS or localhost), so when
 * it is missing a version 4 UUID is built from crypto.getRandomValues() instead.
 */
function createAttachmentId(): string {
	if (typeof crypto.randomUUID === 'function') {
		return crypto.randomUUID();
	}

	const randomBytes = crypto.getRandomValues(new Uint8Array(16));
	const hex = Array.from(randomBytes, (byte, index) => {
		let value = byte;
		if (index === 6) {
			value = (byte & 0x0f) | 0x40; // version 4
		} else if (index === 8) {
			value = (byte & 0x3f) | 0x80; // RFC 4122 variant
		}
		return value.toString(16).padStart(2, '0');
	}).join('');

	return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-${hex.slice(12, 16)}-${hex.slice(16, 20)}-${hex.slice(20)}`;
}

/**
 * Process a file and create an attachment.
 * Handles all supported file types (image, text, PDF).
 *
 * Failures are reported through the returned outcome rather than thrown: an
 * unsupported file type, or any error raised while reading or converting the file
 * (including id generation), yields `{ success: false, error }`.
 *
 * @param file - The file selected by the user
 * @returns On success, the attachment (with image data or extracted text and a
 *          reference to the original file); otherwise an error message
 */
export async function processFile(file: File): Promise<ProcessFileOutcome> {
	const type = detectFileType(file);
	if (!type) {
		return {
			success: false,
			error: `Unsupported file type: ${file.type || 'unknown'}`
		};
	}

	try {
		// Fields shared by every attachment kind
		const baseAttachment: FileAttachment = {
			id: createAttachmentId(),
			type,
			filename: file.name,
			mimeType: file.type,
			size: file.size
		};

		switch (type) {
			case 'image': {
				const { base64, previewUrl } = await processImage(file);
				return {
					success: true,
					attachment: {
						...baseAttachment,
						base64Data: base64,
						previewUrl,
						originalFile: file
					}
				};
			}
			// Text files and PDFs both end up as extracted text with truncation info
			case 'text':
			case 'pdf': {
				const result = type === 'text' ? await readTextFile(file) : await extractPdfText(file);
				return {
					success: true,
					attachment: {
						...baseAttachment,
						textContent: result.content,
						truncated: result.truncated,
						originalLength: result.originalLength,
						originalFile: file
					}
				};
			}
			default:
				return {
					success: false,
					error: `Unsupported file type: ${type}`
				};
		}
	} catch (error) {
		return {
			success: false,
			error: error instanceof Error ? error.message : 'Unknown error processing file'
		};
	}
}
// ============================================================================
// Utility Functions
// ============================================================================
/**
 * Render a byte count as a short human-readable string.
 *
 * Values below 1024 are shown as whole bytes ("512 B"), values below 1024 * 1024
 * as kilobytes with one decimal ("1.5 KB"), and everything else as megabytes
 * with one decimal ("2.0 MB").
 *
 * @param bytes - The size in bytes
 * @returns The formatted size
 */
export function formatFileSize(bytes: number): string {
	const KIB = 1024;
	const MIB = 1024 * 1024;

	if (bytes < KIB) {
		return `${bytes} B`;
	}

	const inKilobytes = bytes < MIB;
	const scaled = inKilobytes ? bytes / KIB : bytes / MIB;
	const unit = inKilobytes ? 'KB' : 'MB';
	return `${scaled.toFixed(1)} ${unit}`;
}
/**
 * Look up the emoji icon for an attachment type.
 *
 * @param type - The attachment type
 * @returns The icon for image, pdf, or text; a paperclip for anything else
 */
export function getFileIcon(type: AttachmentType): string {
	const iconsByType = new Map<string, string>([
		['image', '🖼️'],
		['pdf', '📄'],
		['text', '📝']
	]);

	return iconsByType.get(type) ?? '📎';
}
/**
 * Render the text-bearing attachments as XML-style blocks for inclusion in a message.
 *
 * Each attachment with non-empty text content becomes
 * `<file name="…" size="…">` … `</file>`, with `truncated="true"` added when the
 * attachment is flagged as truncated. Attachments without text content are skipped.
 * Blocks are separated by a blank line.
 *
 * @param attachments - The attachments to render
 * @returns The concatenated blocks, or an empty string if none have text content
 */
export function formatAttachmentsForMessage(attachments: FileAttachment[]): string {
	const blocks: string[] = [];

	for (const attachment of attachments) {
		if (!attachment.textContent) {
			continue;
		}

		const name = escapeXmlAttr(attachment.filename);
		const size = formatFileSize(attachment.size);
		const truncatedAttr = attachment.truncated ? ' truncated="true"' : '';

		blocks.push(`<file name="${name}" size="${size}"${truncatedAttr}>\n${attachment.textContent}\n</file>`);
	}

	return blocks.join('\n\n');
}
/**
 * Make a string safe to place inside an XML attribute value.
 *
 * Replaces `&`, `"`, `'`, `<` and `>` with their named entities.
 *
 * @param str - The raw attribute value
 * @returns The escaped value
 */
function escapeXmlAttr(str: string): string {
	const entities: Record<string, string> = {
		'&': '&amp;',
		'"': '&quot;',
		"'": '&apos;',
		'<': '&lt;',
		'>': '&gt;'
	};

	// One pass over the input: each special character maps directly to its entity
	return str.replace(/[&"'<>]/g, (ch) => entities[ch] ?? ch);
}