- Add "Processing X files..." indicator in chat while handling attachments - Indicator transitions to "Analyzing X files..." for large files needing LLM summarization - Reuse streaming message for seamless transition to LLM response - Add FileAnalyzer service for large file summarization with 10s timeout - Skip analysis for borderline files (within 20% of 8K threshold) - Read up to 50KB from original file for analysis (not just truncated content) - Remove base64 blobs from JSON before analysis to reduce prompt size - Add AttachmentDisplay component for showing file badges on messages - Persist attachments to IndexedDB with message references - Add chat state methods: setStreamContent, removeMessage - Clean up debug logging
/**
 * File processor utility
 * Handles reading, processing, and extracting content from files
 * Supports images, text files, and PDFs
 */

import type {
  AttachmentType,
  FileAttachment,
  ProcessFileOutcome
} from '$lib/types/attachment.js';
import {
  isImageMimeType,
  isTextMimeType,
  isPdfMimeType,
  isTextExtension,
  MAX_IMAGE_SIZE,
  MAX_TEXT_SIZE,
  MAX_PDF_SIZE,
  MAX_IMAGE_DIMENSION,
  MAX_EXTRACTED_CONTENT
} from '$lib/types/attachment.js';

// ============================================================================
// File Type Detection
// ============================================================================

/**
 * Detect the attachment type for a file
 * @returns The attachment type or null if unsupported
 */
export function detectFileType(file: File): AttachmentType | null {
  const mimeType = file.type.toLowerCase();

  if (isImageMimeType(mimeType)) {
    return 'image';
  }

  if (isPdfMimeType(mimeType)) {
    return 'pdf';
  }

  if (isTextMimeType(mimeType)) {
    return 'text';
  }

  // Check by extension as fallback
  if (isTextExtension(file.name)) {
    return 'text';
  }

  return null;
}

// ============================================================================
// Content Truncation
// ============================================================================

/**
 * Result of content truncation
 */
interface TruncateResult {
  content: string;
  truncated: boolean;
  originalLength: number;
}

/**
 * Truncate content to maximum allowed length
 * Tries to truncate at a natural boundary (newline or space)
 */
function truncateContent(content: string, maxLength: number = MAX_EXTRACTED_CONTENT): TruncateResult {
  const originalLength = content.length;

  if (originalLength <= maxLength) {
    return { content, truncated: false, originalLength };
  }

  // Try to find a natural break point (newline or space) near the limit
  let cutPoint = maxLength;
  const searchStart = Math.max(0, maxLength - 500);

  // Look for last newline before cutoff
  const lastNewline = content.lastIndexOf('\n', maxLength);
  if (lastNewline > searchStart) {
    cutPoint = lastNewline;
  } else {
    // Look for last space
    const lastSpace = content.lastIndexOf(' ', maxLength);
    if (lastSpace > searchStart) {
      cutPoint = lastSpace;
    }
  }

  const truncatedContent = content.slice(0, cutPoint) +
    `\n\n[... content truncated: ${formatFileSize(originalLength)} total, showing first ${formatFileSize(cutPoint)} ...]`;

  return {
    content: truncatedContent,
    truncated: true,
    originalLength
  };
}

// ============================================================================
// Text File Processing
// ============================================================================

/**
 * Read a text file and return its content with truncation info
 */
export async function readTextFile(file: File): Promise<TruncateResult> {
  if (file.size > MAX_TEXT_SIZE) {
    throw new Error(`File too large. Maximum size is ${MAX_TEXT_SIZE / 1024 / 1024}MB`);
  }

  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => {
      const rawContent = reader.result as string;
      resolve(truncateContent(rawContent));
    };
    reader.onerror = () => reject(new Error('Failed to read file'));
    reader.readAsText(file);
  });
}

// ============================================================================
// Image Processing
// ============================================================================

/**
 * Process an image file: resize if needed, compress, and return base64
 */
export async function processImage(file: File): Promise<{ base64: string; previewUrl: string }> {
  if (file.size > MAX_IMAGE_SIZE * 5) {
    // Allow larger initial size, we'll compress
    throw new Error(`Image too large. Maximum size is ${(MAX_IMAGE_SIZE * 5) / 1024 / 1024}MB`);
  }

  return new Promise((resolve, reject) => {
    const img = new Image();
    const objectUrl = URL.createObjectURL(file);

    img.onload = () => {
      URL.revokeObjectURL(objectUrl);

      // Calculate new dimensions
      let { width, height } = img;
      if (width > MAX_IMAGE_DIMENSION || height > MAX_IMAGE_DIMENSION) {
        const ratio = Math.min(MAX_IMAGE_DIMENSION / width, MAX_IMAGE_DIMENSION / height);
        width = Math.round(width * ratio);
        height = Math.round(height * ratio);
      }

      // Draw to canvas and compress
      const canvas = document.createElement('canvas');
      canvas.width = width;
      canvas.height = height;

      const ctx = canvas.getContext('2d');
      if (!ctx) {
        reject(new Error('Failed to create canvas context'));
        return;
      }

      ctx.drawImage(img, 0, 0, width, height);

      // Get as JPEG for compression (better than PNG for most cases)
      const quality = 0.85;
      const dataUrl = canvas.toDataURL('image/jpeg', quality);

      // Extract base64 without the data: prefix (Ollama requirement)
      const base64 = dataUrl.replace(/^data:image\/\w+;base64,/, '');

      resolve({
        base64,
        previewUrl: dataUrl
      });
    };

    img.onerror = () => {
      URL.revokeObjectURL(objectUrl);
      reject(new Error('Failed to load image'));
    };

    img.src = objectUrl;
  });
}

// ============================================================================
// PDF Processing
// ============================================================================

// PDF.js will be loaded dynamically when needed
let pdfjsLib: typeof import('pdfjs-dist') | null = null;

/**
 * Load PDF.js library dynamically
 */
async function loadPdfJs(): Promise<typeof import('pdfjs-dist')> {
  if (pdfjsLib) return pdfjsLib;

  try {
    pdfjsLib = await import('pdfjs-dist');

    // Use locally bundled worker (copied to static/ during build)
    // Falls back to CDN if local worker isn't available
    const localWorkerPath = '/pdf.worker.min.mjs';
    const cdnWorkerPath = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjsLib.version}/pdf.worker.min.mjs`;

    // Try local first, with CDN fallback
    try {
      const response = await fetch(localWorkerPath, { method: 'HEAD' });
      pdfjsLib.GlobalWorkerOptions.workerSrc = response.ok ? localWorkerPath : cdnWorkerPath;
    } catch {
      pdfjsLib.GlobalWorkerOptions.workerSrc = cdnWorkerPath;
    }

    return pdfjsLib;
  } catch (error) {
    throw new Error('PDF.js library not available. Install with: npm install pdfjs-dist');
  }
}

/**
 * Extract text content from a PDF file with error handling and content limits
 */
export async function extractPdfText(file: File): Promise<TruncateResult> {
  if (file.size > MAX_PDF_SIZE) {
    throw new Error(`PDF too large. Maximum size is ${MAX_PDF_SIZE / 1024 / 1024}MB`);
  }

  const pdfjs = await loadPdfJs();

  const arrayBuffer = await file.arrayBuffer();
  const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;

  const textParts: string[] = [];
  let totalChars = 0;
  let stoppedEarly = false;
  const failedPages: number[] = [];

  for (let i = 1; i <= pdf.numPages; i++) {
    // Stop if we've already collected enough content
    if (totalChars >= MAX_EXTRACTED_CONTENT) {
      stoppedEarly = true;
      break;
    }

    try {
      const page = await pdf.getPage(i);
      const textContent = await page.getTextContent();

      // Null check for textContent.items
      if (!textContent?.items) {
        console.warn(`PDF page ${i}: No text content items`);
        failedPages.push(i);
        continue;
      }

      const pageText = textContent.items
        .filter((item): item is import('pdfjs-dist/types/src/display/api').TextItem =>
          'str' in item && typeof item.str === 'string'
        )
        .map((item) => item.str)
        .join(' ')
        .trim();

      if (pageText) {
        textParts.push(pageText);
        totalChars += pageText.length;
      }
    } catch (pageError) {
      console.warn(`PDF page ${i} extraction failed:`, pageError);
      failedPages.push(i);
      // Continue with other pages instead of failing entirely
    }
  }

  let rawContent = textParts.join('\n\n');

  // Add metadata about extraction issues
  const metadata: string[] = [];
  if (failedPages.length > 0) {
    metadata.push(`[Note: Failed to extract pages: ${failedPages.join(', ')}]`);
  }
  if (stoppedEarly) {
    metadata.push(`[Note: Extraction stopped at page ${textParts.length} of ${pdf.numPages} due to content limit]`);
  }

  if (metadata.length > 0) {
    rawContent = metadata.join('\n') + '\n\n' + rawContent;
  }

  return truncateContent(rawContent);
}

// ============================================================================
// Main Processing Function
// ============================================================================

/**
 * Process a file and create an attachment
 * Handles all file types (image, text, PDF)
 */
export async function processFile(file: File): Promise<ProcessFileOutcome> {
  const type = detectFileType(file);

  if (!type) {
    return {
      success: false,
      error: `Unsupported file type: ${file.type || 'unknown'}`
    };
  }

  const id = crypto.randomUUID();

  try {
    const baseAttachment: FileAttachment = {
      id,
      type,
      filename: file.name,
      mimeType: file.type,
      size: file.size
    };

    switch (type) {
      case 'image': {
        const { base64, previewUrl } = await processImage(file);
        return {
          success: true,
          attachment: {
            ...baseAttachment,
            base64Data: base64,
            previewUrl,
            originalFile: file
          }
        };
      }

      case 'text': {
        const result = await readTextFile(file);
        return {
          success: true,
          attachment: {
            ...baseAttachment,
            textContent: result.content,
            truncated: result.truncated,
            originalLength: result.originalLength,
            originalFile: file
          }
        };
      }

      case 'pdf': {
        const result = await extractPdfText(file);
        return {
          success: true,
          attachment: {
            ...baseAttachment,
            textContent: result.content,
            truncated: result.truncated,
            originalLength: result.originalLength,
            originalFile: file
          }
        };
      }

      default:
        return {
          success: false,
          error: `Unsupported file type: ${type}`
        };
    }
  } catch (error) {
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error processing file'
    };
  }
}

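// Illustrative usage only (not part of this module): wiring processFile into a
// file-input change handler. The handler name and the chat-store hand-off are
// hypothetical placeholders; only processFile and its outcome shape come from this file.
//
//   async function handleFileInput(event: Event) {
//     const input = event.currentTarget as HTMLInputElement;
//     for (const file of input.files ?? []) {
//       const outcome = await processFile(file);
//       if (outcome.success) {
//         // e.g. hand outcome.attachment to the chat state / IndexedDB persistence layer
//         console.log('attached', outcome.attachment.filename);
//       } else {
//         console.warn('skipped', file.name, outcome.error);
//       }
//     }
//   }
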
// ============================================================================
// Utility Functions
// ============================================================================

/**
 * Format file size for display
 */
export function formatFileSize(bytes: number): string {
  if (bytes < 1024) {
    return `${bytes} B`;
  }
  if (bytes < 1024 * 1024) {
    return `${(bytes / 1024).toFixed(1)} KB`;
  }
  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
}

/**
 * Get a file icon based on type
 */
export function getFileIcon(type: AttachmentType): string {
  switch (type) {
    case 'image':
      return '🖼️';
    case 'pdf':
      return '📄';
    case 'text':
      return '📝';
    default:
      return '📎';
  }
}

/**
 * Format attachment content for inclusion in message
 * Uses XML-style tags for cleaner parsing by LLMs
 */
export function formatAttachmentsForMessage(attachments: FileAttachment[]): string {
  return attachments
    .filter((a) => a.textContent)
    .map((a) => {
      const truncatedAttr = a.truncated ? ' truncated="true"' : '';
      const sizeAttr = ` size="${formatFileSize(a.size)}"`;
      return `<file name="${escapeXmlAttr(a.filename)}"${sizeAttr}${truncatedAttr}>\n${a.textContent}\n</file>`;
    })
    .join('\n\n');
}

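// For reference, formatAttachmentsForMessage emits blocks shaped like the following
// (the filename and size are made-up examples; the tag and attributes match the code above):
//
//   <file name="notes.txt" size="12.4 KB" truncated="true">
//   ...extracted text content...
//   </file>
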
/**
 * Escape special characters for XML attribute values
 */
function escapeXmlAttr(str: string): string {
  return str
    .replace(/&/g, '&amp;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}