/**
 * File processor utility
 *
 * Reads files selected by the user and turns them into attachments:
 * - images are downscaled and re-encoded as JPEG (base64),
 * - text files are read as text,
 * - PDFs have their text extracted with PDF.js.
 *
 * Extracted text is capped at MAX_EXTRACTED_CONTENT characters.
 */
import type { AttachmentType, FileAttachment, ProcessFileOutcome } from '$lib/types/attachment.js';
import {
  isImageMimeType,
  isTextMimeType,
  isPdfMimeType,
  isTextExtension,
  MAX_IMAGE_SIZE,
  MAX_TEXT_SIZE,
  MAX_PDF_SIZE,
  MAX_IMAGE_DIMENSION,
  MAX_EXTRACTED_CONTENT
} from '$lib/types/attachment.js';

// ============================================================================
// File Type Detection
// ============================================================================

/**
 * Detect the attachment type for a file.
 *
 * The MIME type is checked first (image, then PDF, then text); the file name
 * extension is only consulted as a fallback for text files.
 *
 * @param file - The file to classify
 * @returns The attachment type, or null if the file is unsupported
 */
export function detectFileType(file: File): AttachmentType | null {
  const mimeType = file.type.toLowerCase();

  if (isImageMimeType(mimeType)) {
    return 'image';
  }
  if (isPdfMimeType(mimeType)) {
    return 'pdf';
  }
  if (isTextMimeType(mimeType)) {
    return 'text';
  }

  // Fallback: files with an empty or generic MIME type may still be text
  if (isTextExtension(file.name)) {
    return 'text';
  }

  return null;
}

// ============================================================================
// Content Truncation
// ============================================================================

/**
 * Result of content truncation
 */
interface TruncateResult {
  /** The (possibly truncated) content, including the truncation notice */
  content: string;
  /** True when the input exceeded the limit and was cut */
  truncated: boolean;
  /** Length of the input in characters, before truncation */
  originalLength: number;
}

/**
 * Truncate content to a maximum length.
 *
 * Prefers to cut at the last newline, or failing that the last space, found
 * within the final 500 characters before the limit; otherwise cuts exactly at
 * the limit. A notice describing the truncation is appended to the result.
 *
 * @param content - Text to truncate
 * @param maxLength - Maximum number of characters to keep
 * @returns The content plus truncation metadata
 */
function truncateContent(
  content: string,
  maxLength: number = MAX_EXTRACTED_CONTENT
): TruncateResult {
  const originalLength = content.length;

  if (originalLength <= maxLength) {
    return { content, truncated: false, originalLength };
  }

  // Try to find a natural break point (newline or space) near the limit
  let cutPoint = maxLength;
  const searchStart = Math.max(0, maxLength - 500);

  const lastNewline = content.lastIndexOf('\n', maxLength);
  if (lastNewline > searchStart) {
    cutPoint = lastNewline;
  } else {
    const lastSpace = content.lastIndexOf(' ', maxLength);
    if (lastSpace > searchStart) {
      cutPoint = lastSpace;
    }
  }

  // Note: the sizes passed to formatFileSize here are character counts
  const truncatedContent =
    content.slice(0, cutPoint) +
    `\n\n[... content truncated: ${formatFileSize(originalLength)} total, showing first ${formatFileSize(cutPoint)} ...]`;

  return { content: truncatedContent, truncated: true, originalLength };
}

// ============================================================================
// Text File Processing
// ============================================================================

/**
 * Read a text file and return its content with truncation info.
 *
 * @param file - The text file to read
 * @returns The file content, truncated to MAX_EXTRACTED_CONTENT if necessary
 * @throws Error if the file exceeds MAX_TEXT_SIZE or cannot be read
 */
export async function readTextFile(file: File): Promise<TruncateResult> {
  if (file.size > MAX_TEXT_SIZE) {
    throw new Error(`File too large. Maximum size is ${MAX_TEXT_SIZE / 1024 / 1024}MB`);
  }

  // FileReader is callback-based, so it is wrapped in a promise here
  return new Promise<TruncateResult>((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      const rawContent = reader.result;
      // readAsText yields a string; guard instead of asserting the type
      if (typeof rawContent !== 'string') {
        reject(new Error('Failed to read file'));
        return;
      }
      resolve(truncateContent(rawContent));
    };
    reader.onerror = () => reject(new Error('Failed to read file'));

    reader.readAsText(file);
  });
}

// ============================================================================
// Image Processing
// ============================================================================

/**
 * Process an image file: resize if needed, compress, and return base64.
 *
 * The image is scaled down (preserving aspect ratio) so that neither side
 * exceeds MAX_IMAGE_DIMENSION, then re-encoded as JPEG at quality 0.85.
 *
 * @param file - The image file to process
 * @returns `base64` without the `data:` prefix, and `previewUrl` as the full data URL
 * @throws Error if the file is larger than 5x MAX_IMAGE_SIZE, the image cannot
 *   be decoded, or a 2D canvas context cannot be created
 */
export async function processImage(file: File): Promise<{ base64: string; previewUrl: string }> {
  // Allow a larger initial size than MAX_IMAGE_SIZE, since the image is compressed below
  const maxInputSize = MAX_IMAGE_SIZE * 5;
  if (file.size > maxInputSize) {
    throw new Error(`Image too large. Maximum size is ${maxInputSize / 1024 / 1024}MB`);
  }

  return new Promise((resolve, reject) => {
    const img = new Image();
    const objectUrl = URL.createObjectURL(file);

    img.onload = () => {
      // The decoded image is held by `img`; the object URL is no longer needed
      URL.revokeObjectURL(objectUrl);

      // Calculate new dimensions
      let { width, height } = img;
      if (width > MAX_IMAGE_DIMENSION || height > MAX_IMAGE_DIMENSION) {
        const ratio = Math.min(MAX_IMAGE_DIMENSION / width, MAX_IMAGE_DIMENSION / height);
        width = Math.round(width * ratio);
        height = Math.round(height * ratio);
      }

      // Draw to canvas and compress
      const canvas = document.createElement('canvas');
      canvas.width = width;
      canvas.height = height;

      const ctx = canvas.getContext('2d');
      if (!ctx) {
        reject(new Error('Failed to create canvas context'));
        return;
      }

      ctx.drawImage(img, 0, 0, width, height);

      // Encode as JPEG for compression (better than PNG for most cases).
      // NOTE: JPEG has no alpha channel, so transparent pixels are composited
      // onto black by the canvas encoder.
      const quality = 0.85;
      const dataUrl = canvas.toDataURL('image/jpeg', quality);

      // Extract base64 without the data: prefix (Ollama requirement)
      const base64 = dataUrl.replace(/^data:image\/\w+;base64,/, '');

      resolve({ base64, previewUrl: dataUrl });
    };

    img.onerror = () => {
      URL.revokeObjectURL(objectUrl);
      reject(new Error('Failed to load image'));
    };

    img.src = objectUrl;
  });
}

// ============================================================================
// PDF Processing
// ============================================================================

// PDF.js is loaded dynamically when first needed and cached here
let pdfjsLib: typeof import('pdfjs-dist') | null = null;

/**
 * Load the PDF.js library dynamically and configure its worker.
 *
 * The worker is served from `/pdf.worker.min.mjs` when a HEAD request for it
 * succeeds, and from the cdnjs CDN otherwise.
 *
 * @returns The configured PDF.js module
 * @throws Error if the `pdfjs-dist` module cannot be imported
 */
async function loadPdfJs(): Promise<typeof import('pdfjs-dist')> {
  if (pdfjsLib) return pdfjsLib;

  let lib: typeof import('pdfjs-dist');
  try {
    lib = await import('pdfjs-dist');
  } catch {
    throw new Error('PDF.js library not available. Install with: npm install pdfjs-dist');
  }

  // Use locally bundled worker (copied to static/ during build)
  // Falls back to CDN if local worker isn't available
  const localWorkerPath = '/pdf.worker.min.mjs';
  const cdnWorkerPath = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${lib.version}/pdf.worker.min.mjs`;

  try {
    const response = await fetch(localWorkerPath, { method: 'HEAD' });
    lib.GlobalWorkerOptions.workerSrc = response.ok ? localWorkerPath : cdnWorkerPath;
  } catch {
    lib.GlobalWorkerOptions.workerSrc = cdnWorkerPath;
  }

  // Publish the cached module only after the worker is configured, so a
  // concurrent caller never receives a library without workerSrc set
  pdfjsLib = lib;
  return lib;
}

/**
 * Extract text content from a PDF file.
 *
 * Pages are read in order until MAX_EXTRACTED_CONTENT characters have been
 * collected. A page that fails to extract is recorded and skipped rather than
 * aborting the whole document. Notes about failed pages and early stopping are
 * prepended to the returned text.
 *
 * @param file - The PDF file to read
 * @returns The extracted text, truncated to MAX_EXTRACTED_CONTENT if necessary
 * @throws Error if the file exceeds MAX_PDF_SIZE, PDF.js cannot be loaded, or
 *   the document cannot be opened
 */
export async function extractPdfText(file: File): Promise<TruncateResult> {
  if (file.size > MAX_PDF_SIZE) {
    throw new Error(`PDF too large. Maximum size is ${MAX_PDF_SIZE / 1024 / 1024}MB`);
  }

  const pdfjs = await loadPdfJs();
  const arrayBuffer = await file.arrayBuffer();
  const loadingTask = pdfjs.getDocument({ data: arrayBuffer });

  try {
    const pdf = await loadingTask.promise;

    const textParts: string[] = [];
    const failedPages: number[] = [];
    let totalChars = 0;
    let pagesProcessed = 0;
    let stoppedEarly = false;

    for (let i = 1; i <= pdf.numPages; i++) {
      // Stop if we've already collected enough content
      if (totalChars >= MAX_EXTRACTED_CONTENT) {
        stoppedEarly = true;
        break;
      }
      pagesProcessed = i;

      try {
        const page = await pdf.getPage(i);
        const textContent = await page.getTextContent();

        if (!textContent?.items) {
          console.warn(`PDF page ${i}: No text content items`);
          failedPages.push(i);
          continue;
        }

        // Keep only items that carry a string (`str`); others are skipped
        const pageText = textContent.items
          .filter(
            (item): item is import('pdfjs-dist/types/src/display/api').TextItem =>
              'str' in item && typeof item.str === 'string'
          )
          .map((item) => item.str)
          .join(' ')
          .trim();

        if (pageText) {
          textParts.push(pageText);
          totalChars += pageText.length;
        }
      } catch (pageError) {
        // Continue with other pages instead of failing entirely
        console.warn(`PDF page ${i} extraction failed:`, pageError);
        failedPages.push(i);
      }
    }

    let rawContent = textParts.join('\n\n');

    // Add metadata about extraction issues
    const metadata: string[] = [];
    if (failedPages.length > 0) {
      metadata.push(`[Note: Failed to extract pages: ${failedPages.join(', ')}]`);
    }
    if (stoppedEarly) {
      // Report the last page actually read, not the number of non-empty pages
      metadata.push(
        `[Note: Extraction stopped at page ${pagesProcessed} of ${pdf.numPages} due to content limit]`
      );
    }
    if (metadata.length > 0) {
      rawContent = metadata.join('\n') + '\n\n' + rawContent;
    }

    return truncateContent(rawContent);
  } finally {
    // Release the document's resources; a cleanup failure must not mask the result
    await loadingTask.destroy().catch(() => undefined);
  }
}

// ============================================================================
// Main Processing Function
// ============================================================================

/**
 * Create a UUID for a new attachment.
 *
 * `crypto.randomUUID` is only exposed in secure contexts, so fall back to
 * building a version-4 UUID from `crypto.getRandomValues` when it is missing.
 */
function createAttachmentId(): string {
  if (typeof crypto.randomUUID === 'function') {
    return crypto.randomUUID();
  }

  const hex = Array.from(crypto.getRandomValues(new Uint8Array(16)), (byte) =>
    byte.toString(16).padStart(2, '0')
  ).join('');
  // Version nibble is fixed to 4; the variant nibble is forced into 8..b
  const variant = ((parseInt(hex.charAt(16), 16) & 0x3) | 0x8).toString(16);

  return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-4${hex.slice(13, 16)}-${variant}${hex.slice(17, 20)}-${hex.slice(20, 32)}`;
}

/**
 * Process a file and create an attachment.
 * Handles all supported file types (image, text, PDF).
 *
 * Never rejects for processing failures: errors thrown while processing are
 * converted into a `{ success: false, error }` outcome.
 *
 * @param file - The file to process
 * @returns A success outcome carrying the attachment, or a failure outcome with a message
 */
export async function processFile(file: File): Promise<ProcessFileOutcome> {
  const type = detectFileType(file);

  if (!type) {
    return { success: false, error: `Unsupported file type: ${file.type || 'unknown'}` };
  }

  try {
    const baseAttachment: FileAttachment = {
      id: createAttachmentId(),
      type,
      filename: file.name,
      mimeType: file.type,
      size: file.size
    };

    switch (type) {
      case 'image': {
        const { base64, previewUrl } = await processImage(file);
        return {
          success: true,
          attachment: {
            ...baseAttachment,
            base64Data: base64,
            previewUrl,
            originalFile: file
          }
        };
      }

      case 'text': {
        const result = await readTextFile(file);
        return {
          success: true,
          attachment: {
            ...baseAttachment,
            textContent: result.content,
            truncated: result.truncated,
            originalLength: result.originalLength,
            originalFile: file
          }
        };
      }

      case 'pdf': {
        const result = await extractPdfText(file);
        return {
          success: true,
          attachment: {
            ...baseAttachment,
            textContent: result.content,
            truncated: result.truncated,
            originalLength: result.originalLength,
            originalFile: file
          }
        };
      }

      default:
        return { success: false, error: `Unsupported file type: ${type}` };
    }
  } catch (error) {
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error processing file'
    };
  }
}

// ============================================================================
// Utility Functions
// ============================================================================

/**
 * Format file size for display.
 *
 * @param bytes - Size in bytes
 * @returns The size in B, KB, or MB (KB and MB use one decimal place)
 */
export function formatFileSize(bytes: number): string {
  if (bytes < 1024) {
    return `${bytes} B`;
  }
  if (bytes < 1024 * 1024) {
    return `${(bytes / 1024).toFixed(1)} KB`;
  }
  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
}

/**
 * Get a file icon (emoji) based on attachment type.
 *
 * @param type - The attachment type
 * @returns An emoji for the type, or a paperclip for anything unrecognized
 */
export function getFileIcon(type: AttachmentType): string {
  switch (type) {
    case 'image':
      return '🖼️';
    case 'pdf':
      return '📄';
    case 'text':
      return '📝';
    default:
      return '📎';
  }
}

/**
 * Format attachment content for inclusion in a message.
 * Uses XML-style tags for cleaner parsing by LLMs.
 *
 * Only attachments with text content are included. Each is rendered as
 * `<file name="..." size="..." [truncated="true"]>`, the text content, and a
 * closing `</file>`; entries are separated by a blank line.
 *
 * @param attachments - Attachments to render
 * @returns The combined text, or an empty string if no attachment has text content
 */
export function formatAttachmentsForMessage(attachments: FileAttachment[]): string {
  return attachments
    .filter((a) => a.textContent)
    .map((a) => {
      const truncatedAttr = a.truncated ? ' truncated="true"' : '';
      const sizeAttr = ` size="${formatFileSize(a.size)}"`;
      // The file name is user-controlled, so it is escaped before being used as an attribute value
      return `<file name="${escapeXmlAttr(a.filename)}"${sizeAttr}${truncatedAttr}>\n${a.textContent}\n</file>`;
    })
    .join('\n\n');
}

/**
 * Escape special characters for XML attribute values.
 *
 * `&` is replaced first so the entities produced by the later replacements
 * are not escaped a second time.
 */
function escapeXmlAttr(str: string): string {
  return str
    .replace(/&/g, '&amp;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}