diff options
Diffstat (limited to 'llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts')
| -rw-r--r-- | llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts | 150 |
1 files changed, 150 insertions, 0 deletions
diff --git a/llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts b/llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts new file mode 100644 index 0000000..84c456d --- /dev/null +++ b/llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts @@ -0,0 +1,150 @@ +/** + * PDF processing utilities using PDF.js + * Handles PDF text extraction and image conversion in the browser + */ + +import { browser } from '$app/environment'; +import { MimeTypeApplication, MimeTypeImage } from '$lib/enums'; +import * as pdfjs from 'pdfjs-dist'; + +type TextContent = { + items: Array<{ str: string }>; +}; + +if (browser) { + // Import worker as text and create blob URL for inline bundling + import('pdfjs-dist/build/pdf.worker.min.mjs?raw') + .then((workerModule) => { + const workerBlob = new Blob([workerModule.default], { type: 'application/javascript' }); + pdfjs.GlobalWorkerOptions.workerSrc = URL.createObjectURL(workerBlob); + }) + .catch(() => { + console.warn('Failed to load PDF.js worker, PDF processing may not work'); + }); +} + +/** + * Convert a File object to ArrayBuffer for PDF.js processing + * @param file - The PDF file to convert + * @returns Promise resolving to the file's ArrayBuffer + */ +async function getFileAsBuffer(file: File): Promise<ArrayBuffer> { + return new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onload = (event) => { + if (event.target?.result) { + resolve(event.target.result as ArrayBuffer); + } else { + reject(new Error('Failed to read file.')); + } + }; + reader.onerror = () => { + reject(new Error('Failed to read file.')); + }; + reader.readAsArrayBuffer(file); + }); +} + +/** + * Extract text content from a PDF file + * @param file - The PDF file to process + * @returns Promise resolving to the extracted text content + */ +export async function convertPDFToText(file: File): Promise<string> { + if (!browser) { + throw new Error('PDF processing is only available in the browser'); + } + + try { + const buffer = await getFileAsBuffer(file); + const pdf = await pdfjs.getDocument(buffer).promise; + const numPages = pdf.numPages; + + const textContentPromises: Promise<TextContent>[] = []; + + for (let i = 1; i <= numPages; i++) { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + textContentPromises.push(pdf.getPage(i).then((page: any) => page.getTextContent())); + } + + const textContents = await Promise.all(textContentPromises); + const textItems = textContents.flatMap((textContent: TextContent) => + textContent.items.map((item) => item.str ?? '') + ); + + return textItems.join('\n'); + } catch (error) { + console.error('Error converting PDF to text:', error); + throw new Error( + `Failed to convert PDF to text: ${error instanceof Error ? error.message : 'Unknown error'}` + ); + } +} + +/** + * Convert PDF pages to PNG images as data URLs + * @param file - The PDF file to convert + * @param scale - Rendering scale factor (default: 1.5) + * @returns Promise resolving to array of PNG data URLs + */ +export async function convertPDFToImage(file: File, scale: number = 1.5): Promise<string[]> { + if (!browser) { + throw new Error('PDF processing is only available in the browser'); + } + + try { + const buffer = await getFileAsBuffer(file); + const doc = await pdfjs.getDocument(buffer).promise; + const pages: Promise<string>[] = []; + + for (let i = 1; i <= doc.numPages; i++) { + const page = await doc.getPage(i); + const viewport = page.getViewport({ scale }); + const canvas = document.createElement('canvas'); + const ctx = canvas.getContext('2d'); + + canvas.width = viewport.width; + canvas.height = viewport.height; + + if (!ctx) { + throw new Error('Failed to get 2D context from canvas'); + } + + const task = page.render({ + canvasContext: ctx, + viewport: viewport, + canvas: canvas + }); + pages.push( + task.promise.then(() => { + return canvas.toDataURL(MimeTypeImage.PNG); + }) + ); + } + + return await Promise.all(pages); + } catch (error) { + console.error('Error converting PDF to images:', error); + throw new Error( + `Failed to convert PDF to images: ${error instanceof Error ? error.message : 'Unknown error'}` + ); + } +} + +/** + * Check if a file is a PDF based on its MIME type + * @param file - The file to check + * @returns True if the file is a PDF + */ +export function isPdfFile(file: File): boolean { + return file.type === MimeTypeApplication.PDF; +} + +/** + * Check if a MIME type represents a PDF + * @param mimeType - The MIME type to check + * @returns True if the MIME type is application/pdf + */ +export function isApplicationMimeType(mimeType: string): boolean { + return mimeType === MimeTypeApplication.PDF; +} |
