summaryrefslogtreecommitdiff
path: root/llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts')
-rw-r--r--llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts150
1 files changed, 150 insertions, 0 deletions
diff --git a/llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts b/llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts
new file mode 100644
index 0000000..84c456d
--- /dev/null
+++ b/llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts
@@ -0,0 +1,150 @@
+/**
+ * PDF processing utilities using PDF.js
+ * Handles PDF text extraction and image conversion in the browser
+ */
+
+import { browser } from '$app/environment';
+import { MimeTypeApplication, MimeTypeImage } from '$lib/enums';
+import * as pdfjs from 'pdfjs-dist';
+
+/**
+ * Minimal structural view of the value resolved by a PDF.js page's
+ * `getTextContent()`, limited to the single field this module reads.
+ *
+ * `str` is optional: the consumer in this file already falls back to an empty
+ * string when it is missing, and PDF.js types the items as a union that
+ * includes marked-content entries, which carry no text.
+ */
+type TextContent = {
+  items: Array<{ str?: string }>;
+};
+
+// Configure the PDF.js worker once, at module load, when `browser` is true.
+// The worker script is loaded asynchronously, so `workerSrc` is only set after
+// the dynamic import resolves. The object URL created here is not revoked in
+// this block.
+if (browser) {
+  // Import worker as text and create blob URL for inline bundling
+  import('pdfjs-dist/build/pdf.worker.min.mjs?raw')
+    .then((workerModule) => {
+      const workerBlob = new Blob([workerModule.default], { type: 'application/javascript' });
+      pdfjs.GlobalWorkerOptions.workerSrc = URL.createObjectURL(workerBlob);
+    })
+    .catch((error: unknown) => {
+      // Best-effort: the module stays importable, but the underlying cause is
+      // logged so a failed worker load can be diagnosed.
+      console.warn('Failed to load PDF.js worker, PDF processing may not work', error);
+    });
+}
+
+/**
+ * Read the complete contents of a File into an ArrayBuffer using FileReader.
+ * @param file - The file whose bytes should be read
+ * @returns Promise resolving to the bytes of the file; it rejects with
+ * `Error('Failed to read file.')` when the reader reports an error or
+ * finishes without a result
+ */
+async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
+  const fileReader = new FileReader();
+
+  const contents = new Promise<ArrayBuffer>((resolve, reject) => {
+    fileReader.onerror = () => {
+      reject(new Error('Failed to read file.'));
+    };
+
+    fileReader.onload = (loadEvent) => {
+      const loaded = loadEvent.target?.result;
+
+      // A load event without a result is treated the same as a read error.
+      if (!loaded) {
+        reject(new Error('Failed to read file.'));
+        return;
+      }
+
+      resolve(loaded as ArrayBuffer);
+    };
+  });
+
+  // Handlers are attached above, before the read is started.
+  fileReader.readAsArrayBuffer(file);
+
+  return contents;
+}
+
+/**
+ * Extract text content from a PDF file.
+ *
+ * Text content is requested for every page, and each text item's string is
+ * joined with a newline, in page order.
+ * @param file - The PDF file to process
+ * @returns Promise resolving to the extracted text content
+ * @throws Error when called outside the browser, or when reading or parsing
+ * the PDF fails (the original error is logged and its message is included)
+ */
+export async function convertPDFToText(file: File): Promise<string> {
+  if (!browser) {
+    throw new Error('PDF processing is only available in the browser');
+  }
+
+  try {
+    const buffer = await getFileAsBuffer(file);
+    const loadingTask = pdfjs.getDocument(buffer);
+
+    try {
+      const pdf = await loadingTask.promise;
+      const numPages = pdf.numPages;
+
+      const textContentPromises: Promise<TextContent>[] = [];
+
+      // PDF.js page numbers are 1-based.
+      for (let i = 1; i <= numPages; i++) {
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        textContentPromises.push(pdf.getPage(i).then((page: any) => page.getTextContent()));
+      }
+
+      const textContents = await Promise.all(textContentPromises);
+      const textItems = textContents.flatMap((textContent: TextContent) =>
+        textContent.items.map((item) => item.str ?? '')
+      );
+
+      return textItems.join('\n');
+    } finally {
+      // Release the document and its worker-side resources on success and on
+      // failure. Cleanup is best-effort so that it never masks the real result
+      // or the real error.
+      try {
+        await loadingTask.destroy();
+      } catch {
+        // Ignored on purpose: nothing useful can be done if teardown fails.
+      }
+    }
+  } catch (error) {
+    console.error('Error converting PDF to text:', error);
+    throw new Error(
+      `Failed to convert PDF to text: ${error instanceof Error ? error.message : 'Unknown error'}`
+    );
+  }
+}
+
+/**
+ * Convert PDF pages to PNG images as data URLs.
+ *
+ * Each page is rendered into its own off-DOM canvas sized to the scaled
+ * viewport. Pages are fetched one after another, while the render tasks that
+ * were started are awaited together at the end.
+ * @param file - The PDF file to convert
+ * @param scale - Rendering scale factor (default: 1.5)
+ * @returns Promise resolving to array of PNG data URLs, one per page in page order
+ * @throws Error when called outside the browser, when a 2D canvas context is
+ * unavailable, or when reading, parsing or rendering the PDF fails
+ */
+export async function convertPDFToImage(file: File, scale: number = 1.5): Promise<string[]> {
+  if (!browser) {
+    throw new Error('PDF processing is only available in the browser');
+  }
+
+  try {
+    const buffer = await getFileAsBuffer(file);
+    const loadingTask = pdfjs.getDocument(buffer);
+    const pages: Promise<string>[] = [];
+
+    try {
+      const doc = await loadingTask.promise;
+
+      // PDF.js page numbers are 1-based.
+      for (let i = 1; i <= doc.numPages; i++) {
+        const page = await doc.getPage(i);
+        const viewport = page.getViewport({ scale });
+        const canvas = document.createElement('canvas');
+        const ctx = canvas.getContext('2d');
+
+        canvas.width = viewport.width;
+        canvas.height = viewport.height;
+
+        if (!ctx) {
+          throw new Error('Failed to get 2D context from canvas');
+        }
+
+        const task = page.render({
+          canvasContext: ctx,
+          viewport: viewport,
+          canvas: canvas
+        });
+        pages.push(
+          task.promise.then(() => {
+            return canvas.toDataURL(MimeTypeImage.PNG);
+          })
+        );
+      }
+
+      return await Promise.all(pages);
+    } finally {
+      // If the loop failed part-way, renders that were already started are
+      // still pending. Wait for them to settle so that none is left as an
+      // unhandled rejection once the document is torn down.
+      await Promise.allSettled(pages);
+
+      // Release the document and its worker-side resources. Cleanup is
+      // best-effort so that it never masks the real result or the real error.
+      try {
+        await loadingTask.destroy();
+      } catch {
+        // Ignored on purpose: nothing useful can be done if teardown fails.
+      }
+    }
+  } catch (error) {
+    console.error('Error converting PDF to images:', error);
+    throw new Error(
+      `Failed to convert PDF to images: ${error instanceof Error ? error.message : 'Unknown error'}`
+    );
+  }
+}
+
+/**
+ * Tell whether a file reports the PDF MIME type.
+ *
+ * Only `file.type` is inspected; the file name and contents are not looked at.
+ * @param file - The file to check
+ * @returns True when `file.type` equals the PDF MIME type
+ */
+export function isPdfFile(file: File): boolean {
+  const { type: reportedType } = file;
+
+  return reportedType === MimeTypeApplication.PDF;
+}
+
+/**
+ * Tell whether a MIME type string is the PDF MIME type.
+ *
+ * Despite the general-sounding name, the comparison is an exact match against
+ * the PDF MIME type only.
+ * @param mimeType - The MIME type to check
+ * @returns True when `mimeType` equals the PDF MIME type
+ */
+export function isApplicationMimeType(mimeType: string): boolean {
+  const matchesPdf = mimeType === MimeTypeApplication.PDF;
+
+  return matchesPdf;
+}