1/**
  2 * PDF processing utilities using PDF.js
  3 * Handles PDF text extraction and image conversion in the browser
  4 */
  5
  6import { browser } from '$app/environment';
  7import { MimeTypeApplication, MimeTypeImage } from '$lib/enums';
  8import * as pdfjs from 'pdfjs-dist';
  9
 10type TextContent = {
 11	items: Array<{ str: string }>;
 12};
 13
 14if (browser) {
 15	// Import worker as text and create blob URL for inline bundling
 16	import('pdfjs-dist/build/pdf.worker.min.mjs?raw')
 17		.then((workerModule) => {
 18			const workerBlob = new Blob([workerModule.default], { type: 'application/javascript' });
 19			pdfjs.GlobalWorkerOptions.workerSrc = URL.createObjectURL(workerBlob);
 20		})
 21		.catch(() => {
 22			console.warn('Failed to load PDF.js worker, PDF processing may not work');
 23		});
 24}
 25
 26/**
 27 * Convert a File object to ArrayBuffer for PDF.js processing
 28 * @param file - The PDF file to convert
 29 * @returns Promise resolving to the file's ArrayBuffer
 30 */
 31async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
 32	return new Promise((resolve, reject) => {
 33		const reader = new FileReader();
 34		reader.onload = (event) => {
 35			if (event.target?.result) {
 36				resolve(event.target.result as ArrayBuffer);
 37			} else {
 38				reject(new Error('Failed to read file.'));
 39			}
 40		};
 41		reader.onerror = () => {
 42			reject(new Error('Failed to read file.'));
 43		};
 44		reader.readAsArrayBuffer(file);
 45	});
 46}
 47
 48/**
 49 * Extract text content from a PDF file
 50 * @param file - The PDF file to process
 51 * @returns Promise resolving to the extracted text content
 52 */
 53export async function convertPDFToText(file: File): Promise<string> {
 54	if (!browser) {
 55		throw new Error('PDF processing is only available in the browser');
 56	}
 57
 58	try {
 59		const buffer = await getFileAsBuffer(file);
 60		const pdf = await pdfjs.getDocument(buffer).promise;
 61		const numPages = pdf.numPages;
 62
 63		const textContentPromises: Promise<TextContent>[] = [];
 64
 65		for (let i = 1; i <= numPages; i++) {
 66			// eslint-disable-next-line @typescript-eslint/no-explicit-any
 67			textContentPromises.push(pdf.getPage(i).then((page: any) => page.getTextContent()));
 68		}
 69
 70		const textContents = await Promise.all(textContentPromises);
 71		const textItems = textContents.flatMap((textContent: TextContent) =>
 72			textContent.items.map((item) => item.str ?? '')
 73		);
 74
 75		return textItems.join('\n');
 76	} catch (error) {
 77		console.error('Error converting PDF to text:', error);
 78		throw new Error(
 79			`Failed to convert PDF to text: ${error instanceof Error ? error.message : 'Unknown error'}`
 80		);
 81	}
 82}
 83
 84/**
 85 * Convert PDF pages to PNG images as data URLs
 86 * @param file - The PDF file to convert
 87 * @param scale - Rendering scale factor (default: 1.5)
 88 * @returns Promise resolving to array of PNG data URLs
 89 */
 90export async function convertPDFToImage(file: File, scale: number = 1.5): Promise<string[]> {
 91	if (!browser) {
 92		throw new Error('PDF processing is only available in the browser');
 93	}
 94
 95	try {
 96		const buffer = await getFileAsBuffer(file);
 97		const doc = await pdfjs.getDocument(buffer).promise;
 98		const pages: Promise<string>[] = [];
 99
100		for (let i = 1; i <= doc.numPages; i++) {
101			const page = await doc.getPage(i);
102			const viewport = page.getViewport({ scale });
103			const canvas = document.createElement('canvas');
104			const ctx = canvas.getContext('2d');
105
106			canvas.width = viewport.width;
107			canvas.height = viewport.height;
108
109			if (!ctx) {
110				throw new Error('Failed to get 2D context from canvas');
111			}
112
113			const task = page.render({
114				canvasContext: ctx,
115				viewport: viewport,
116				canvas: canvas
117			});
118			pages.push(
119				task.promise.then(() => {
120					return canvas.toDataURL(MimeTypeImage.PNG);
121				})
122			);
123		}
124
125		return await Promise.all(pages);
126	} catch (error) {
127		console.error('Error converting PDF to images:', error);
128		throw new Error(
129			`Failed to convert PDF to images: ${error instanceof Error ? error.message : 'Unknown error'}`
130		);
131	}
132}
133
/**
 * Tell whether a file's reported MIME type is the PDF MIME type
 * @param file - The file to inspect (only its `type` is read)
 * @returns True when `file.type` equals `MimeTypeApplication.PDF`
 */
export function isPdfFile(file: File): boolean {
	const { type: reportedType } = file;
	return reportedType === MimeTypeApplication.PDF;
}
142
/**
 * Tell whether a MIME type string is the PDF MIME type
 * @param mimeType - The MIME type string to compare
 * @returns True when `mimeType` strictly equals `MimeTypeApplication.PDF`
 */
export function isApplicationMimeType(mimeType: string): boolean {
	const pdfMimeType: string = MimeTypeApplication.PDF;
	return pdfMimeType === mimeType;
}