1/**
2 * PDF processing utilities using PDF.js
3 * Handles PDF text extraction and image conversion in the browser
4 */
5
6import { browser } from '$app/environment';
7import { MimeTypeApplication, MimeTypeImage } from '$lib/enums';
8import * as pdfjs from 'pdfjs-dist';
9
/**
 * Minimal shape of a page's extracted text as used by this module:
 * a list of items, each exposing its text through `str`.
 */
interface TextContent {
	items: { str: string }[];
}
13
if (browser) {
	// Load the PDF.js worker source as raw text and expose it through a blob URL,
	// so the worker is bundled inline instead of being fetched from a separate file.
	// The blob URL is deliberately never revoked: it is stored in
	// GlobalWorkerOptions.workerSrc for later use by PDF.js.
	//
	// NOTE(review): workerSrc is assigned asynchronously. A conversion started
	// before this import settles would run without it — confirm callers cannot
	// hit that window.
	import('pdfjs-dist/build/pdf.worker.min.mjs?raw')
		.then((workerModule) => {
			const workerBlob = new Blob([workerModule.default], { type: 'application/javascript' });
			pdfjs.GlobalWorkerOptions.workerSrc = URL.createObjectURL(workerBlob);
		})
		.catch((error: unknown) => {
			// Keep this best-effort (no rethrow), but surface the cause so a failed
			// worker load can actually be diagnosed.
			console.warn('Failed to load PDF.js worker, PDF processing may not work', error);
		});
}
25
/**
 * Read the full contents of a File into an ArrayBuffer.
 *
 * Wraps the callback-based FileReader API in a promise so callers can await it.
 *
 * @param file - The file whose bytes should be read
 * @returns Promise resolving to the file contents as an ArrayBuffer
 * @throws Error with the message "Failed to read file." when the reader reports
 *   an error or completes without a result
 */
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
	const fileReader = new FileReader();

	const contents = new Promise<ArrayBuffer>((resolve, reject) => {
		// Both failure paths reject with the same message.
		const fail = (): void => reject(new Error('Failed to read file.'));

		fileReader.onerror = fail;
		fileReader.onload = (loadEvent) => {
			const result = loadEvent.target?.result;
			if (!result) {
				fail();
				return;
			}
			resolve(result as ArrayBuffer);
		};
	});

	// Handlers are attached above, before the read is started.
	fileReader.readAsArrayBuffer(file);
	return contents;
}
47
/**
 * Extract the text content of every page of a PDF file.
 *
 * All pages are requested concurrently; their text items are concatenated in
 * page order and joined with newlines (one item per line).
 *
 * The PDF.js loading task is destroyed once extraction finishes or fails, so
 * the document's resources are not left alive after the call.
 *
 * @param file - The PDF file to process
 * @returns Promise resolving to the extracted text content
 * @throws Error if called outside the browser, or a wrapped
 *   "Failed to convert PDF to text: …" error if reading or parsing fails
 */
export async function convertPDFToText(file: File): Promise<string> {
	if (!browser) {
		throw new Error('PDF processing is only available in the browser');
	}

	try {
		const buffer = await getFileAsBuffer(file);
		const loadingTask = pdfjs.getDocument(buffer);

		try {
			const pdf = await loadingTask.promise;
			const numPages = pdf.numPages;

			const textContentPromises: Promise<TextContent>[] = [];

			// PDF.js page numbers are 1-based.
			for (let i = 1; i <= numPages; i++) {
				// eslint-disable-next-line @typescript-eslint/no-explicit-any
				textContentPromises.push(pdf.getPage(i).then((page: any) => page.getTextContent()));
			}

			// Promise.all preserves page order regardless of completion order.
			const textContents = await Promise.all(textContentPromises);
			const textItems = textContents.flatMap((textContent: TextContent) =>
				// Items without a string contribute an empty line.
				textContent.items.map((item) => item.str ?? '')
			);

			return textItems.join('\n');
		} finally {
			// Best-effort cleanup: per the PDF.js API, destroy() aborts pending work
			// and tears down the document's worker. A cleanup failure must not mask
			// the extraction result or the original error.
			await loadingTask.destroy().catch((cleanupError: unknown) => {
				console.warn('Failed to release PDF.js document resources:', cleanupError);
			});
		}
	} catch (error) {
		console.error('Error converting PDF to text:', error);
		throw new Error(
			`Failed to convert PDF to text: ${error instanceof Error ? error.message : 'Unknown error'}`
		);
	}
}
83
/**
 * Render every page of a PDF to a PNG image and return the images as data URLs.
 *
 * Each page is drawn onto its own off-DOM canvas at the requested scale. The
 * returned array is in page order.
 *
 * The PDF.js loading task is destroyed once rendering finishes or fails, and
 * each canvas is shrunk to zero size after encoding, so a conversion does not
 * leave document resources or large bitmaps alive.
 *
 * @param file - The PDF file to convert
 * @param scale - Rendering scale factor (default: 1.5)
 * @returns Promise resolving to array of PNG data URLs, one per page
 * @throws Error if called outside the browser, or a wrapped
 *   "Failed to convert PDF to images: …" error if reading, parsing or
 *   rendering fails
 */
export async function convertPDFToImage(file: File, scale: number = 1.5): Promise<string[]> {
	if (!browser) {
		throw new Error('PDF processing is only available in the browser');
	}

	try {
		const buffer = await getFileAsBuffer(file);
		const loadingTask = pdfjs.getDocument(buffer);

		try {
			const doc = await loadingTask.promise;

			// Render a single (1-based) page and encode it as a PNG data URL.
			const renderPage = async (pageNumber: number): Promise<string> => {
				const page = await doc.getPage(pageNumber);
				const viewport = page.getViewport({ scale });
				const canvas = document.createElement('canvas');
				const ctx = canvas.getContext('2d');

				if (!ctx) {
					throw new Error('Failed to get 2D context from canvas');
				}

				canvas.width = viewport.width;
				canvas.height = viewport.height;

				try {
					await page.render({
						canvasContext: ctx,
						viewport: viewport,
						canvas: canvas
					}).promise;

					return canvas.toDataURL(MimeTypeImage.PNG);
				} finally {
					// Zero the dimensions so the browser can free the bitmap promptly
					// instead of waiting for the canvas to be garbage-collected.
					canvas.width = 0;
					canvas.height = 0;
				}
			};

			const pageNumbers = Array.from({ length: doc.numPages }, (_, index) => index + 1);

			// Every per-page promise is observed by Promise.all, so a failure on one
			// page cannot leave other rejections unhandled. Result order is page order.
			return await Promise.all(pageNumbers.map(renderPage));
		} finally {
			// Best-effort cleanup: per the PDF.js API, destroy() aborts pending work
			// and tears down the document's worker. A cleanup failure must not mask
			// the rendering result or the original error.
			await loadingTask.destroy().catch((cleanupError: unknown) => {
				console.warn('Failed to release PDF.js document resources:', cleanupError);
			});
		}
	} catch (error) {
		console.error('Error converting PDF to images:', error);
		throw new Error(
			`Failed to convert PDF to images: ${error instanceof Error ? error.message : 'Unknown error'}`
		);
	}
}
133
/**
 * Tell whether a file is a PDF, judged solely by its reported MIME type.
 *
 * The file's name and contents are not inspected.
 *
 * @param file - The file to check
 * @returns True when the file's `type` equals the PDF MIME type
 */
export function isPdfFile(file: File): boolean {
	const { type: reportedType } = file;
	return reportedType === MimeTypeApplication.PDF;
}
142
/**
 * Tell whether a MIME type string is the PDF MIME type.
 *
 * Despite the general-sounding name, the only value this accepts is
 * `MimeTypeApplication.PDF`; the comparison is an exact match.
 *
 * @param mimeType - The MIME type to check
 * @returns True when `mimeType` equals the PDF MIME type
 */
export function isApplicationMimeType(mimeType: string): boolean {
	const pdfMimeType = MimeTypeApplication.PDF;
	return mimeType === pdfMimeType;
}