1/**
 2 * Text file processing utilities
 3 * Handles text file detection, reading, and validation
 4 */
 5
 6import {
 7	DEFAULT_BINARY_DETECTION_OPTIONS,
 8	type BinaryDetectionOptions
 9} from '$lib/constants/binary-detection';
10import { FileExtensionText } from '$lib/enums';
11
/**
 * Decide from the name alone whether a file should be treated as text.
 * The name is lowercased and then tested against every value of the
 * `FileExtensionText` enum as a suffix.
 * @param filename - Name of the file to inspect
 * @returns True when the lowercased name ends with one of the enum's extensions
 */
export function isTextFileByName(filename: string): boolean {
	const loweredName = filename.toLowerCase();

	for (const extension of Object.values(FileExtensionText)) {
		if (loweredName.endsWith(extension)) {
			return true;
		}
	}

	return false;
}
22
/**
 * Load the contents of a `File` as a string using `FileReader.readAsText`.
 * @param file - The file whose contents should be read
 * @returns Promise that resolves with the text once loading completes; it
 * rejects when the reader reports an error or finishes without a result
 */
export async function readFileAsText(file: File): Promise<string> {
	return new Promise<string>((resolve, reject) => {
		const fileReader = new FileReader();

		fileReader.onerror = () => reject(new Error('File reading error'));

		fileReader.onload = (loadEvent) => {
			const loaded = loadEvent.target?.result;

			// A missing event target or an absent result means nothing was read
			if (loaded === null || loaded === undefined) {
				reject(new Error('Failed to read file'));
				return;
			}

			resolve(loaded as string);
		};

		fileReader.readAsText(file);
	});
}
45
/**
 * Guess whether a decoded string came from a text file rather than a binary one.
 * Only the leading `prefixLength` characters are examined. Within that slice,
 * NUL characters are tallied against `maxAbsoluteNullBytes`, and the share of
 * suspicious characters is compared with `suspiciousCharThresholdRatio`.
 * @param content - The decoded file content to inspect
 * @param options - Overrides merged on top of `DEFAULT_BINARY_DETECTION_OPTIONS`
 * @returns False when either limit is exceeded, true otherwise (including for empty content)
 */
export function isLikelyTextFile(
	content: string,
	options: Partial<BinaryDetectionOptions> = {}
): boolean {
	if (!content) {
		return true;
	}

	const { prefixLength, maxAbsoluteNullBytes, suspiciousCharThresholdRatio } = {
		...DEFAULT_BINARY_DETECTION_OPTIONS,
		...options
	};
	const inspected = content.substring(0, prefixLength);
	const inspectedLength = inspected.length;

	let nulls = 0;
	let suspicious = 0;

	for (let index = 0; index < inspectedLength; index += 1) {
		const code = inspected.charCodeAt(index);

		if (code === 0) {
			// NUL characters are tracked separately; they have their own absolute limit
			nulls += 1;
			continue;
		}

		// Codes 1-7 and 14-26 count as suspicious. Tab (9), newline (10) and
		// carriage return (13) are ordinary whitespace, and the remaining codes
		// below 32 (8, 11, 12, 27-31) are not counted.
		const isLowControl = code < 8;
		const isHighControl = code > 13 && code < 27;
		// U+FFFD is the replacement character, which indicates encoding issues
		const isReplacementChar = code === 0xfffd;

		if (isLowControl || isHighControl || isReplacementChar) {
			suspicious += 1;
		}
	}

	const tooManyNulls = nulls > maxAbsoluteNullBytes;
	const tooManySuspicious = suspicious / inspectedLength > suspiciousCharThresholdRatio;

	return !(tooManyNulls || tooManySuspicious);
}