llmnpc - llama.cpp/tools/server/webui/src/lib/utils/latex-protection.ts

Path: llmnpc / llama.cpp / tools / server / webui / src / lib / utils / latex-protection.ts (raw)
  1import {
  2	CODE_BLOCK_REGEXP,
  3	LATEX_MATH_AND_CODE_PATTERN,
  4	LATEX_LINEBREAK_REGEXP,
  5	MHCHEM_PATTERN_MAP
  6} from '$lib/constants/latex-protection';
  7
  8/**
  9 * Replaces inline LaTeX expressions enclosed in `$...$` with placeholders, avoiding dollar signs
 10 * that appear to be part of monetary values or identifiers.
 11 *
 12 * This function processes the input line by line and skips `$` sequences that are likely
 13 * part of money amounts (e.g., `$5`, `$100.99`) or code-like tokens (e.g., `var$`, `$var`).
 14 * Valid LaTeX inline math is replaced with a placeholder like `<<LATEX_0>>`, and the
 15 * actual LaTeX content is stored in the provided `latexExpressions` array.
 16 *
 17 * @param content - The input text potentially containing LaTeX expressions.
 18 * @param latexExpressions - An array used to collect extracted LaTeX expressions.
 19 * @returns The processed string with LaTeX replaced by placeholders.
 20 */
 21export function maskInlineLaTeX(content: string, latexExpressions: string[]): string {
 22	if (!content.includes('$')) {
 23		return content;
 24	}
 25	return content
 26		.split('\n')
 27		.map((line) => {
 28			if (line.indexOf('$') == -1) {
 29				return line;
 30			}
 31
 32			let processedLine = '';
 33			let currentPosition = 0;
 34
 35			while (currentPosition < line.length) {
 36				const openDollarIndex = line.indexOf('$', currentPosition);
 37
 38				if (openDollarIndex == -1) {
 39					processedLine += line.slice(currentPosition);
 40					break;
 41				}
 42
 43				// Is there a next $-sign?
 44				const closeDollarIndex = line.indexOf('$', openDollarIndex + 1);
 45
 46				if (closeDollarIndex == -1) {
 47					processedLine += line.slice(currentPosition);
 48					break;
 49				}
 50
 51				const charBeforeOpen = openDollarIndex > 0 ? line[openDollarIndex - 1] : '';
 52				const charAfterOpen = line[openDollarIndex + 1];
 53				const charBeforeClose =
 54					openDollarIndex + 1 < closeDollarIndex ? line[closeDollarIndex - 1] : '';
 55				const charAfterClose = closeDollarIndex + 1 < line.length ? line[closeDollarIndex + 1] : '';
 56
 57				let shouldSkipAsNonLatex = false;
 58
 59				if (closeDollarIndex == currentPosition + 1) {
 60					// No content
 61					shouldSkipAsNonLatex = true;
 62				}
 63
 64				if (/[A-Za-z0-9_$-]/.test(charBeforeOpen)) {
 65					// Character, digit, $, _ or - before first '$', no TeX.
 66					shouldSkipAsNonLatex = true;
 67				}
 68
 69				if (
 70					/[0-9]/.test(charAfterOpen) &&
 71					(/[A-Za-z0-9_$-]/.test(charAfterClose) || ' ' == charBeforeClose)
 72				) {
 73					// First $ seems to belong to an amount.
 74					shouldSkipAsNonLatex = true;
 75				}
 76
 77				if (shouldSkipAsNonLatex) {
 78					processedLine += line.slice(currentPosition, openDollarIndex + 1);
 79					currentPosition = openDollarIndex + 1;
 80
 81					continue;
 82				}
 83
 84				// Treat as LaTeX
 85				processedLine += line.slice(currentPosition, openDollarIndex);
 86				const latexContent = line.slice(openDollarIndex, closeDollarIndex + 1);
 87				latexExpressions.push(latexContent);
 88				processedLine += `<<LATEX_${latexExpressions.length - 1}>>`;
 89				currentPosition = closeDollarIndex + 1;
 90			}
 91
 92			return processedLine;
 93		})
 94		.join('\n');
 95}
 96
 97function escapeBrackets(text: string): string {
 98	return text.replace(
 99		LATEX_MATH_AND_CODE_PATTERN,
100		(
101			match: string,
102			codeBlock: string | undefined,
103			squareBracket: string | undefined,
104			roundBracket: string | undefined
105		): string => {
106			if (codeBlock != null) {
107				return codeBlock;
108			} else if (squareBracket != null) {
109				return `$$${squareBracket}$$`;
110			} else if (roundBracket != null) {
111				return `$${roundBracket}$`;
112			}
113
114			return match;
115		}
116	);
117}
118
/**
 * Applies every `[pattern, replacement]` pair of `MHCHEM_PATTERN_MAP` to `text`, in order,
 * feeding the result of each replacement into the next one.
 *
 * The pairs themselves are defined elsewhere; presumably they adjust the escaping of mhchem
 * commands such as `\ce{...}` and `\pu{...}` (TODO confirm against the constant's definition).
 */
function escapeMhchem(text: string): string {
	let escaped = text;

	for (const [pattern, replacement] of MHCHEM_PATTERN_MAP) {
		escaped = escaped.replace(pattern, replacement);
	}

	return escaped;
}
125
// Switch for the mhchem escaping pass: while this is `false`, `preprocessLaTeX` never calls
// `escapeMhchem`, even when the content contains `\ce{` or `\pu{`.
const doEscapeMhchem = false;
127
/**
 * Preprocesses markdown content to safely handle LaTeX math expressions while protecting
 * against false positives (e.g., dollar amounts like $5.99) and ensuring proper rendering.
 *
 * Pipeline (the order matters):
 * 0. Strip leading blockquote markers (`>`), remembering them by line index.
 * 1. Replace everything matched by `CODE_BLOCK_REGEXP` with `<<CODE_BLOCK_n>>` placeholders.
 * 2. Replace `\[...\]`, `\(...\)`, `$$...$$` and selected `$...$` with `<<LATEX_n>>` placeholders.
 * 3. Escape every remaining `$` that is directly followed by a digit (`$5` → `\$5`).
 * 4. Restore the LaTeX placeholders.
 * 5. Apply `escapeBrackets` (and `escapeMhchem` when enabled).
 * 6. Convert remaining `\(...\)` → `$...$` and `\[...\]` → `$$...$$`.
 * 7. Restore the code placeholders.
 * 8. Re-attach the blockquote markers.
 *
 * Placeholder-looking text whose number does not refer to a protected expression (for example a
 * literal `<<LATEX_7>>` typed by the user) is left untouched.
 *
 * @param content - The raw text (e.g., markdown) that may contain LaTeX or code blocks.
 * @returns The preprocessed string with properly escaped and normalized LaTeX.
 *
 * @example
 * preprocessLaTeX("Price: $10. The equation is \\(x^2\\).")
 * // → "Price: \\$10. The equation is $x^2$."
 */
export function preprocessLaTeX(content: string): string {
	// See also:
	// https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts

	// Step 0: Temporarily remove blockquote markers (>) to process LaTeX correctly.
	// Store the structure so we can restore it later.
	// NOTE(review): markers are keyed by line index, but Step 2 and Step 4 can insert line
	// breaks around display formulas. When that happens before a quoted line, Step 8 attaches
	// the marker to a different line than the one it came from.
	const blockquoteMarkers: Map<number, string> = new Map();
	const lines = content.split('\n');
	const processedLines = lines.map((line, index) => {
		const match = line.match(/^(>\s*)/);
		if (match) {
			blockquoteMarkers.set(index, match[1]);
			return line.slice(match[1].length);
		}
		return line;
	});
	content = processedLines.join('\n');

	// Step 1: Protect code blocks
	const codeBlocks: string[] = [];

	content = content.replace(CODE_BLOCK_REGEXP, (match) => {
		codeBlocks.push(match);

		return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
	});

	// Step 2: Protect existing LaTeX expressions
	const latexExpressions: string[] = [];

	// Match \S...\[...\] (a display formula that starts after other text on its line),
	// protect the formula and, when nothing follows it, put it on a line of its own.
	content = content.replace(
		/([\S].*?)\\\[([\s\S]*?)\\\](.*)/g,
		(match: string, group1: string, group2: string, group3: string) => {
			if (group1.endsWith('\\')) {
				return match; // Backslash before \[, do nothing.
			}

			// Check if there are characters following the formula (display-formula in a table-cell?)
			const hasSuffix = /\S/.test(group3);

			if (hasSuffix) {
				latexExpressions.push(`\\(${group2.trim()}\\)`); // Convert into inline.
			} else {
				latexExpressions.push(`\\[${group2}\\]`);
			}

			const optBreak = hasSuffix ? '' : '\n';

			return `${group1}${optBreak}<<LATEX_${latexExpressions.length - 1}>>${optBreak}${group3}`;
		}
	);

	// Match \(...\), \[...\], $$...$$ and protect them
	content = content.replace(
		/(\$\$[\s\S]*?\$\$|(?<!\\)\\\[[\s\S]*?\\\]|(?<!\\)\\\(.*?\\\))/g,
		(match) => {
			latexExpressions.push(match);

			return `<<LATEX_${latexExpressions.length - 1}>>`;
		}
	);

	// Protect inline $...$ but NOT if it looks like money (e.g., $10, $3.99)
	content = maskInlineLaTeX(content, latexExpressions);

	// Step 3: Escape standalone $ before digits (currency like $5 → \$5)
	// (Now that inline math is protected, this will only escape dollars not already protected)
	content = content.replace(/\$(?=\d)/g, '\\$');

	// Step 4: Restore protected LaTeX expressions (they are valid)
	content = content.replace(/<<LATEX_(\d+)>>/g, (placeholder: string, index: string) => {
		const stored = latexExpressions[Number.parseInt(index, 10)];

		if (stored === undefined) {
			// Not one of our placeholders (literal text in the input): keep it verbatim
			// instead of dereferencing a missing entry.
			return placeholder;
		}

		const match = stored.match(LATEX_LINEBREAK_REGEXP);

		if (!match) {
			return stored;
		}

		// Katex: The $$-delimiters should be in their own line
		// if there are \\-line-breaks.
		const formula = match[1];
		const prefix = formula.startsWith('\n') ? '' : '\n';
		const suffix = formula.endsWith('\n') ? '' : '\n';

		return '$$' + prefix + formula + suffix + '$$';
	});

	// Step 5: Apply additional escaping functions (brackets and mhchem)
	// This must happen BEFORE restoring code blocks to avoid affecting code content
	content = escapeBrackets(content);

	if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
		content = escapeMhchem(content);
	}

	// Step 6: Convert remaining \(...\) → $...$, \[...\] → $$...$$
	// This must happen BEFORE restoring code blocks to avoid affecting code content
	content = content
		// Using the look‑behind pattern `(?<!\\)` we skip matches
		// that are preceded by a backslash, e.g.
		// `Definitions\\(also called macros)` (title of chapter 20 in The TeXbook).
		// In the replacement string `$$` is a literal `$` and `$1` is the captured body.
		.replace(/(?<!\\)\\\((.+?)\\\)/g, '$$$1$') // inline
		.replace(
			// Using the look‑behind pattern `(?<!\\)` we skip matches
			// that are preceded by a backslash, e.g. `\\[4pt]`.
			/(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
			(_: string, formula: string) => {
				return `$$${formula}$$`;
			}
		);

	// Step 7: Restore code blocks
	// This happens AFTER all LaTeX conversions to preserve code content
	content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (placeholder: string, index: string) => {
		// Unknown numbers are literal text from the input; leave them as they are.
		return codeBlocks[Number.parseInt(index, 10)] ?? placeholder;
	});

	// Step 8: Restore blockquote markers
	if (blockquoteMarkers.size > 0) {
		const finalLines = content.split('\n');
		const restoredLines = finalLines.map((line, index) => {
			const marker = blockquoteMarkers.get(index);
			return marker ? marker + line : line;
		});
		content = restoredLines.join('\n');
	}

	return content;
}