1import {
2 CODE_BLOCK_REGEXP,
3 LATEX_MATH_AND_CODE_PATTERN,
4 LATEX_LINEBREAK_REGEXP,
5 MHCHEM_PATTERN_MAP
6} from '$lib/constants/latex-protection';
7
/**
 * Masks inline LaTeX written as `$...$` with placeholders, while leaving alone dollar signs
 * that look like monetary amounts or parts of identifiers.
 *
 * The text is handled one line at a time, so an opening `$` is only ever paired with the next
 * `$` on the same line. A candidate pair is rejected (and scanning resumes right after the
 * opening `$`) when:
 * - nothing lies between the two dollar signs (`$$`),
 * - the opening `$` directly follows a letter, digit, `_`, `$` or `-` (e.g. `var$`), or
 * - the opening `$` is followed by a digit and the pair either ends right after a space or is
 *   directly followed by a letter, digit, `_`, `$` or `-` (e.g. `$5 and $10`).
 *
 * Every accepted `$...$` span, delimiters included, is appended to `latexExpressions` and
 * replaced by `<<LATEX_n>>`, where `n` is its index in that array.
 *
 * @param content - The input text potentially containing LaTeX expressions.
 * @param latexExpressions - Array that receives the extracted expressions; mutated in place.
 * @returns The text with accepted inline LaTeX replaced by placeholders.
 */
export function maskInlineLaTeX(content: string, latexExpressions: string[]): string {
  if (!content.includes('$')) {
    return content;
  }

  // Hoisted out of the scanning loop; neither regex is global, so `test` keeps no state.
  const wordLikeChar = /[A-Za-z0-9_$-]/;
  const digitChar = /[0-9]/;

  return content
    .split('\n')
    .map((line) => {
      if (!line.includes('$')) {
        return line;
      }

      let processedLine = '';
      let currentPosition = 0;

      while (currentPosition < line.length) {
        const openDollarIndex = line.indexOf('$', currentPosition);

        if (openDollarIndex === -1) {
          processedLine += line.slice(currentPosition);
          break;
        }

        // Is there a next $-sign?
        const closeDollarIndex = line.indexOf('$', openDollarIndex + 1);

        if (closeDollarIndex === -1) {
          processedLine += line.slice(currentPosition);
          break;
        }

        // The pair is empty when the closing `$` sits directly after the OPENING `$`.
        // (Measuring from `currentPosition` instead would miss `$$` preceded by other text.)
        const hasNoContent = closeDollarIndex === openDollarIndex + 1;

        // `charAt` yields '' for out-of-range indices, i.e. at the start or end of the line.
        const charBeforeOpen = line.charAt(openDollarIndex - 1);
        const charAfterOpen = line.charAt(openDollarIndex + 1);
        const charBeforeClose = hasNoContent ? '' : line.charAt(closeDollarIndex - 1);
        const charAfterClose = line.charAt(closeDollarIndex + 1);

        // Letter, digit, `$`, `_` or `-` right before the opening `$`: not TeX.
        const followsWordChar = wordLikeChar.test(charBeforeOpen);

        // The opening `$` seems to belong to an amount.
        const looksLikeAmount =
          digitChar.test(charAfterOpen) &&
          (wordLikeChar.test(charAfterClose) || charBeforeClose === ' ');

        if (hasNoContent || followsWordChar || looksLikeAmount) {
          // Keep the opening `$` verbatim and retry from the character after it, so the
          // former closing `$` can still open a later pair.
          processedLine += line.slice(currentPosition, openDollarIndex + 1);
          currentPosition = openDollarIndex + 1;
          continue;
        }

        // Treat as LaTeX: copy the text before it, then store and mask the `$...$` span.
        processedLine += line.slice(currentPosition, openDollarIndex);
        latexExpressions.push(line.slice(openDollarIndex, closeDollarIndex + 1));
        processedLine += `<<LATEX_${latexExpressions.length - 1}>>`;
        currentPosition = closeDollarIndex + 1;
      }

      return processedLine;
    })
    .join('\n');
}
96
/**
 * Rewrites every match of `LATEX_MATH_AND_CODE_PATTERN` according to which capture group hit:
 * - first group (`codeBlock`): emitted as captured,
 * - second group (`squareBracket`): wrapped in `$$ ... $$`,
 * - third group (`roundBracket`): wrapped in `$ ... $`.
 * A match with none of the three groups set is left as it was.
 *
 * NOTE(review): the pattern is defined in `$lib/constants/latex-protection`; presumably the
 * groups capture code spans, the body of `\[...\]` and the body of `\(...\)` — confirm there.
 */
function escapeBrackets(text: string): string {
  const rewriteMatch = (
    whole: string,
    codeBlock: string | undefined,
    squareBracket: string | undefined,
    roundBracket: string | undefined
  ): string => {
    if (codeBlock != null) {
      return codeBlock;
    }
    if (squareBracket != null) {
      return '$$' + squareBracket + '$$';
    }
    if (roundBracket != null) {
      return '$' + roundBracket + '$';
    }
    return whole;
  };

  return text.replace(LATEX_MATH_AND_CODE_PATTERN, rewriteMatch);
}
118
/**
 * Applies each `[pattern, replacement]` pair of `MHCHEM_PATTERN_MAP` to the text in order;
 * every replacement works on the output of the previous one.
 *
 * NOTE(review): the pairs are defined in `$lib/constants/latex-protection`; presumably they
 * adjust escaping of mhchem commands such as `\ce{...}` — confirm there.
 */
function escapeMhchem(text: string): string {
  let escaped = text;

  for (const [pattern, replacement] of MHCHEM_PATTERN_MAP) {
    escaped = escaped.replace(pattern, replacement);
  }

  return escaped;
}
125
// Switch for the mhchem escaping pass in `preprocessLaTeX`. While this is `false`,
// `escapeMhchem` is never called.
const doEscapeMhchem = false;

/**
 * Preprocesses markdown content so that LaTeX math renders correctly while dollar amounts
 * (e.g. `$5.99`) are not mistaken for math.
 *
 * The function:
 * - strips leading blockquote markers (`>`) and puts them back at the end,
 * - protects everything matched by `CODE_BLOCK_REGEXP` behind `<<CODE_BLOCK_n>>` placeholders,
 * - protects `\(...\)`, `\[...\]`, `$$...$$` and selected `$...$` behind `<<LATEX_n>>` placeholders,
 * - escapes every remaining `$` that is directly followed by a digit (`$5` → `\$5`),
 * - restores the protected LaTeX, then converts `\(...\)` → `$...$` and `\[...\]` → `$$...$$`,
 * - restores the protected code last, so none of the conversions touch it.
 *
 * Placeholder-shaped text in the input that does not correspond to a stored expression or
 * code block is left untouched.
 *
 * @param content - The raw text (e.g., markdown) that may contain LaTeX or code blocks.
 * @returns The preprocessed string with escaped and normalized LaTeX.
 *
 * @example
 * preprocessLaTeX("Price: $10. The equation is \\(x^2\\).")
 * // → "Price: \\$10. The equation is $x^2$."
 */
export function preprocessLaTeX(content: string): string {
  // See also:
  // https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts

  // Step 0: Temporarily remove blockquote markers (>) to process LaTeX correctly.
  // The marker of each line is remembered by line index so it can be restored later.
  // NOTE(review): Steps 2 and 4 can insert '\n' around display formulas, which shifts line
  // indices; markers of later lines may then be restored on the wrong line.
  const blockquoteMarkers = new Map<number, string>();
  content = content
    .split('\n')
    .map((line, index) => {
      const match = line.match(/^(>\s*)/);
      if (match) {
        blockquoteMarkers.set(index, match[1]);
        return line.slice(match[1].length);
      }
      return line;
    })
    .join('\n');

  // Step 1: Protect code blocks
  const codeBlocks: string[] = [];

  content = content.replace(CODE_BLOCK_REGEXP, (match) => {
    codeBlocks.push(match);

    return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
  });

  // Step 2: Protect existing LaTeX expressions
  const latexExpressions: string[] = [];

  // Match \S...\[...\] (a display formula preceded by text on its line), protect it and
  // insert line breaks around it.
  content = content.replace(
    /([\S].*?)\\\[([\s\S]*?)\\\](.*)/g,
    (match: string, group1: string, group2: string, group3: string): string => {
      if (group1.endsWith('\\')) {
        return match; // Backslash before \[, do nothing.
      }

      // Characters following the formula (display formula in a table cell?): keep everything
      // on one line by converting the formula into inline form.
      const hasSuffix = /\S/.test(group3);
      const optBreak = hasSuffix ? '' : '\n';

      latexExpressions.push(hasSuffix ? `\\(${group2.trim()}\\)` : `\\[${group2}\\]`);

      return `${group1}${optBreak}<<LATEX_${latexExpressions.length - 1}>>${optBreak}${group3}`;
    }
  );

  // Match \(...\), \[...\], $$...$$ and protect them
  content = content.replace(
    /(\$\$[\s\S]*?\$\$|(?<!\\)\\\[[\s\S]*?\\\]|(?<!\\)\\\(.*?\\\))/g,
    (match) => {
      latexExpressions.push(match);

      return `<<LATEX_${latexExpressions.length - 1}>>`;
    }
  );

  // Protect inline $...$ but NOT if it looks like money (e.g., $10, $3.99)
  content = maskInlineLaTeX(content, latexExpressions);

  // Step 3: Escape standalone $ before digits (currency like $5 → \$5)
  // (Now that inline math is protected, this will only escape dollars not already protected)
  content = content.replace(/\$(?=\d)/g, '\\$');

  // Step 4: Restore protected LaTeX expressions (they are valid)
  content = content.replace(/<<LATEX_(\d+)>>/g, (placeholder: string, index: string) => {
    const stored = latexExpressions[Number.parseInt(index, 10)];
    if (stored === undefined) {
      // Placeholder-shaped text that came from the input itself: nothing to restore.
      return placeholder;
    }

    const lineBreakMatch = stored.match(LATEX_LINEBREAK_REGEXP);
    if (!lineBreakMatch) {
      return stored;
    }

    // Katex: The $$-delimiters should be in their own line
    // if there are \\-line-breaks.
    const formula = lineBreakMatch[1];
    const prefix = formula.startsWith('\n') ? '' : '\n';
    const suffix = formula.endsWith('\n') ? '' : '\n';

    return '$$' + prefix + formula + suffix + '$$';
  });

  // Step 5: Apply additional escaping functions (brackets and mhchem)
  // This must happen BEFORE restoring code blocks to avoid affecting code content
  content = escapeBrackets(content);

  if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
    content = escapeMhchem(content);
  }

  // Step 6: Convert remaining \(...\) → $...$, \[...\] → $$...$$
  // This must happen BEFORE restoring code blocks to avoid affecting code content
  content = content
    // Using the look-behind pattern `(?<!\\)` we skip matches
    // that are preceded by a backslash, e.g.
    // `Definitions\\(also called macros)` (title of chapter 20 in The TeXbook).
    // In the replacement string `$$` emits one literal `$` and `$1` the captured formula.
    .replace(/(?<!\\)\\\((.+?)\\\)/g, '$$$1$') // inline
    .replace(
      // Using the look-behind pattern `(?<!\\)` we skip matches
      // that are preceded by a backslash, e.g. `\\[4pt]`.
      /(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
      (_, formula: string) => {
        return `$$${formula}$$`;
      }
    );

  // Step 7: Restore code blocks
  // This happens AFTER all LaTeX conversions to preserve code content
  content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (placeholder: string, index: string) => {
    // Unknown indices can only come from the input text itself; keep them verbatim instead
    // of letting `replace` stringify `undefined`.
    return codeBlocks[Number.parseInt(index, 10)] ?? placeholder;
  });

  // Step 8: Restore blockquote markers
  if (blockquoteMarkers.size > 0) {
    content = content
      .split('\n')
      .map((line, index) => {
        const marker = blockquoteMarkers.get(index);
        return marker === undefined ? line : marker + line;
      })
      .join('\n');
  }

  return content;
}