diff options
Diffstat (limited to 'llama.cpp/tools/server/webui/src/lib/utils/latex-protection.ts')
| -rw-r--r-- | llama.cpp/tools/server/webui/src/lib/utils/latex-protection.ts | 270 |
1 files changed, 270 insertions, 0 deletions
import {
	CODE_BLOCK_REGEXP,
	LATEX_MATH_AND_CODE_PATTERN,
	LATEX_LINEBREAK_REGEXP,
	MHCHEM_PATTERN_MAP
} from '$lib/constants/latex-protection';

/**
 * Replaces inline LaTeX expressions enclosed in `$...$` with placeholders, avoiding dollar signs
 * that appear to be part of monetary values or identifiers.
 *
 * The input is processed line by line, so a `$...$` pair never spans a line break. A candidate
 * pair is left untouched when:
 * - the opening `$` is directly preceded by a letter, digit, `$`, `_` or `-` (e.g. `var$`), or
 * - the opening `$` is followed by a digit and either the closing `$` is followed by a letter,
 *   digit, `$`, `_` or `-`, or the closing `$` is preceded by a space (looks like an amount), or
 * - the pair is an empty `$$` located exactly at the current scan position.
 *
 * Every accepted expression (including both `$` delimiters) is appended to `latexExpressions`
 * and replaced by `<<LATEX_n>>`, where `n` is its index in that array.
 *
 * @param content - The input text potentially containing LaTeX expressions.
 * @param latexExpressions - Array that receives the extracted expressions (mutated in place).
 * @returns The processed string with LaTeX replaced by placeholders.
 */
export function maskInlineLaTeX(content: string, latexExpressions: string[]): string {
	if (!content.includes('$')) {
		return content;
	}

	return content
		.split('\n')
		.map((line) => {
			if (!line.includes('$')) {
				return line;
			}

			let processedLine = '';
			let currentPosition = 0;

			while (currentPosition < line.length) {
				const openDollarIndex = line.indexOf('$', currentPosition);

				if (openDollarIndex === -1) {
					processedLine += line.slice(currentPosition);
					break;
				}

				// Without a second `$` on this line there is nothing to pair with.
				const closeDollarIndex = line.indexOf('$', openDollarIndex + 1);

				if (closeDollarIndex === -1) {
					processedLine += line.slice(currentPosition);
					break;
				}

				const charBeforeOpen = openDollarIndex > 0 ? line[openDollarIndex - 1] : '';
				const charAfterOpen = line[openDollarIndex + 1];
				const charBeforeClose =
					openDollarIndex + 1 < closeDollarIndex ? line[closeDollarIndex - 1] : '';
				const charAfterClose =
					closeDollarIndex + 1 < line.length ? line[closeDollarIndex + 1] : '';

				let shouldSkipAsNonLatex = false;

				// No content between the delimiters.
				// NOTE(review): this compares against `currentPosition`, not `openDollarIndex`, so an
				// empty `$$` is only skipped when it sits exactly at the scan position; elsewhere it
				// is masked as an (empty) expression. Kept as-is because changing it alters output.
				if (closeDollarIndex === currentPosition + 1) {
					shouldSkipAsNonLatex = true;
				}

				// Letter, digit, `$`, `_` or `-` directly before the opening `$`: not TeX.
				if (/[A-Za-z0-9_$-]/.test(charBeforeOpen)) {
					shouldSkipAsNonLatex = true;
				}

				// The opening `$` seems to belong to an amount.
				if (
					/[0-9]/.test(charAfterOpen) &&
					(/[A-Za-z0-9_$-]/.test(charAfterClose) || charBeforeClose === ' ')
				) {
					shouldSkipAsNonLatex = true;
				}

				if (shouldSkipAsNonLatex) {
					// Emit everything up to and including the opening `$`, then rescan from the next
					// character so the former closing `$` can open a new pair.
					processedLine += line.slice(currentPosition, openDollarIndex + 1);
					currentPosition = openDollarIndex + 1;
					continue;
				}

				// Treat as LaTeX: store the expression and emit its placeholder.
				processedLine += line.slice(currentPosition, openDollarIndex);
				latexExpressions.push(line.slice(openDollarIndex, closeDollarIndex + 1));
				processedLine += `<<LATEX_${latexExpressions.length - 1}>>`;
				currentPosition = closeDollarIndex + 1;
			}

			return processedLine;
		})
		.join('\n');
}

/**
 * Rewrites matches of `LATEX_MATH_AND_CODE_PATTERN`: a captured code block is returned
 * unchanged, a captured square-bracket body is wrapped in `$$...$$`, and a captured
 * round-bracket body is wrapped in `$...$`. Matches with no capture are left as they are.
 */
function escapeBrackets(text: string): string {
	return text.replace(
		LATEX_MATH_AND_CODE_PATTERN,
		(
			match: string,
			codeBlock: string | undefined,
			squareBracket: string | undefined,
			roundBracket: string | undefined
		): string => {
			if (codeBlock != null) {
				return codeBlock;
			}
			if (squareBracket != null) {
				return `$$${squareBracket}$$`;
			}
			if (roundBracket != null) {
				return `$${roundBracket}$`;
			}

			return match;
		}
	);
}

/**
 * Applies every `[pattern, replacement]` pair of `MHCHEM_PATTERN_MAP` to the text, in order.
 */
function escapeMhchem(text: string): string {
	return MHCHEM_PATTERN_MAP.reduce(
		(result, [pattern, replacement]) => result.replace(pattern, replacement),
		text
	);
}

// mhchem escaping is currently switched off; `escapeMhchem` only runs when this is true.
const doEscapeMhchem = false;

/**
 * Preprocesses markdown content to safely handle LaTeX math expressions while protecting
 * against false positives (e.g., dollar amounts like $5.99) and ensuring proper rendering.
 *
 * This function:
 * - Strips leading blockquote markers (`>`) and restores them by line index at the end
 * - Protects code matched by `CODE_BLOCK_REGEXP`
 * - Safeguards block and inline LaTeX: \(...\), \[...\], $$...$$, and selective $...$
 * - Escapes remaining dollar signs before digits (e.g., $5 → \$5) to prevent misinterpretation
 * - Restores protected LaTeX and code blocks after processing
 * - Converts \(...\) → $...$ and \[...\] → $$...$$ for compatibility with math renderers
 * - Applies additional escaping for brackets and (when enabled) mhchem syntax
 *
 * Text that merely looks like a placeholder (`<<LATEX_n>>` / `<<CODE_BLOCK_n>>`) but has no
 * stored entry is passed through unchanged.
 *
 * NOTE(review): blockquote markers are restored by line index, but steps 2 and 4 can insert
 * newlines, so markers may end up on different lines when a quoted line contains a display
 * formula.
 *
 * @param content - The raw text (e.g., markdown) that may contain LaTeX or code blocks.
 * @returns The preprocessed string with properly escaped and normalized LaTeX.
 *
 * @example
 * preprocessLaTeX("Price: $10. The equation is \\(x^2\\).")
 * // → "Price: \\$10. The equation is $x^2$."
 */
export function preprocessLaTeX(content: string): string {
	// See also:
	// https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts

	// Step 0: Temporarily remove blockquote markers (>) to process LaTeX correctly.
	// The marker of each affected line is remembered by line index for step 8.
	const blockquoteMarkers: Map<number, string> = new Map();
	content = content
		.split('\n')
		.map((line, index) => {
			const match = line.match(/^(>\s*)/);
			if (match) {
				blockquoteMarkers.set(index, match[1]);
				return line.slice(match[1].length);
			}
			return line;
		})
		.join('\n');

	// Step 1: Protect code blocks
	const codeBlocks: string[] = [];

	content = content.replace(CODE_BLOCK_REGEXP, (match) => {
		codeBlocks.push(match);

		return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
	});

	// Step 2: Protect existing LaTeX expressions
	const latexExpressions: string[] = [];

	// Match \S...\[...\] (display math preceded by text), protect it and insert a line-break.
	content = content.replace(
		/([\S].*?)\\\[([\s\S]*?)\\\](.*)/g,
		(match: string, group1: string, group2: string, group3: string) => {
			if (group1.endsWith('\\')) {
				return match; // Backslash before \[, do nothing.
			}

			// Characters following the formula (display formula in a table cell?) mean it has
			// to stay on the same line, so it is converted into an inline formula instead.
			const hasSuffix = /\S/.test(group3);
			const optBreak = hasSuffix ? '' : '\n';

			if (hasSuffix) {
				latexExpressions.push(`\\(${group2.trim()}\\)`);
			} else {
				latexExpressions.push(`\\[${group2}\\]`);
			}

			return `${group1}${optBreak}<<LATEX_${latexExpressions.length - 1}>>${optBreak}${group3}`;
		}
	);

	// Match \(...\), \[...\], $$...$$ and protect them
	content = content.replace(
		/(\$\$[\s\S]*?\$\$|(?<!\\)\\\[[\s\S]*?\\\]|(?<!\\)\\\(.*?\\\))/g,
		(match) => {
			latexExpressions.push(match);

			return `<<LATEX_${latexExpressions.length - 1}>>`;
		}
	);

	// Protect inline $...$ but NOT if it looks like money (e.g., $10, $3.99)
	content = maskInlineLaTeX(content, latexExpressions);

	// Step 3: Escape standalone $ before digits (currency like $5 → \$5)
	// (Now that inline math is protected, this will only escape dollars not already protected)
	content = content.replace(/\$(?=\d)/g, '\\$');

	// Step 4: Restore protected LaTeX expressions (they are valid)
	content = content.replace(/<<LATEX_(\d+)>>/g, (placeholder: string, index: string) => {
		const stored = latexExpressions[parseInt(index, 10)];
		if (stored === undefined) {
			// Placeholder-looking text that came from the input itself: keep it verbatim
			// instead of failing on a missing entry.
			return placeholder;
		}

		const match = stored.match(LATEX_LINEBREAK_REGEXP);
		if (match) {
			// Katex: The $$-delimiters should be in their own line
			// if there are \\-line-breaks.
			const formula = match[1];
			const prefix = formula.startsWith('\n') ? '' : '\n';
			const suffix = formula.endsWith('\n') ? '' : '\n';
			return '$$' + prefix + formula + suffix + '$$';
		}

		return stored;
	});

	// Step 5: Apply additional escaping functions (brackets and mhchem)
	// This must happen BEFORE restoring code blocks to avoid affecting code content
	content = escapeBrackets(content);

	if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
		content = escapeMhchem(content);
	}

	// Step 6: Convert remaining \(...\) → $...$, \[...\] → $$...$$
	// This must happen BEFORE restoring code blocks to avoid affecting code content
	content = content
		// Using the look-behind pattern `(?<!\\)` we skip matches
		// that are preceded by a backslash, e.g.
		// `Definitions\\(also called macros)` (title of chapter 20 in The TeXbook).
		.replace(/(?<!\\)\\\((.+?)\\\)/g, '$$$1$') // inline
		.replace(
			// Using the look-behind pattern `(?<!\\)` we skip matches
			// that are preceded by a backslash, e.g. `\\[4pt]`.
			/(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
			(_, formula: string) => {
				return `$$${formula}$$`;
			}
		);

	// Step 7: Restore code blocks
	// This happens AFTER all LaTeX conversions to preserve code content
	content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (placeholder: string, index: string) => {
		const block = codeBlocks[parseInt(index, 10)];
		// Unknown index: the text was part of the input, so leave it untouched rather than
		// letting `replace` coerce `undefined` into the string "undefined".
		return block === undefined ? placeholder : block;
	});

	// Step 8: Restore blockquote markers
	if (blockquoteMarkers.size > 0) {
		content = content
			.split('\n')
			.map((line, index) => {
				const marker = blockquoteMarkers.get(index);
				return marker ? marker + line : line;
			})
			.join('\n');
	}

	return content;
}
