1/**
  2 * Rehype plugin to restore limited HTML elements inside Markdown table cells.
  3 *
  4 * ## Problem
  5 * The remark/rehype pipeline neutralizes inline HTML as literal text
  6 * (remarkLiteralHtml) so that XML/HTML snippets in LLM responses display
  7 * as-is instead of being rendered. This causes <br> and <ul> markup in
  8 * table cells to show as plain text.
  9 *
 10 * ## Solution
 11 * This plugin traverses the HAST post-conversion, parses whitelisted HTML
 12 * patterns from text nodes, and replaces them with actual HAST element nodes
 13 * that will be rendered as real HTML.
 14 *
 15 * ## Supported HTML
 16 * - `<br>` / `<br/>` / `<br />` - Line breaks (inline)
 17 * - `<ul><li>...</li></ul>` - Unordered lists (block)
 18 *
 19 * ## Key Implementation Details
 20 *
 21 * ### 1. Sibling Combination (Critical)
 22 * The Markdown pipeline may fragment content across multiple text nodes and `<br>`
 23 * elements. For example, `<ul><li>a</li></ul>` might arrive as:
 24 *   - Text: `"<ul>"`
 25 *   - Element: `<br>`
 26 *   - Text: `"<li>a</li></ul>"`
 27 *
 28 * We must combine consecutive text nodes and `<br>` elements into a single string
 29 * before attempting to parse list markup. Without this, list detection fails.
 30 *
 31 * ### 2. visitParents for Deep Traversal
 32 * Table cell content may be wrapped in intermediate elements (e.g., `<p>` tags).
 33 * Using `visitParents` instead of direct child iteration ensures we find text
 34 * nodes at any depth within the cell.
 35 *
 36 * ### 3. Reference Comparison for No-Op Detection
 37 * When checking if `<br>` expansion changed anything, we compare:
 38 *   `expanded.length !== 1 || expanded[0] !== textNode`
 39 *
 40 * This catches both cases:
 41 * - Multiple nodes created (text was split)
 42 * - Single NEW node created (original had only `<br>`, now it's an element)
 43 *
 44 * A simple `length > 1` check would miss the single `<br>` case.
 45 *
 46 * ### 4. Strict List Validation
 47 * `parseList()` rejects malformed markup by checking for garbage text between
 48 * `<li>` elements. This prevents creating broken DOM from partial matches like
 49 * `<ul>garbage<li>a</li></ul>`.
 50 *
 51 * ### 5. Newline Substitution for `<br>` in Combined String
 52 * When combining siblings, existing `<br>` elements become `\n` in the combined
 53 * string. This allows list content to span visual lines while still being parsed
 54 * as a single unit.
 55 *
 56 * @example
 57 * // Input Markdown:
 58 * // | Feature | Notes |
 59 * // |---------|-------|
 60 * // | Multi-line | First<br>Second |
 61 * // | List | <ul><li>A</li><li>B</li></ul> |
 62 * //
 63 * // Without this plugin: <br> and <ul> render as literal text
 64 * // With this plugin: <br> becomes line break, <ul> becomes actual list
 65 */
 66
 67import type { Plugin } from 'unified';
 68import type { Element, ElementContent, Root, Text } from 'hast';
 69import { visit } from 'unist-util-visit';
 70import { visitParents } from 'unist-util-visit-parents';
 71import { BR_PATTERN, LIST_PATTERN, LI_PATTERN } from '$lib/constants/table-html-restorer';
 72
 73/**
 74 * Expands text containing `<br>` tags into an array of text nodes and br elements.
 75 */
 76function expandBrTags(value: string): ElementContent[] {
 77	const matches = [...value.matchAll(BR_PATTERN)];
 78	if (!matches.length) return [{ type: 'text', value } as Text];
 79
 80	const result: ElementContent[] = [];
 81	let cursor = 0;
 82
 83	for (const m of matches) {
 84		if (m.index! > cursor) {
 85			result.push({ type: 'text', value: value.slice(cursor, m.index) } as Text);
 86		}
 87		result.push({ type: 'element', tagName: 'br', properties: {}, children: [] } as Element);
 88		cursor = m.index! + m[0].length;
 89	}
 90
 91	if (cursor < value.length) {
 92		result.push({ type: 'text', value: value.slice(cursor) } as Text);
 93	}
 94
 95	return result;
 96}
 97
 98/**
 99 * Parses a `<ul><li>...</li></ul>` string into a HAST element.
100 * Returns null if the markup is malformed or contains unexpected content.
101 */
102function parseList(value: string): Element | null {
103	const match = value.trim().match(LIST_PATTERN);
104	if (!match) return null;
105
106	const body = match[1];
107	const items: ElementContent[] = [];
108	let cursor = 0;
109
110	for (const liMatch of body.matchAll(LI_PATTERN)) {
111		// Reject if there's non-whitespace between list items
112		if (body.slice(cursor, liMatch.index!).trim()) return null;
113
114		items.push({
115			type: 'element',
116			tagName: 'li',
117			properties: {},
118			children: expandBrTags(liMatch[1] ?? '')
119		} as Element);
120
121		cursor = liMatch.index! + liMatch[0].length;
122	}
123
124	// Reject if no items found or trailing garbage exists
125	if (!items.length || body.slice(cursor).trim()) return null;
126
127	return { type: 'element', tagName: 'ul', properties: {}, children: items } as Element;
128}
129
/**
 * Processes a single table cell, restoring HTML elements from text content.
 *
 * Every text node beneath `cell` (at any depth) is handled in two steps:
 *
 * 1. The node is joined with the uninterrupted run of text nodes and `<br>`
 *    elements that follow it in the same parent (each `<br>` contributes a
 *    `\n`). If the joined string parses as a list, the whole run is replaced
 *    by the resulting `ul` element.
 * 2. Otherwise only the text node itself is inspected, and it is replaced by
 *    its `<br>`-expanded form when that expansion actually produced a `br`.
 *
 * The tree is mutated in place; nothing is returned.
 *
 * @param cell - The `td` / `th` element whose descendants should be rewritten.
 */
function processCell(cell: Element) {
	visitParents(cell, 'text', (textNode: Text, ancestors) => {
		const parent = ancestors[ancestors.length - 1];
		if (!parent || parent.type !== 'element') return;

		const parentEl = parent as Element;
		const siblings = parentEl.children as ElementContent[];
		const startIndex = siblings.indexOf(textNode as ElementContent);
		if (startIndex === -1) return;

		// Combine consecutive text nodes and <br> elements into one string
		let combined = '';
		let endIndex = startIndex;

		for (let i = startIndex; i < siblings.length; i++) {
			const sib = siblings[i];
			if (sib.type === 'text') {
				combined += (sib as Text).value;
				endIndex = i;
			} else if (sib.type === 'element' && (sib as Element).tagName === 'br') {
				combined += '\n';
				endIndex = i;
			} else {
				// Any other node ends the run.
				break;
			}
		}

		// Try parsing as list first (replaces entire combined range)
		const list = parseList(combined);
		if (list) {
			siblings.splice(startIndex, endIndex - startIndex + 1, list);
			return;
		}

		// Otherwise, just expand <br> tags in this text node.
		const expanded = expandBrTags(textNode.value);

		// expandBrTags() always builds brand-new node objects, so comparing its
		// result to `textNode` by reference can never detect the "nothing
		// changed" case. Inspect the shape of the result instead: a lone text
		// node means no <br> was found, while a lone `br` element (input was
		// exactly "<br>") or several nodes mean the text really was rewritten.
		const unchanged = expanded.length === 1 && expanded[0]?.type === 'text';
		if (!unchanged) {
			siblings.splice(startIndex, 1, ...expanded);
		}
		// When unchanged, the original node is left in the tree untouched, so
		// anything attached to it is preserved.
	});
}
174
/**
 * Rehype plugin entry point.
 *
 * Returns a transformer that visits every element in the tree and hands each
 * table cell (`td` or `th`) to `processCell`, which rewrites the cell's
 * contents in place. The plugin takes no options.
 */
export const rehypeRestoreTableHtml: Plugin<[], Root> = () => {
	return (tree) => {
		visit(tree, 'element', (node: Element) => {
			// Only table cells are of interest; leave every other element alone.
			if (node.tagName !== 'td' && node.tagName !== 'th') return;
			processCell(node);
		});
	};
};