1/**
2 * Rehype plugin to restore limited HTML elements inside Markdown table cells.
3 *
4 * ## Problem
5 * The remark/rehype pipeline neutralizes inline HTML as literal text
6 * (remarkLiteralHtml) so that XML/HTML snippets in LLM responses display
7 * as-is instead of being rendered. This causes <br> and <ul> markup in
8 * table cells to show as plain text.
9 *
10 * ## Solution
11 * This plugin traverses the HAST post-conversion, parses whitelisted HTML
12 * patterns from text nodes, and replaces them with actual HAST element nodes
13 * that will be rendered as real HTML.
14 *
15 * ## Supported HTML
16 * - `<br>` / `<br/>` / `<br />` - Line breaks (inline)
17 * - `<ul><li>...</li></ul>` - Unordered lists (block)
18 *
19 * ## Key Implementation Details
20 *
21 * ### 1. Sibling Combination (Critical)
22 * The Markdown pipeline may fragment content across multiple text nodes and `<br>`
23 * elements. For example, `<ul><li>a</li></ul>` might arrive as:
24 * - Text: `"<ul>"`
25 * - Element: `<br>`
26 * - Text: `"<li>a</li></ul>"`
27 *
28 * We must combine consecutive text nodes and `<br>` elements into a single string
29 * before attempting to parse list markup. Without this, list detection fails.
30 *
31 * ### 2. visitParents for Deep Traversal
32 * Table cell content may be wrapped in intermediate elements (e.g., `<p>` tags).
33 * Using `visitParents` instead of direct child iteration ensures we find text
34 * nodes at any depth within the cell.
35 *
 * ### 3. No-Op Detection for `<br>` Expansion
 * A text node only needs to be replaced when `<br>` expansion produced
 * something other than a single plain text node. Note that `expandBrTags()`
 * always returns freshly created nodes, so comparing the result to the
 * original node by reference cannot tell whether anything changed.
 *
 * Two cases count as a change:
 * - Multiple nodes created (text was split)
 * - Single NEW node created (original had only `<br>`, now it's an element)
 *
 * A simple `length > 1` check would miss the single `<br>` case.
45 *
46 * ### 4. Strict List Validation
47 * `parseList()` rejects malformed markup by checking for garbage text between
48 * `<li>` elements. This prevents creating broken DOM from partial matches like
49 * `<ul>garbage<li>a</li></ul>`.
50 *
51 * ### 5. Newline Substitution for `<br>` in Combined String
52 * When combining siblings, existing `<br>` elements become `\n` in the combined
53 * string. This allows list content to span visual lines while still being parsed
54 * as a single unit.
55 *
56 * @example
57 * // Input Markdown:
58 * // | Feature | Notes |
59 * // |---------|-------|
60 * // | Multi-line | First<br>Second |
61 * // | List | <ul><li>A</li><li>B</li></ul> |
62 * //
63 * // Without this plugin: <br> and <ul> render as literal text
64 * // With this plugin: <br> becomes line break, <ul> becomes actual list
65 */
66
67import type { Plugin } from 'unified';
68import type { Element, ElementContent, Root, Text } from 'hast';
69import { visit } from 'unist-util-visit';
70import { visitParents } from 'unist-util-visit-parents';
71import { BR_PATTERN, LIST_PATTERN, LI_PATTERN } from '$lib/constants/table-html-restorer';
72
/**
 * Splits a string on every `BR_PATTERN` match and returns the pieces as HAST nodes.
 *
 * Each match becomes an empty `<br>` element; every non-empty stretch of text
 * before, between and after the matches becomes a text node, in source order.
 * When nothing matches, the result is a single text node holding `value`
 * unchanged (even if `value` is the empty string).
 *
 * @param value - Raw text that may contain literal `<br>` markup.
 * @returns Newly created nodes; the array is never empty.
 */
function expandBrTags(value: string): ElementContent[] {
	const nodes: ElementContent[] = [];
	let offset = 0;

	// Empty chunks are dropped so adjacent or leading/trailing breaks do not
	// produce empty text nodes.
	const pushText = (chunk: string): void => {
		if (chunk.length > 0) {
			nodes.push({ type: 'text', value: chunk } as Text);
		}
	};

	for (const hit of value.matchAll(BR_PATTERN)) {
		const start = hit.index!;
		pushText(value.slice(offset, start));
		nodes.push({ type: 'element', tagName: 'br', properties: {}, children: [] } as Element);
		offset = start + hit[0].length;
	}

	// Every match pushes a <br>, so an empty list means there was no match at all.
	if (nodes.length === 0) {
		return [{ type: 'text', value } as Text];
	}

	pushText(value.slice(offset));
	return nodes;
}
97
/**
 * Builds a `<ul>` HAST element from a string of list markup.
 *
 * The trimmed input must match `LIST_PATTERN`; its first capture group is
 * treated as the list body. The body is then scanned with `LI_PATTERN`, and
 * each match becomes an `<li>` whose children are the `<br>`-expanded contents
 * of that match's first capture group.
 *
 * @param value - Candidate list markup.
 * @returns The `<ul>` element, or `null` when the input does not match
 *   `LIST_PATTERN`, contains no items, or has non-whitespace text before,
 *   between or after the items.
 */
function parseList(value: string): Element | null {
	const listMatch = value.trim().match(LIST_PATTERN);
	if (listMatch === null) return null;

	const inner = listMatch[1];
	const listItems: ElementContent[] = [];
	let consumed = 0;

	for (const item of inner.matchAll(LI_PATTERN)) {
		const itemStart = item.index!;

		// Anything but whitespace between the previous item and this one
		// means the markup is not a clean list.
		const gap = inner.slice(consumed, itemStart);
		if (gap.trim() !== '') return null;

		const li = {
			type: 'element',
			tagName: 'li',
			properties: {},
			children: expandBrTags(item[1] ?? '')
		} as Element;
		listItems.push(li);

		consumed = itemStart + item[0].length;
	}

	// A list needs at least one item ...
	if (listItems.length === 0) return null;

	// ... and nothing but whitespace after the last one.
	const tail = inner.slice(consumed);
	if (tail.trim() !== '') return null;

	return { type: 'element', tagName: 'ul', properties: {}, children: listItems } as Element;
}
129
/**
 * Processes a single table cell, restoring whitelisted HTML from its text content.
 *
 * For every text node found at any depth inside `cell`:
 * 1. The node and the consecutive text / `<br>` siblings that follow it are
 *    joined into one string (each `<br>` element contributes `\n`). If that
 *    string parses as a list via `parseList`, the whole run is replaced by the
 *    resulting `<ul>` element.
 * 2. Otherwise only the visited node is examined: literal `<br>` markup in its
 *    value is expanded into real `<br>` elements. A node without such markup
 *    is left in place untouched.
 *
 * Mutates the `children` arrays inside `cell` in place.
 *
 * @param cell - The table cell element whose subtree should be rewritten.
 */
function processCell(cell: Element): void {
	visitParents(cell, 'text', (textNode: Text, ancestors) => {
		const parent = ancestors[ancestors.length - 1];
		if (!parent || parent.type !== 'element') return;

		const siblings: ElementContent[] = (parent as Element).children;
		const startIndex = siblings.indexOf(textNode);
		if (startIndex === -1) return;

		// Combine consecutive text nodes and <br> elements into one string.
		// The run stops at the first sibling that is neither.
		let combined = '';
		let endIndex = startIndex;

		for (let i = startIndex; i < siblings.length; i++) {
			const sib = siblings[i];
			if (sib.type === 'text') {
				combined += sib.value;
			} else if (sib.type === 'element' && sib.tagName === 'br') {
				// Existing <br> elements become newlines in the combined string.
				combined += '\n';
			} else {
				break;
			}
			endIndex = i;
		}

		// Try parsing as list first (replaces the entire combined range).
		const list = parseList(combined);
		if (list) {
			siblings.splice(startIndex, endIndex - startIndex + 1, list);
			return;
		}

		// Otherwise, just expand <br> tags in this text node.
		//
		// expandBrTags always allocates fresh nodes, so an identity comparison
		// against `textNode` can never detect a no-op. The result differs from
		// the input only if at least one <br> element was produced; when none
		// was, keep the original node (and any extra fields it carries) instead
		// of swapping in an equivalent copy.
		const expanded = expandBrTags(textNode.value);
		if (expanded.some((node) => node.type === 'element')) {
			siblings.splice(startIndex, 1, ...expanded);
		}
	});
}
174
/**
 * Rehype plugin entry point.
 *
 * Visits every element in the tree and hands each `td` / `th` element to
 * `processCell`, which rewrites the cell's contents in place.
 */
export const rehypeRestoreTableHtml: Plugin<[], Root> = () => {
	return (tree) => {
		visit(tree, 'element', (node: Element) => {
			switch (node.tagName) {
				case 'td':
				case 'th':
					processCell(node);
					break;
				default:
					break;
			}
		});
	};
};