1package parser
2
3import (
4 "bytes"
5 "regexp"
6 "strings"
7
8 "github.com/yuin/goldmark/ast"
9 "github.com/yuin/goldmark/text"
10 "github.com/yuin/goldmark/util"
11)
12
// allowedBlockTags is the set of tag names for which Open creates an
// ast.HTMLBlockType6 node. Keys are lower-case, so callers must lower-case
// the tag name before the lookup.
var allowedBlockTags = map[string]bool{
	"address": true, "article": true, "aside": true,
	"base": true, "basefont": true, "blockquote": true, "body": true,
	"caption": true, "center": true, "col": true, "colgroup": true,
	"dd": true, "details": true, "dialog": true, "dir": true, "div": true, "dl": true, "dt": true,
	"fieldset": true, "figcaption": true, "figure": true, "footer": true,
	"form": true, "frame": true, "frameset": true,
	"h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true,
	"head": true, "header": true, "hr": true, "html": true,
	"iframe": true,
	"legend": true, "li": true, "link": true,
	"main": true, "menu": true, "menuitem": true, "meta": true,
	"nav": true, "noframes": true,
	"ol": true, "optgroup": true, "option": true,
	"p": true, "param": true,
	"search": true, "section": true, "summary": true,
	"table": true, "tbody": true, "td": true, "tfoot": true, "th": true,
	"thead": true, "title": true, "tr": true, "track": true,
	"ul": true,
}
78
// htmlBlockType1OpenRegexp matches a line that opens a type 1 HTML block: up
// to three spaces of indentation followed by a <script>, <pre>, <style> or
// <textarea> start tag (case-insensitive).
var htmlBlockType1OpenRegexp = regexp.MustCompile(`(?i)^[ ]{0,3}<(script|pre|style|textarea)(?:\s.*|>.*|/>.*|)(?:\r\n|\n)?$`) //nolint:golint,lll

// htmlBlockType1CloseRegexp matches a line containing the closing tag that
// ends a type 1 HTML block.
var htmlBlockType1CloseRegexp = regexp.MustCompile(`(?i)^.*</(?:script|pre|style|textarea)>.*`)

// htmlBlockType2OpenRegexp matches the start of an HTML comment ("<!--").
var htmlBlockType2OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<!\-\-`)

// htmlBlockType2Close is the byte sequence that ends a type 2 block.
var htmlBlockType2Close = []byte{'-', '-', '>'}

// htmlBlockType3OpenRegexp matches the start of a processing instruction ("<?").
var htmlBlockType3OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\?`)

// htmlBlockType3Close is the byte sequence that ends a type 3 block.
var htmlBlockType3Close = []byte{'?', '>'}

// htmlBlockType4OpenRegexp matches the start of a declaration: "<!" followed
// by an ASCII letter. Lower-case letters are accepted as well as upper-case
// ones, so "<!doctype html>" opens a block just like "<!DOCTYPE html>".
var htmlBlockType4OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<![A-Za-z]+.*(?:\r\n|\n)?$`)

// htmlBlockType4Close is the byte sequence that ends a type 4 block.
var htmlBlockType4Close = []byte{'>'}

// htmlBlockType5OpenRegexp matches the start of a CDATA section ("<![CDATA[").
var htmlBlockType5OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\!\[CDATA\[`)

// htmlBlockType5Close is the byte sequence that ends a type 5 block.
var htmlBlockType5Close = []byte{']', ']', '>'}

// htmlBlockType6Regexp matches a line that begins with an open or closing
// tag whose name is followed by a space, a tab, ">", "/>" or the end of the
// line. Capture group 1 is the tag name; the caller decides whether that
// name is one that may open a type 6 block.
var htmlBlockType6Regexp = regexp.MustCompile(`^[ ]{0,3}<(?:/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(?:[ \t].*|>.*|/>.*|)(?:\r\n|\n)?$`) //nolint:golint,lll
95
96var htmlBlockType7Regexp = regexp.MustCompile(`^[ ]{0,3}<(/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(` + attributePattern + `*)[ ]*(?:>|/>)[ ]*(?:\r\n|\n)?$`) //nolint:golint,lll
97
// htmlBlockParser is the block parser for raw HTML blocks. It has no fields.
type htmlBlockParser struct{}

// defaultHTMLBlockParser is the package-level instance returned by
// NewHTMLBlockParser.
var defaultHTMLBlockParser = new(htmlBlockParser)
102
// NewHTMLBlockParser returns a BlockParser that can parse HTML blocks.
// Every call returns the same package-level instance rather than allocating
// a new parser.
func NewHTMLBlockParser() BlockParser {
	return defaultHTMLBlockParser
}
108
109func (b *htmlBlockParser) Trigger() []byte {
110 return []byte{'<'}
111}
112
113func (b *htmlBlockParser) Open(parent ast.Node, reader text.Reader, pc Context) (ast.Node, State) {
114 var node *ast.HTMLBlock
115 line, segment := reader.PeekLine()
116 last := pc.LastOpenedBlock().Node
117
118 if m := htmlBlockType1OpenRegexp.FindSubmatchIndex(line); m != nil {
119 node = ast.NewHTMLBlock(ast.HTMLBlockType1)
120 } else if htmlBlockType2OpenRegexp.Match(line) {
121 node = ast.NewHTMLBlock(ast.HTMLBlockType2)
122 } else if htmlBlockType3OpenRegexp.Match(line) {
123 node = ast.NewHTMLBlock(ast.HTMLBlockType3)
124 } else if htmlBlockType4OpenRegexp.Match(line) {
125 node = ast.NewHTMLBlock(ast.HTMLBlockType4)
126 } else if htmlBlockType5OpenRegexp.Match(line) {
127 node = ast.NewHTMLBlock(ast.HTMLBlockType5)
128 } else if match := htmlBlockType7Regexp.FindSubmatchIndex(line); match != nil {
129 isCloseTag := match[2] > -1 && bytes.Equal(line[match[2]:match[3]], []byte("/"))
130 hasAttr := match[6] != match[7]
131 tagName := strings.ToLower(string(line[match[4]:match[5]]))
132 _, ok := allowedBlockTags[tagName]
133 if ok {
134 node = ast.NewHTMLBlock(ast.HTMLBlockType6)
135 } else if tagName != "script" && tagName != "style" &&
136 tagName != "pre" && !ast.IsParagraph(last) && !(isCloseTag && hasAttr) { // type 7 can not interrupt paragraph
137 node = ast.NewHTMLBlock(ast.HTMLBlockType7)
138 }
139 }
140 if node == nil {
141 if match := htmlBlockType6Regexp.FindSubmatchIndex(line); match != nil {
142 tagName := string(line[match[2]:match[3]])
143 _, ok := allowedBlockTags[strings.ToLower(tagName)]
144 if ok {
145 node = ast.NewHTMLBlock(ast.HTMLBlockType6)
146 }
147 }
148 }
149 if node != nil {
150 reader.AdvanceToEOL()
151 node.Lines().Append(segment)
152 return node, NoChildren
153 }
154 return nil, NoChildren
155}
156
157func (b *htmlBlockParser) Continue(node ast.Node, reader text.Reader, pc Context) State {
158 htmlBlock := node.(*ast.HTMLBlock)
159 lines := htmlBlock.Lines()
160 line, segment := reader.PeekLine()
161 var closurePattern []byte
162
163 switch htmlBlock.HTMLBlockType {
164 case ast.HTMLBlockType1:
165 if lines.Len() == 1 {
166 firstLine := lines.At(0)
167 if htmlBlockType1CloseRegexp.Match(firstLine.Value(reader.Source())) {
168 return Close
169 }
170 }
171 if htmlBlockType1CloseRegexp.Match(line) {
172 htmlBlock.ClosureLine = segment
173 reader.AdvanceToEOL()
174 return Close
175 }
176 case ast.HTMLBlockType2:
177 closurePattern = htmlBlockType2Close
178 fallthrough
179 case ast.HTMLBlockType3:
180 if closurePattern == nil {
181 closurePattern = htmlBlockType3Close
182 }
183 fallthrough
184 case ast.HTMLBlockType4:
185 if closurePattern == nil {
186 closurePattern = htmlBlockType4Close
187 }
188 fallthrough
189 case ast.HTMLBlockType5:
190 if closurePattern == nil {
191 closurePattern = htmlBlockType5Close
192 }
193
194 if lines.Len() == 1 {
195 firstLine := lines.At(0)
196 if bytes.Contains(firstLine.Value(reader.Source()), closurePattern) {
197 return Close
198 }
199 }
200 if bytes.Contains(line, closurePattern) {
201 htmlBlock.ClosureLine = segment
202 reader.AdvanceToEOL()
203 return Close
204 }
205
206 case ast.HTMLBlockType6, ast.HTMLBlockType7:
207 if util.IsBlank(line) {
208 return Close
209 }
210 }
211 node.Lines().Append(segment)
212 reader.AdvanceToEOL()
213 return Continue | NoChildren
214}
215
// Close is a no-op: this parser performs no work when an HTML block is
// closed.
func (b *htmlBlockParser) Close(node ast.Node, reader text.Reader, pc Context) {
	// nothing to do
}
219
// CanInterruptParagraph always returns true. Open itself still refuses to
// start a type 7 block when the last opened block is a paragraph.
func (b *htmlBlockParser) CanInterruptParagraph() bool {
	return true
}
223
// CanAcceptIndentedLine always returns false.
// NOTE(review): presumably this tells the block-parsing driver not to offer
// this parser lines indented far enough to be code; the opening patterns in
// this file allow at most three leading spaces. Confirm against the
// BlockParser interface documentation.
func (b *htmlBlockParser) CanAcceptIndentedLine() bool {
	return false
}