1// Package html minifies HTML5 following the specifications at http://www.w3.org/TR/html5/syntax.html.
  2package html
  3
  4import (
  5	"bytes"
  6	"io"
  7
  8	"github.com/tdewolff/minify/v2"
  9	"github.com/tdewolff/parse/v2"
 10	"github.com/tdewolff/parse/v2/buffer"
 11	"github.com/tdewolff/parse/v2/html"
 12)
 13
 14var (
 15	gtBytes         = []byte(">")
 16	isBytes         = []byte("=")
 17	spaceBytes      = []byte(" ")
 18	doctypeBytes    = []byte("<!doctype html>")
 19	jsMimeBytes     = []byte("application/javascript")
 20	cssMimeBytes    = []byte("text/css")
 21	htmlMimeBytes   = []byte("text/html")
 22	svgMimeBytes    = []byte("image/svg+xml")
 23	formMimeBytes   = []byte("application/x-www-form-urlencoded")
 24	mathMimeBytes   = []byte("application/mathml+xml")
 25	dataSchemeBytes = []byte("data:")
 26	jsSchemeBytes   = []byte("javascript:")
 27	httpBytes       = []byte("http")
 28	radioBytes      = []byte("radio")
 29	onBytes         = []byte("on")
 30	textBytes       = []byte("text")
 31	noneBytes       = []byte("none")
 32	submitBytes     = []byte("submit")
 33	allBytes        = []byte("all")
 34	rectBytes       = []byte("rect")
 35	dataBytes       = []byte("data")
 36	getBytes        = []byte("get")
 37	autoBytes       = []byte("auto")
 38	oneBytes        = []byte("one")
 39	inlineParams    = map[string]string{"inline": "1"}
 40)
 41
 42////////////////////////////////////////////////////////////////
 43
// Minifier is an HTML minifier.
//
// Every field switches off one minification performed by the Minify method;
// with the zero value all of them are applied.
type Minifier struct {
	KeepComments            bool // write every comment token unchanged
	KeepConditionalComments bool // keep "[if ...]"/"[endif]" comments; downlevel-hidden content is still minified
	KeepDefaultAttrVals     bool // keep attributes whose value equals the default (e.g. method=get)
	KeepDocumentTags        bool // keep attribute-less html, head and body tags
	KeepEndTags             bool // keep end tags that are otherwise omitted (li, td, p, optgroup, ...)
	KeepQuotes              bool // forwarded to html.EscapeAttrVal when attribute values are written
	KeepWhitespace          bool // keep the space before a tag, and after start tags and phrasing end tags
}
 54
 55// Minify minifies HTML data, it reads from r and writes to w.
 56func Minify(m *minify.M, w io.Writer, r io.Reader, params map[string]string) error {
 57	return (&Minifier{}).Minify(m, w, r, params)
 58}
 59
// Minify minifies HTML data, it reads from r and writes to w.
//
// The input is lexed into tokens which are processed one at a time through a
// look-ahead token buffer. Embedded SVG and MathML, the content of style,
// script and iframe elements, style attributes and on* attributes are handed
// to m.MinifyMimetype; when that reports minify.ErrNotExist the data is
// written unchanged. The params argument is ignored.
//
// Results of the individual w.Write calls are discarded; only the zero-length
// write issued at the end of the input is checked for an error. The method
// returns nil when the lexer stops with io.EOF, otherwise the lexer error or
// the error of a nested minifier with its position updated.
func (o *Minifier) Minify(m *minify.M, w io.Writer, r io.Reader, _ map[string]string) error {
	// hash and type attribute of the raw-text element (style, script, ...)
	// whose start tag was seen last; zero when not inside such an element
	var rawTagHash Hash
	var rawTagMediatype []byte

	omitSpace := true // if true the next leading space is omitted
	inPre := false

	// scratch buffers reused for every attribute value
	attrMinifyBuffer := buffer.NewWriter(make([]byte, 0, 64))
	attrByteBuffer := make([]byte, 0, 64)

	z := parse.NewInput(r)
	defer z.Restore()

	l := html.NewLexer(z)
	tb := NewTokenBuffer(z, l)
	for {
		// dereference to get a local copy, t.Data is re-sliced below
		t := *tb.Shift()
		switch t.TokenType {
		case html.ErrorToken:
			// zero-length write, used only to obtain an error from w
			// NOTE(review): presumably w remembers an earlier failed write — confirm
			if _, err := w.Write(nil); err != nil {
				return err
			}
			if l.Err() == io.EOF {
				return nil
			}
			return l.Err()
		case html.DoctypeToken:
			// any doctype is replaced by the short HTML5 doctype
			w.Write(doctypeBytes)
		case html.CommentToken:
			// comments are dropped unless one of the branches below writes them
			if o.KeepComments {
				w.Write(t.Data)
			} else if o.KeepConditionalComments && 6 < len(t.Text) && (bytes.HasPrefix(t.Text, []byte("[if ")) || bytes.HasSuffix(t.Text, []byte("[endif]")) || bytes.HasSuffix(t.Text, []byte("[endif]--"))) {
				// [if ...] is always 7 or more characters, [endif] is only encountered for downlevel-revealed
				// see https://msdn.microsoft.com/en-us/library/ms537512(v=vs.85).aspx#syntax
				if bytes.HasPrefix(t.Data, []byte("<!--[if ")) && bytes.HasSuffix(t.Data, []byte("<![endif]-->")) { // downlevel-hidden
					// keep the opening "<!--[if ...]>" and closing "<![endif]-->"
					// verbatim and minify the HTML in between recursively
					begin := bytes.IndexByte(t.Data, '>') + 1
					end := len(t.Data) - len("<![endif]-->")
					w.Write(t.Data[:begin])
					if err := o.Minify(m, w, buffer.NewReader(t.Data[begin:end]), nil); err != nil {
						return minify.UpdateErrorPosition(err, z, t.Offset)
					}
					w.Write(t.Data[end:])
				} else {
					w.Write(t.Data) // downlevel-revealed or short downlevel-hidden
				}
			} else if 1 < len(t.Text) && t.Text[0] == '#' {
				// SSI tags
				w.Write(t.Data)
			}
		case html.SvgToken:
			// delegate to the SVG minifier, write unchanged if none is registered
			if err := m.MinifyMimetype(svgMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil {
				if err != minify.ErrNotExist {
					return minify.UpdateErrorPosition(err, z, t.Offset)
				}
				w.Write(t.Data)
			}
		case html.MathToken:
			// delegate to the MathML minifier, write unchanged if none is registered
			if err := m.MinifyMimetype(mathMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil {
				if err != minify.ErrNotExist {
					return minify.UpdateErrorPosition(err, z, t.Offset)
				}
				w.Write(t.Data)
			}
		case html.TextToken:
			// CSS and JS minifiers for inline code
			if rawTagHash != 0 {
				if rawTagHash == Style || rawTagHash == Script || rawTagHash == Iframe {
					// pick the mimetype: iframe content is HTML, otherwise an
					// explicit type attribute wins over the element's default
					var mimetype []byte
					var params map[string]string
					if rawTagHash == Iframe {
						mimetype = htmlMimeBytes
					} else if len(rawTagMediatype) > 0 {
						mimetype, params = parse.Mediatype(rawTagMediatype)
					} else if rawTagHash == Script {
						mimetype = jsMimeBytes
					} else if rawTagHash == Style {
						mimetype = cssMimeBytes
					}
					if err := m.MinifyMimetype(mimetype, w, buffer.NewReader(t.Data), params); err != nil {
						if err != minify.ErrNotExist {
							return minify.UpdateErrorPosition(err, z, t.Offset)
						}
						w.Write(t.Data)
					}
				} else {
					// other raw-text elements are copied verbatim
					w.Write(t.Data)
				}
			} else if inPre {
				// text inside <pre> is copied verbatim
				w.Write(t.Data)
			} else {
				t.Data = parse.ReplaceMultipleWhitespaceAndEntities(t.Data, EntitiesMap, TextRevEntitiesMap)

				// whitespace removal; trim left
				// NOTE(review): t.Data[0] is read without a length check, this assumes
				// a text token is never empty at this point — confirm
				if omitSpace && parse.IsWhitespace(t.Data[0]) {
					t.Data = t.Data[1:]
				}

				// whitespace removal; trim right
				omitSpace = false
				if len(t.Data) == 0 {
					omitSpace = true
				} else if parse.IsWhitespace(t.Data[len(t.Data)-1]) {
					// the text ends in a space: the next text may not start with
					// one, unless the look-ahead below removes this trailing space
					omitSpace = true
					i := 0
					for {
						next := tb.Peek(i)
						// trim if EOF, text token with leading whitespace or block token
						if next.TokenType == html.ErrorToken {
							t.Data = t.Data[:len(t.Data)-1]
							omitSpace = false
							break
						} else if next.TokenType == html.TextToken {
							// this only happens when a comment, doctype or phrasing end tag (only for !o.KeepWhitespace) was in between
							// remove if the text token starts with a whitespace
							if len(next.Data) > 0 && parse.IsWhitespace(next.Data[0]) {
								t.Data = t.Data[:len(t.Data)-1]
								omitSpace = false
							}
							break
						} else if next.TokenType == html.StartTagToken || next.TokenType == html.EndTagToken {
							if o.KeepWhitespace {
								break
							}
							// remove when followed up by a block tag
							if next.Traits&nonPhrasingTag != 0 {
								t.Data = t.Data[:len(t.Data)-1]
								omitSpace = false
								break
							} else if next.TokenType == html.StartTagToken {
								break
							}
							// a phrasing end tag falls through: keep looking further ahead
						}
						i++
					}
				}

				w.Write(t.Data)
			}
		case html.StartTagToken, html.EndTagToken:
			rawTagHash = 0
			hasAttributes := false
			if t.TokenType == html.StartTagToken {
				if next := tb.Peek(0); next.TokenType == html.AttributeToken {
					hasAttributes = true
				}
				if t.Traits&rawTag != 0 {
					// ignore empty script and style tags
					if !hasAttributes && (t.Hash == Script || t.Hash == Style) {
						if next := tb.Peek(1); next.TokenType == html.EndTagToken {
							// consume the two tokens that follow the start tag,
							// the second being the end tag, and write nothing
							tb.Shift()
							tb.Shift()
							break
						}
					}
					rawTagHash = t.Hash
					rawTagMediatype = nil

					// do not minify content of <style amp-boilerplate>
					if hasAttributes && t.Hash == Style {
						if attrs := tb.Attributes(Amp_Boilerplate); attrs[0] != nil {
							rawTagHash = 0
						}
					}
				}
			} else if t.Hash == Template {
				omitSpace = true // EndTagToken
			}

			// track whether we are inside <pre>: set on its start tag, cleared on its end tag
			if t.Hash == Pre {
				inPre = t.TokenType == html.StartTagToken
			}

			// remove superfluous tags, except for html, head and body tags when KeepDocumentTags is set
			if !hasAttributes && (!o.KeepDocumentTags && (t.Hash == Html || t.Hash == Head || t.Hash == Body) || t.Hash == Colgroup) {
				break
			} else if t.TokenType == html.EndTagToken {
				omitEndTag := false
				if !o.KeepEndTags {
					if t.Hash == Thead || t.Hash == Tbody || t.Hash == Tfoot || t.Hash == Tr || t.Hash == Th ||
						t.Hash == Td || t.Hash == Option || t.Hash == Dd || t.Hash == Dt || t.Hash == Li ||
						t.Hash == Rb || t.Hash == Rt || t.Hash == Rtc || t.Hash == Rp {
						omitEndTag = true // omit end tags
					} else if t.Hash == P {
						// </p> is omitted depending on the first token after it
						// that is not whitespace-only text
						i := 0
						for {
							next := tb.Peek(i)
							i++
							// continue if text token is empty or whitespace
							if next.TokenType == html.TextToken && parse.IsAllWhitespace(next.Data) {
								continue
							}
							if next.TokenType == html.ErrorToken || next.TokenType == html.EndTagToken && next.Traits&keepPTag == 0 || next.TokenType == html.StartTagToken && next.Traits&omitPTag != 0 {
								omitEndTag = true // omit p end tag
							}
							break
						}
					} else if t.Hash == Optgroup {
						// </optgroup> is omitted unless the first non-text token
						// after it has the option hash
						i := 0
						for {
							next := tb.Peek(i)
							i++
							// continue if text token
							if next.TokenType == html.TextToken {
								continue
							}
							if next.TokenType == html.ErrorToken || next.Hash != Option {
								omitEndTag = true // omit optgroup end tag
							}
							break
						}
					}
				}

				if t.Traits&nonPhrasingTag != 0 {
					omitSpace = true // omit spaces after block elements
				} else if o.KeepWhitespace || t.Traits&objectTag != 0 {
					omitSpace = false
				}

				if !omitEndTag {
					// cut the end tag down to 3+len(t.Text) bytes ending in '>',
					// i.e. "</name>" assuming t.Data starts with "</" followed by t.Text
					if len(t.Data) > 3+len(t.Text) {
						t.Data[2+len(t.Text)] = '>'
						t.Data = t.Data[:3+len(t.Text)]
					}
					w.Write(t.Data)
				}

				// skip text that directly follows an option or optgroup end tag
				if t.Hash == Option || t.Hash == Optgroup {
					if next := tb.Peek(0); next.TokenType == html.TextToken {
						tb.Shift()
					}
				}
				break
			}

			// only start tags reach this point
			if o.KeepWhitespace || t.Traits&objectTag != 0 {
				omitSpace = false
			} else if t.Traits&nonPhrasingTag != 0 {
				omitSpace = true // omit spaces after block elements
			}

			// write the start tag data; attributes and the closing '>' follow below
			w.Write(t.Data)

			if hasAttributes {
				// tag specific rewrites of the buffered attributes, performed before
				// they are written; setting an attribute's Text to nil removes it
				if t.Hash == Meta {
					attrs := tb.Attributes(Content, Http_Equiv, Charset, Name)
					if content := attrs[0]; content != nil {
						if httpEquiv := attrs[1]; httpEquiv != nil {
							httpEquiv.AttrVal = parse.TrimWhitespace(httpEquiv.AttrVal)
							if charset := attrs[2]; charset == nil && parse.EqualFold(httpEquiv.AttrVal, []byte("content-type")) {
								content.AttrVal = minify.Mediatype(content.AttrVal)
								// http-equiv=content-type with content "text/html;charset=utf-8"
								// is rewritten to the shorter charset=utf-8
								if bytes.Equal(content.AttrVal, []byte("text/html;charset=utf-8")) {
									httpEquiv.Text = nil
									content.Text = []byte("charset")
									content.Hash = Charset
									content.AttrVal = []byte("utf-8")
								}
							}
						}
						if name := attrs[3]; name != nil {
							name.AttrVal = parse.TrimWhitespace(name.AttrVal)
							if parse.EqualFold(name.AttrVal, []byte("keywords")) {
								// drop the space after each comma
								content.AttrVal = bytes.ReplaceAll(content.AttrVal, []byte(", "), []byte(","))
							} else if parse.EqualFold(name.AttrVal, []byte("viewport")) {
								// drop all spaces, then shorten each number that follows a '='
								content.AttrVal = bytes.ReplaceAll(content.AttrVal, []byte(" "), []byte(""))
								for i := 0; i < len(content.AttrVal); i++ {
									if content.AttrVal[i] == '=' && i+2 < len(content.AttrVal) {
										i++
										if n := parse.Number(content.AttrVal[i:]); n > 0 {
											minNum := minify.Number(content.AttrVal[i:i+n], -1)
											if len(minNum) < n {
												// overwrite the number in place and shift the
												// remainder of the value to the left
												copy(content.AttrVal[i:i+len(minNum)], minNum)
												copy(content.AttrVal[i+len(minNum):], content.AttrVal[i+n:])
												content.AttrVal = content.AttrVal[:len(content.AttrVal)+len(minNum)-n]
											}
											i += len(minNum)
										}
										i-- // mitigate for-loop increase
									}
								}
							}
						}
					}
				} else if t.Hash == Script {
					// remove charset when src is also present
					attrs := tb.Attributes(Src, Charset)
					if attrs[0] != nil && attrs[1] != nil {
						attrs[1].Text = nil
					}
				} else if t.Hash == Input {
					attrs := tb.Attributes(Type, Value)
					// within this if statement t shadows the tag token and is the type attribute
					if t, value := attrs[0], attrs[1]; t != nil && value != nil {
						// remove an empty value for non-radio inputs and value=on for radio inputs
						isRadio := parse.EqualFold(t.AttrVal, radioBytes)
						if !isRadio && len(value.AttrVal) == 0 {
							value.Text = nil
						} else if isRadio && parse.EqualFold(value.AttrVal, onBytes) {
							value.Text = nil
						}
					}
				} else if t.Hash == A {
					// remove name when it is byte-equal to id
					attrs := tb.Attributes(Id, Name)
					if id, name := attrs[0], attrs[1]; id != nil && name != nil {
						if bytes.Equal(id.AttrVal, name.AttrVal) {
							name.Text = nil
						}
					}
				}

				// write attributes
				for {
					// the first token that is not an attribute ends the loop and is consumed by it
					attr := *tb.Shift()
					if attr.TokenType != html.AttributeToken {
						break
					} else if attr.Text == nil {
						continue // removed attribute
					}

					val := attr.AttrVal
					if attr.Traits&trimAttr != 0 {
						val = parse.ReplaceMultipleWhitespaceAndEntities(val, EntitiesMap, nil)
						val = parse.TrimWhitespace(val)
					} else {
						val = parse.ReplaceEntities(val, EntitiesMap, nil)
					}
					// the value rewrites below apply only to tags that have a trait bit set
					// NOTE(review): presumably this excludes unknown/custom elements — confirm
					if t.Traits != 0 {
						if len(val) == 0 && (attr.Hash == Class ||
							attr.Hash == Dir ||
							attr.Hash == Id ||
							attr.Hash == Name ||
							attr.Hash == Action && t.Hash == Form) {
							continue // omit empty attribute values
						}
						if attr.Traits&caselessAttr != 0 {
							val = parse.ToLower(val)
						}
						// remember the type attribute of a raw-text element, it selects
						// the minifier for the element's content in the text token case
						if rawTagHash != 0 && attr.Hash == Type {
							rawTagMediatype = parse.Copy(val)
						}

						if attr.Hash == Enctype || attr.Hash == Codetype || attr.Hash == Accept || attr.Hash == Type && (t.Hash == A || t.Hash == Link || t.Hash == Embed || t.Hash == Object || t.Hash == Source || t.Hash == Script || t.Hash == Style) {
							val = minify.Mediatype(val)
						}

						// default attribute values can be omitted
						if !o.KeepDefaultAttrVals && (attr.Hash == Type && (t.Hash == Script && jsMimetypes[string(val)] ||
							t.Hash == Style && bytes.Equal(val, cssMimeBytes) ||
							t.Hash == Link && bytes.Equal(val, cssMimeBytes) ||
							t.Hash == Input && bytes.Equal(val, textBytes) ||
							t.Hash == Button && bytes.Equal(val, submitBytes)) ||
							attr.Hash == Language && t.Hash == Script ||
							attr.Hash == Method && bytes.Equal(val, getBytes) ||
							attr.Hash == Enctype && bytes.Equal(val, formMimeBytes) ||
							attr.Hash == Colspan && bytes.Equal(val, oneBytes) ||
							attr.Hash == Rowspan && bytes.Equal(val, oneBytes) ||
							attr.Hash == Shape && bytes.Equal(val, rectBytes) ||
							attr.Hash == Span && bytes.Equal(val, oneBytes) ||
							attr.Hash == Clear && bytes.Equal(val, noneBytes) ||
							attr.Hash == Frameborder && bytes.Equal(val, oneBytes) ||
							attr.Hash == Scrolling && bytes.Equal(val, autoBytes) ||
							attr.Hash == Valuetype && bytes.Equal(val, dataBytes) ||
							attr.Hash == Media && t.Hash == Style && bytes.Equal(val, allBytes)) {
							continue
						}

						if attr.Hash == Style {
							// CSS minifier for attribute inline code
							val = parse.TrimWhitespace(val)
							attrMinifyBuffer.Reset()
							if err := m.MinifyMimetype(cssMimeBytes, attrMinifyBuffer, buffer.NewReader(val), inlineParams); err == nil {
								val = attrMinifyBuffer.Bytes()
							} else if err != minify.ErrNotExist {
								return minify.UpdateErrorPosition(err, z, attr.Offset)
							}
							// a style attribute that ends up empty is dropped entirely
							if len(val) == 0 {
								continue
							}
						} else if len(attr.Text) > 2 && attr.Text[0] == 'o' && attr.Text[1] == 'n' {
							// JS minifier for attribute inline code
							val = parse.TrimWhitespace(val)
							// strip a leading "javascript:" (11 bytes, matched case-insensitively)
							if len(val) >= 11 && parse.EqualFold(val[:11], jsSchemeBytes) {
								val = val[11:]
							}
							attrMinifyBuffer.Reset()
							if err := m.MinifyMimetype(jsMimeBytes, attrMinifyBuffer, buffer.NewReader(val), nil); err == nil {
								val = attrMinifyBuffer.Bytes()
							} else if err != minify.ErrNotExist {
								return minify.UpdateErrorPosition(err, z, attr.Offset)
							}
							// an event handler that ends up empty is dropped entirely
							if len(val) == 0 {
								continue
							}
						} else if attr.Traits&urlAttr != 0 { // anchors are already handled
							val = parse.TrimWhitespace(val)
							if 5 < len(val) {
								if parse.EqualFold(val[:4], httpBytes) {
									// strip "http:" or "https:" when it equals the scheme of m.URL,
									// leaving a scheme-relative value; otherwise only lowercase the scheme
									// NOTE(review): the result of parse.ToLower is discarded here,
									// presumably it lowercases the slice in place — confirm
									if val[4] == ':' {
										if m.URL != nil && m.URL.Scheme == "http" {
											val = val[5:]
										} else {
											parse.ToLower(val[:4])
										}
									} else if (val[4] == 's' || val[4] == 'S') && val[5] == ':' {
										if m.URL != nil && m.URL.Scheme == "https" {
											val = val[6:]
										} else {
											parse.ToLower(val[:5])
										}
									}
								} else if parse.EqualFold(val[:5], dataSchemeBytes) {
									val = minify.DataURI(m, val)
								}
							}
						}
					}

					w.Write(spaceBytes)
					w.Write(attr.Text)
					// boolean attributes and attributes with an empty value are written by name only
					if len(val) > 0 && attr.Traits&booleanAttr == 0 {
						w.Write(isBytes)

						// use double quotes for RDFa attributes
						isXML := attr.Hash == Vocab || attr.Hash == Typeof || attr.Hash == Property || attr.Hash == Resource || attr.Hash == Prefix || attr.Hash == Content || attr.Hash == About || attr.Hash == Rev || attr.Hash == Datatype || attr.Hash == Inlist

						// no quotes if possible, else prefer single or double depending on which occurs more often in value
						var quote byte

						// pass on the quote character the attribute ended with in the input, if any
						if 0 < len(attr.Data) && (attr.Data[len(attr.Data)-1] == '\'' || attr.Data[len(attr.Data)-1] == '"') {
							quote = attr.Data[len(attr.Data)-1]
						}
						val = html.EscapeAttrVal(&attrByteBuffer, val, quote, o.KeepQuotes, isXML)
						w.Write(val)
					}
				}
			} else {
				_ = tb.Shift() // StartTagClose
			}
			w.Write(gtBytes)

			// skip text that directly follows a select or optgroup start tag
			if t.Hash == Select || t.Hash == Optgroup {
				if next := tb.Peek(0); next.TokenType == html.TextToken {
					tb.Shift()
				}
			}

			// keep space after phrasing tags (<i>, <span>, ...) FontAwesome etc.
			if t.TokenType == html.StartTagToken && t.Traits&nonPhrasingTag == 0 {
				if next := tb.Peek(0); next.Hash == t.Hash && next.TokenType == html.EndTagToken {
					omitSpace = false
				}
			}
		}
	}
}