aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/microcosm-cc/bluemonday/sanitize.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/microcosm-cc/bluemonday/sanitize.go')
-rw-r--r--vendor/github.com/microcosm-cc/bluemonday/sanitize.go1116
1 files changed, 1116 insertions, 0 deletions
diff --git a/vendor/github.com/microcosm-cc/bluemonday/sanitize.go b/vendor/github.com/microcosm-cc/bluemonday/sanitize.go
new file mode 100644
index 0000000..9121aef
--- /dev/null
+++ b/vendor/github.com/microcosm-cc/bluemonday/sanitize.go
@@ -0,0 +1,1116 @@
1// Copyright (c) 2014, David Kitchen <david@buro9.com>
2//
3// All rights reserved.
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7//
8// * Redistributions of source code must retain the above copyright notice, this
9// list of conditions and the following disclaimer.
10//
11// * Redistributions in binary form must reproduce the above copyright notice,
12// this list of conditions and the following disclaimer in the documentation
13// and/or other materials provided with the distribution.
14//
15// * Neither the name of the organisation (Microcosm) nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30package bluemonday
31
32import (
33 "bytes"
34 "fmt"
35 "io"
36 "net/url"
37 "regexp"
38 "strconv"
39 "strings"
40
41 "golang.org/x/net/html"
42
43 "github.com/aymerick/douceur/parser"
44)
45
var (
	// dataAttribute matches attribute names of the HTML5 data-* form.
	dataAttribute = regexp.MustCompile("^data-.+")
	// dataAttributeXMLPrefix matches a data-* suffix beginning with "xml",
	// which the HTML5 spec disallows.
	dataAttributeXMLPrefix = regexp.MustCompile("^xml.+")
	// dataAttributeInvalidChars matches characters not permitted in a
	// data-* suffix (uppercase letters and semi-colons).
	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
	// cssUnicodeChar matches a CSS unicode escape: a backslash, one to six
	// lowercase hex digits, and an optional terminating space.
	cssUnicodeChar = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`)
	// dataURIbase64Prefix matches the prefix of a base64-encoded data URI.
	dataURIbase64Prefix = regexp.MustCompile(`^data:[^,]*;base64,`)
)
53
54// Sanitize takes a string that contains a HTML fragment or document and applies
55// the given policy allowlist.
56//
57// It returns a HTML string that has been sanitized by the policy or an empty
58// string if an error has occurred (most likely as a consequence of extremely
59// malformed input)
60func (p *Policy) Sanitize(s string) string {
61 if strings.TrimSpace(s) == "" {
62 return s
63 }
64
65 return p.sanitizeWithBuff(strings.NewReader(s)).String()
66}
67
68// SanitizeBytes takes a []byte that contains a HTML fragment or document and applies
69// the given policy allowlist.
70//
71// It returns a []byte containing the HTML that has been sanitized by the policy
72// or an empty []byte if an error has occurred (most likely as a consequence of
73// extremely malformed input)
74func (p *Policy) SanitizeBytes(b []byte) []byte {
75 if len(bytes.TrimSpace(b)) == 0 {
76 return b
77 }
78
79 return p.sanitizeWithBuff(bytes.NewReader(b)).Bytes()
80}
81
// SanitizeReader takes an io.Reader that contains a HTML fragment or document
// and applies the given policy allowlist.
//
// It returns a bytes.Buffer containing the HTML that has been sanitized by the
// policy. Errors during sanitization will merely return an empty result.
func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
	// Thin wrapper: all the work (including error-to-empty-buffer
	// conversion) happens in sanitizeWithBuff.
	return p.sanitizeWithBuff(r)
}
90
// SanitizeReaderToWriter takes an io.Reader that contains a HTML fragment or document
// and applies the given policy allowlist and writes to the provided writer returning
// an error if there is one.
func (p *Policy) SanitizeReaderToWriter(r io.Reader, w io.Writer) error {
	// Unlike the other Sanitize* entry points this surfaces the underlying
	// error to the caller instead of swallowing it.
	return p.sanitize(r, w)
}
97
// escapedURLChars lists the characters escapeUrlComponent replaces with HTML
// entities when writing a URL component into the output.
const escapedURLChars = "'<>\"\r"
99
100func escapeUrlComponent(w stringWriterWriter, val string) error {
101 i := strings.IndexAny(val, escapedURLChars)
102 for i != -1 {
103 if _, err := w.WriteString(val[:i]); err != nil {
104 return err
105 }
106 var esc string
107 switch val[i] {
108 case '\'':
109 // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
110 esc = "&#39;"
111 case '<':
112 esc = "&lt;"
113 case '>':
114 esc = "&gt;"
115 case '"':
116 // "&#34;" is shorter than "&quot;".
117 esc = "&#34;"
118 case '\r':
119 esc = "&#13;"
120 default:
121 panic("unrecognized escape character")
122 }
123 val = val[i+1:]
124 if _, err := w.WriteString(esc); err != nil {
125 return err
126 }
127 i = strings.IndexAny(val, escapedURLChars)
128 }
129 _, err := w.WriteString(val)
130 return err
131}
132
// Query represents a single part of the query string, a query param
type Query struct {
	Key      string // decoded parameter name
	Value    string // decoded parameter value ("" when HasValue is false)
	HasValue bool   // true when an "=" was present, even for an empty value
}
139
140func parseQuery(query string) (values []Query, err error) {
141 // This is essentially a copy of parseQuery from
142 // https://golang.org/src/net/url/url.go but adjusted to build our values
143 // based on our type, which we need to preserve the ordering of the query
144 // string
145 for query != "" {
146 key := query
147 if i := strings.IndexAny(key, "&;"); i >= 0 {
148 key, query = key[:i], key[i+1:]
149 } else {
150 query = ""
151 }
152 if key == "" {
153 continue
154 }
155 value := ""
156 hasValue := false
157 if i := strings.Index(key, "="); i >= 0 {
158 key, value = key[:i], key[i+1:]
159 hasValue = true
160 }
161 key, err1 := url.QueryUnescape(key)
162 if err1 != nil {
163 if err == nil {
164 err = err1
165 }
166 continue
167 }
168 value, err1 = url.QueryUnescape(value)
169 if err1 != nil {
170 if err == nil {
171 err = err1
172 }
173 continue
174 }
175 values = append(values, Query{
176 Key: key,
177 Value: value,
178 HasValue: hasValue,
179 })
180 }
181 return values, err
182}
183
184func encodeQueries(queries []Query) string {
185 var buff bytes.Buffer
186 for i, query := range queries {
187 buff.WriteString(url.QueryEscape(query.Key))
188 if query.HasValue {
189 buff.WriteString("=")
190 buff.WriteString(url.QueryEscape(query.Value))
191 }
192 if i < len(queries)-1 {
193 buff.WriteString("&")
194 }
195 }
196 return buff.String()
197}
198
199func sanitizedURL(val string) (string, error) {
200 u, err := url.Parse(val)
201 if err != nil {
202 return "", err
203 }
204
205 // we use parseQuery but not u.Query to keep the order not change because
206 // url.Values is a map which has a random order.
207 queryValues, err := parseQuery(u.RawQuery)
208 if err != nil {
209 return "", err
210 }
211 // sanitize the url query params
212 for i, query := range queryValues {
213 queryValues[i].Key = html.EscapeString(query.Key)
214 }
215 u.RawQuery = encodeQueries(queryValues)
216 // u.String() will also sanitize host/scheme/user/pass
217 return u.String(), nil
218}
219
220// Performs the actual sanitization process.
221func (p *Policy) sanitizeWithBuff(r io.Reader) *bytes.Buffer {
222 var buff bytes.Buffer
223 if err := p.sanitize(r, &buff); err != nil {
224 return &bytes.Buffer{}
225 }
226 return &buff
227}
228
229type asStringWriter struct {
230 io.Writer
231}
232
233func (a *asStringWriter) WriteString(s string) (int, error) {
234 return a.Write([]byte(s))
235}
236
// sanitize tokenizes the HTML read from r, filters every token through the
// policy allowlist, and writes the sanitized output to w. The first tokenizer
// or write error is returned; io.EOF from the tokenizer means end of input
// and yields nil.
func (p *Policy) sanitize(r io.Reader, w io.Writer) error {
	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	// Ensure the output destination supports WriteString; wrap it if not.
	buff, ok := w.(stringWriterWriter)
	if !ok {
		buff = &asStringWriter{w}
	}

	var (
		skipElementContent       bool     // true while inside an element whose content is dropped
		skippingElementsCount    int64    // nesting depth of content-skipping elements
		skipClosingTag           bool     // true while end tags on the stack below must be suppressed
		closingTagToSkipStack    []string // end tags whose start tag was dropped
		mostRecentlyStartedToken string   // normalised name of the last start tag seen
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return nil
			}

			// Raw tokenizer error
			return err
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:

			// DocType is not handled as there is no safe parsing mechanism
			// provided by golang.org/x/net/html for the content, and this can
			// be misused to insert HTML tags that are not then sanitized
			//
			// One might wish to recursively sanitize here using the same policy
			// but I will need to do some further testing before considering
			// this.

		case html.CommentToken:

			// Comments are ignored by default
			if p.allowComments {
				// But if allowed then write the comment out as-is
				// NOTE(review): write error is not checked here, unlike every
				// other write in this function — confirm this is intentional.
				buff.WriteString(token.String())
			}

		case html.StartTagToken:

			mostRecentlyStartedToken = normaliseElementName(token.Data)

			// script/style content is only emitted when explicitly enabled.
			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			// Look up the attribute policies for this element: exact name
			// first, then any regex-based element rules.
			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					// Element is not allowed; optionally start skipping its
					// content entirely (e.g. script/style bodies).
					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
						skipElementContent = true
						skippingElementsCount++
					}
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}
			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			// If every attribute was stripped and the element is not allowed
			// bare, drop the tag and remember to drop its matching end tag.
			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.EndTagToken:

			if mostRecentlyStartedToken == normaliseElementName(token.Data) {
				mostRecentlyStartedToken = ""
			}

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			// Suppress the end tag whose start tag was dropped above.
			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}
			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				match := false
				for regex := range p.elsMatchingAndAttrs {
					if regex.MatchString(token.Data) {
						skipElementContent = false
						match = true
						break
					}
				}
				// Leaving a content-skipping element: re-enable output once
				// the outermost skipped element closes.
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if !match {
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.SelfClosingTagToken:

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			// Same lookup as for start tags: exact element policy, then
			// regex-based element rules.
			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					if p.addSpaces && !matched {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}
			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.TextToken:

			if !skipElementContent {
				switch mostRecentlyStartedToken {
				case `script`:
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				case "style":
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				default:
					// HTML escape the text
					if _, err := buff.WriteString(token.String()); err != nil {
						return err
					}
				}
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return fmt.Errorf("unknown token: %v", token)
		}
	}
}
490
// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute returning a set
// of html.Attributes that match the policies.
//
// Beyond filtering, it also rewrites attributes: style values are run through
// the style policies, URLs are validated/rewritten, and (depending on policy
// flags) rel="nofollow"/"noreferrer"/"noopener", target="_blank",
// crossorigin="anonymous" and iframe sandbox values are enforced.
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string][]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	// Determine whether any style policy (element-specific, regex-matched,
	// or global) could apply to this element.
	hasStylePolicies := false
	sps, elementHasStylePolicies := p.elsAndStyles[elementName]
	if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) {
		hasStylePolicies = true
	}
	// no specific element policy found, look for a pattern match
	if !hasStylePolicies {
		for k, v := range p.elsMatchingAndStyles {
			if k.MatchString(elementName) {
				if len(v) > 0 {
					hasStylePolicies = true
					break
				}
			}
		}
	}

	// Builds a new attribute slice based on the whether the attribute has been
	// allowed explicitly or globally.
	cleanAttrs := []html.Attribute{}
attrsLoop:
	for _, htmlAttr := range attrs {
		if p.allowDataAttributes {
			// If we see a data attribute, let it through.
			if isDataAttribute(htmlAttr.Key) {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}
		// Is this a "style" attribute, and if so, do we need to sanitize it?
		if htmlAttr.Key == "style" && hasStylePolicies {
			htmlAttr = p.sanitizeStyles(htmlAttr, elementName)
			if htmlAttr.Val == "" {
				// We've sanitized away any and all styles; don't bother to
				// output the style attribute (even if it's allowed)
				continue
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}

		// Is there an element specific attribute policy that applies?
		if apl, ok := aps[htmlAttr.Key]; ok {
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
						continue attrsLoop
					}
				} else {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue attrsLoop
				}
			}
		}

		// Is there a global attribute policy that applies?
		if apl, ok := p.globalAttrs[htmlAttr.Key]; ok {
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
					}
				} else {
					cleanAttrs = append(cleanAttrs, htmlAttr)
				}
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "base", "link":
					if htmlAttr.Key == "href" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "del", "ins", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "audio", "embed", "iframe", "img", "script", "source", "track", "video":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.requireNoReferrer ||
			p.requireNoReferrerFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "base", "link":
				// First pass: detect an href and whether it points at an
				// external host (has a non-empty Host after parsing).
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						noReferrerFound  bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addNoReferrer := (p.requireNoReferrer ||
						externalLink && p.requireNoReferrerFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					// Second pass: amend existing rel/target attributes in
					// place where possible.
					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) {

							if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") {
								htmlAttr.Val += " nofollow"
							}
							if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") {
								htmlAttr.Val += " noreferrer"
							}
							noFollowFound = addNoFollow
							noReferrerFound = addNoReferrer
							tmpAttrs = append(tmpAttrs, htmlAttr)
							appended = true
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || noReferrerFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					// No existing rel attribute carried the required values:
					// append a fresh one.
					if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) {
						rel := html.Attribute{}
						rel.Key = "rel"
						if addNoFollow {
							rel.Val = "nofollow"
						}
						if addNoReferrer {
							if rel.Val != "" {
								rel.Val += " "
							}
							rel.Val += "noreferrer"
						}
						cleanAttrs = append(cleanAttrs, rel)
					}

					if elementName == "a" && addTargetBlank && !targetBlankFound {
						rel := html.Attribute{}
						rel.Key = "target"
						rel.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, rel)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allow the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present.
						// rel="noopener"
						//
						// Unfortunately this is processing the rel twice (we
						// already looked at it earlier ^^) as we cannot be sure
						// of the ordering of the href and rel, and whether we
						// have fully satisfied that we need to do this. This
						// double processing only happens *if* target="_blank"
						// is true.
						var noOpenerAdded bool
						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								if strings.Contains(htmlAttr.Val, "noopener") {
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								} else {
									htmlAttr.Val += " noopener"
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								}

								appended = true
							}
							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}

					}
				}
			default:
			}
		}
	}

	// Force crossorigin="anonymous" on resource-loading elements if required.
	if p.requireCrossOriginAnonymous && len(cleanAttrs) > 0 {
		switch elementName {
		case "audio", "img", "link", "script", "video":
			var crossOriginFound bool
			for _, htmlAttr := range cleanAttrs {
				if htmlAttr.Key == "crossorigin" {
					crossOriginFound = true
					// NOTE(review): htmlAttr is the loop copy, so this
					// assignment does not modify cleanAttrs — confirm whether
					// overwriting an existing crossorigin value is intended.
					htmlAttr.Val = "anonymous"
				}
			}

			if !crossOriginFound {
				crossOrigin := html.Attribute{}
				crossOrigin.Key = "crossorigin"
				crossOrigin.Val = "anonymous"
				cleanAttrs = append(cleanAttrs, crossOrigin)
			}
		}
	}

	// Enforce a sandbox attribute on iframes, keeping only allowlisted
	// (and de-duplicated) sandbox tokens.
	if p.requireSandboxOnIFrame != nil && elementName == "iframe" {
		var sandboxFound bool
		for i, htmlAttr := range cleanAttrs {
			if htmlAttr.Key == "sandbox" {
				sandboxFound = true
				var cleanVals []string
				cleanValsSet := make(map[string]bool)
				for _, val := range strings.Fields(htmlAttr.Val) {
					if p.requireSandboxOnIFrame[val] {
						if !cleanValsSet[val] {
							cleanVals = append(cleanVals, val)
							cleanValsSet[val] = true
						}
					}
				}
				cleanAttrs[i].Val = strings.Join(cleanVals, " ")
			}
		}

		if !sandboxFound {
			sandbox := html.Attribute{}
			sandbox.Key = "sandbox"
			sandbox.Val = ""
			cleanAttrs = append(cleanAttrs, sandbox)
		}
	}

	return cleanAttrs
}
837
// sanitizeStyles applies the style policies (element-specific, regex-matched,
// then global) to the declarations inside a style attribute. Declarations no
// policy allows are dropped; when nothing survives (or the CSS fails to
// parse) attr.Val is set to "" so the caller can omit the attribute.
func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
	sps := p.elsAndStyles[elementName]
	if len(sps) == 0 {
		sps = map[string][]stylePolicy{}
		// check for any matching elements, if we don't already have a policy found
		// if multiple matches are found they will be overwritten, it's best
		// to not have overlapping matchers
		for regex, policies := range p.elsMatchingAndStyles {
			if regex.MatchString(elementName) {
				for k, v := range policies {
					sps[k] = append(sps[k], v...)
				}
			}
		}
	}

	//Add semi-colon to end to fix parsing issue
	attr.Val = strings.TrimRight(attr.Val, " ")
	if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' {
		attr.Val = attr.Val + ";"
	}
	decs, err := parser.ParseDeclarations(attr.Val)
	if err != nil {
		// Unparseable CSS is rejected wholesale.
		attr.Val = ""
		return attr
	}
	clean := []string{}
	// Vendor prefixes are stripped from the property name before policy
	// lookup so one policy covers all vendor variants.
	prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}

decLoop:
	for _, dec := range decs {
		tempProperty := strings.ToLower(dec.Property)
		// Unicode escapes are resolved so policies match the real value.
		tempValue := removeUnicode(strings.ToLower(dec.Value))
		for _, i := range prefixes {
			tempProperty = strings.TrimPrefix(tempProperty, i)
		}
		// Element-specific (or regex-matched) style policies take
		// precedence; each policy is a handler, an enum list, or a regexp.
		if spl, ok := sps[tempProperty]; ok {
			for _, sp := range spl {
				if sp.handler != nil {
					if sp.handler(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if len(sp.enum) > 0 {
					if stringInSlice(tempValue, sp.enum) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if sp.regexp != nil {
					if sp.regexp.MatchString(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				}
			}
		}
		// Fall back to the global style policies.
		if spl, ok := p.globalStyles[tempProperty]; ok {
			for _, sp := range spl {
				if sp.handler != nil {
					if sp.handler(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if len(sp.enum) > 0 {
					if stringInSlice(tempValue, sp.enum) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if sp.regexp != nil {
					if sp.regexp.MatchString(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				}
			}
		}
	}
	if len(clean) > 0 {
		attr.Val = strings.Join(clean, "; ")
	} else {
		attr.Val = ""
	}
	return attr
}
922
923func (p *Policy) allowNoAttrs(elementName string) bool {
924 _, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
925 if !ok {
926 for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs {
927 if r.MatchString(elementName) {
928 ok = true
929 break
930 }
931 }
932 }
933 return ok
934}
935
936func (p *Policy) validURL(rawurl string) (string, bool) {
937 if p.requireParseableURLs {
938 // URLs are valid if when space is trimmed the URL is valid
939 rawurl = strings.TrimSpace(rawurl)
940
941 // URLs cannot contain whitespace, unless it is a data-uri
942 if strings.Contains(rawurl, " ") ||
943 strings.Contains(rawurl, "\t") ||
944 strings.Contains(rawurl, "\n") {
945 if !strings.HasPrefix(rawurl, `data:`) {
946 return "", false
947 }
948
949 // Remove \r and \n from base64 encoded data to pass url.Parse.
950 matched := dataURIbase64Prefix.FindString(rawurl)
951 if matched != "" {
952 rawurl = matched + strings.Replace(
953 strings.Replace(
954 rawurl[len(matched):],
955 "\r",
956 "",
957 -1,
958 ),
959 "\n",
960 "",
961 -1,
962 )
963 }
964 }
965
966 // URLs are valid if they parse
967 u, err := url.Parse(rawurl)
968 if err != nil {
969 return "", false
970 }
971
972 if u.Scheme != "" {
973 for _, r := range p.allowURLSchemeRegexps {
974 if r.MatchString(u.Scheme) {
975 return u.String(), true
976 }
977 }
978
979 urlPolicies, ok := p.allowURLSchemes[u.Scheme]
980 if !ok {
981 return "", false
982 }
983
984 if len(urlPolicies) == 0 {
985 return u.String(), true
986 }
987
988 for _, urlPolicy := range urlPolicies {
989 if urlPolicy(u) == true {
990 return u.String(), true
991 }
992 }
993
994 return "", false
995 }
996
997 if p.allowRelativeURLs {
998 if u.String() != "" {
999 return u.String(), true
1000 }
1001 }
1002
1003 return "", false
1004 }
1005
1006 return rawurl, true
1007}
1008
// linkable reports whether elementName carries a URL-bearing attribute that
// must be validated: href (a, area, base, link), cite (blockquote, del, ins,
// q) or src (audio, embed, iframe, img, input, script, track, video).
func linkable(elementName string) bool {
	switch elementName {
	case "a", "area", "base", "link", // .href
		"blockquote", "del", "ins", "q", // .cite
		"audio", "embed", "iframe", "img", "input", "script", "track", "video": // .src
		return true
	}
	return false
}
1024
// stringInSlice returns true if needle exists in haystack, comparing
// case-insensitively.
func stringInSlice(needle string, haystack []string) bool {
	// Lowercase the needle once instead of on every iteration; the original
	// recomputed strings.ToLower(needle) per element.
	needle = strings.ToLower(needle)
	for _, straw := range haystack {
		if strings.ToLower(straw) == needle {
			return true
		}
	}
	return false
}
1034
1035func isDataAttribute(val string) bool {
1036 if !dataAttribute.MatchString(val) {
1037 return false
1038 }
1039 rest := strings.Split(val, "data-")
1040 if len(rest) == 1 {
1041 return false
1042 }
1043 // data-xml* is invalid.
1044 if dataAttributeXMLPrefix.MatchString(rest[1]) {
1045 return false
1046 }
1047 // no uppercase or semi-colons allowed.
1048 if dataAttributeInvalidChars.MatchString(rest[1]) {
1049 return false
1050 }
1051 return true
1052}
1053
// removeUnicode substitutes CSS unicode escapes (e.g. `\26 `) in value with
// the characters they denote so that policy checks see the literal text. If
// any escape fails to decode, the entire value is rejected by returning "".
func removeUnicode(value string) string {
	substitutedValue := value
	currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue)
	for currentLoc != nil {

		// The escape body: the hex digits after the backslash, with any
		// terminating space trimmed off.
		character := substitutedValue[currentLoc[0]+1 : currentLoc[1]]
		character = strings.TrimSpace(character)
		if len(character) < 4 {
			// Left-pad to the four hex digits required by the \uXXXX form.
			character = strings.Repeat("0", 4-len(character)) + character
		} else {
			// Longer escapes are only representable as \uXXXX when the
			// leading digits are zeros; otherwise force a decode failure.
			for len(character) > 4 {
				if character[0] != '0' {
					character = ""
					break
				} else {
					character = character[1:]
				}
			}
		}
		// Decode the \uXXXX escape via strconv.Unquote on a synthetic
		// quoted string.
		character = "\\u" + character
		translatedChar, err := strconv.Unquote(`"` + character + `"`)
		translatedChar = strings.TrimSpace(translatedChar)
		if err != nil {
			return ""
		}
		// Splice the decoded character over the escape and rescan, since
		// the substitution shifts subsequent indices.
		substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:]
		currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue)
	}
	return substitutedValue
}
1084
1085func (p *Policy) matchRegex(elementName string) (map[string][]attrPolicy, bool) {
1086 aps := make(map[string][]attrPolicy, 0)
1087 matched := false
1088 for regex, attrs := range p.elsMatchingAndAttrs {
1089 if regex.MatchString(elementName) {
1090 matched = true
1091 for k, v := range attrs {
1092 aps[k] = append(aps[k], v...)
1093 }
1094 }
1095 }
1096 return aps, matched
1097}
1098
// normaliseElementName takes a HTML element like <script> which is user input
// and returns a lower case version of it that is immune to UTF-8 to ASCII
// conversion tricks (like the use of upper case cyrillic i scrİpt which a
// strings.ToLower would convert to script). Instead this func will preserve
// all non-ASCII as their escaped equivalent, i.e. \u0130 which reveals the
// characters when lower cased
func normaliseElementName(str string) string {
	quoted := strings.ToLower(strconv.QuoteToASCII(str))
	// strconv.QuoteToASCII wraps its result in double quotes; strip exactly
	// one from each end (Trim would over-strip escaped quotes).
	quoted = strings.TrimPrefix(quoted, `"`)
	return strings.TrimSuffix(quoted, `"`)
}