1// Copyright (c) 2014, David Kitchen <david@buro9.com>
   2//
   3// All rights reserved.
   4//
   5// Redistribution and use in source and binary forms, with or without
   6// modification, are permitted provided that the following conditions are met:
   7//
   8// * Redistributions of source code must retain the above copyright notice, this
   9//   list of conditions and the following disclaimer.
  10//
  11// * Redistributions in binary form must reproduce the above copyright notice,
  12//   this list of conditions and the following disclaimer in the documentation
  13//   and/or other materials provided with the distribution.
  14//
  15// * Neither the name of the organisation (Microcosm) nor the names of its
  16//   contributors may be used to endorse or promote products derived from
  17//   this software without specific prior written permission.
  18//
  19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  20// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  23// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  25// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  26// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  27// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30package bluemonday
  31
  32import (
  33	"bytes"
  34	"fmt"
  35	"io"
  36	"net/url"
  37	"regexp"
  38	"strconv"
  39	"strings"
  40
  41	"golang.org/x/net/html"
  42
  43	"github.com/aymerick/douceur/parser"
  44)
  45
var (
	// dataAttribute matches any attribute name that begins with "data-".
	dataAttribute             = regexp.MustCompile("^data-.+")
	// dataAttributeXMLPrefix matches a data attribute suffix beginning with
	// "xml"; such names are rejected by isDataAttribute.
	dataAttributeXMLPrefix    = regexp.MustCompile("^xml.+")
	// dataAttributeInvalidChars matches characters that are not permitted in
	// a data attribute suffix (uppercase letters and semi-colons).
	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
	// cssUnicodeChar matches a CSS unicode escape sequence: a backslash,
	// 1-6 lowercase hex digits, and an optional terminating space.
	cssUnicodeChar            = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`)
	// dataURIbase64Prefix matches the "data:<mediatype>;base64," prefix of a
	// base64-encoded data URI.
	dataURIbase64Prefix       = regexp.MustCompile(`^data:[^,]*;base64,`)
)
  53
  54// Sanitize takes a string that contains a HTML fragment or document and applies
  55// the given policy allowlist.
  56//
  57// It returns a HTML string that has been sanitized by the policy or an empty
  58// string if an error has occurred (most likely as a consequence of extremely
  59// malformed input)
  60func (p *Policy) Sanitize(s string) string {
  61	if strings.TrimSpace(s) == "" {
  62		return s
  63	}
  64
  65	return p.sanitizeWithBuff(strings.NewReader(s)).String()
  66}
  67
  68// SanitizeBytes takes a []byte that contains a HTML fragment or document and applies
  69// the given policy allowlist.
  70//
  71// It returns a []byte containing the HTML that has been sanitized by the policy
  72// or an empty []byte if an error has occurred (most likely as a consequence of
  73// extremely malformed input)
  74func (p *Policy) SanitizeBytes(b []byte) []byte {
  75	if len(bytes.TrimSpace(b)) == 0 {
  76		return b
  77	}
  78
  79	return p.sanitizeWithBuff(bytes.NewReader(b)).Bytes()
  80}
  81
// SanitizeReader takes an io.Reader that contains a HTML fragment or document
// and applies the given policy allowlist.
//
// It returns a bytes.Buffer containing the HTML that has been sanitized by the
// policy. Errors during sanitization will merely return an empty result.
func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
	// Delegates to sanitizeWithBuff, which swallows errors and returns an
	// empty buffer in their place.
	return p.sanitizeWithBuff(r)
}
  90
// SanitizeReaderToWriter takes an io.Reader that contains a HTML fragment or document
// and applies the given policy allowlist and writes to the provided writer returning
// an error if there is one.
func (p *Policy) SanitizeReaderToWriter(r io.Reader, w io.Writer) error {
	// Unlike the other Sanitize* entry points this surfaces the underlying
	// tokenizer/write error to the caller instead of hiding it.
	return p.sanitize(r, w)
}
  97
// escapedURLChars are the characters that escapeUrlComponent replaces with
// HTML entities so a URL can be safely embedded in an attribute value.
const escapedURLChars = "'<>\"\r"
  99
 100func escapeUrlComponent(w stringWriterWriter, val string) error {
 101	i := strings.IndexAny(val, escapedURLChars)
 102	for i != -1 {
 103		if _, err := w.WriteString(val[:i]); err != nil {
 104			return err
 105		}
 106		var esc string
 107		switch val[i] {
 108		case '\'':
 109			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
 110			esc = "&#39;"
 111		case '<':
 112			esc = "&lt;"
 113		case '>':
 114			esc = "&gt;"
 115		case '"':
 116			// "&#34;" is shorter than "&quot;".
 117			esc = "&#34;"
 118		case '\r':
 119			esc = "&#13;"
 120		default:
 121			panic("unrecognized escape character")
 122		}
 123		val = val[i+1:]
 124		if _, err := w.WriteString(esc); err != nil {
 125			return err
 126		}
 127		i = strings.IndexAny(val, escapedURLChars)
 128	}
 129	_, err := w.WriteString(val)
 130	return err
 131}
 132
 133// Query represents a single part of the query string, a query param
 134type Query struct {
 135	Key      string
 136	Value    string
 137	HasValue bool
 138}
 139
 140func parseQuery(query string) (values []Query, err error) {
 141	// This is essentially a copy of parseQuery from
 142	// https://golang.org/src/net/url/url.go but adjusted to build our values
 143	// based on our type, which we need to preserve the ordering of the query
 144	// string
 145	for query != "" {
 146		key := query
 147		if i := strings.IndexAny(key, "&;"); i >= 0 {
 148			key, query = key[:i], key[i+1:]
 149		} else {
 150			query = ""
 151		}
 152		if key == "" {
 153			continue
 154		}
 155		value := ""
 156		hasValue := false
 157		if i := strings.Index(key, "="); i >= 0 {
 158			key, value = key[:i], key[i+1:]
 159			hasValue = true
 160		}
 161		key, err1 := url.QueryUnescape(key)
 162		if err1 != nil {
 163			if err == nil {
 164				err = err1
 165			}
 166			continue
 167		}
 168		value, err1 = url.QueryUnescape(value)
 169		if err1 != nil {
 170			if err == nil {
 171				err = err1
 172			}
 173			continue
 174		}
 175		values = append(values, Query{
 176			Key:      key,
 177			Value:    value,
 178			HasValue: hasValue,
 179		})
 180	}
 181	return values, err
 182}
 183
 184func encodeQueries(queries []Query) string {
 185	var buff bytes.Buffer
 186	for i, query := range queries {
 187		buff.WriteString(url.QueryEscape(query.Key))
 188		if query.HasValue {
 189			buff.WriteString("=")
 190			buff.WriteString(url.QueryEscape(query.Value))
 191		}
 192		if i < len(queries)-1 {
 193			buff.WriteString("&")
 194		}
 195	}
 196	return buff.String()
 197}
 198
 199func sanitizedURL(val string) (string, error) {
 200	u, err := url.Parse(val)
 201	if err != nil {
 202		return "", err
 203	}
 204
 205	// we use parseQuery but not u.Query to keep the order not change because
 206	// url.Values is a map which has a random order.
 207	queryValues, err := parseQuery(u.RawQuery)
 208	if err != nil {
 209		return "", err
 210	}
 211	// sanitize the url query params
 212	for i, query := range queryValues {
 213		queryValues[i].Key = html.EscapeString(query.Key)
 214	}
 215	u.RawQuery = encodeQueries(queryValues)
 216	// u.String() will also sanitize host/scheme/user/pass
 217	return u.String(), nil
 218}
 219
 220// Performs the actual sanitization process.
 221func (p *Policy) sanitizeWithBuff(r io.Reader) *bytes.Buffer {
 222	var buff bytes.Buffer
 223	if err := p.sanitize(r, &buff); err != nil {
 224		return &bytes.Buffer{}
 225	}
 226	return &buff
 227}
 228
 229type asStringWriter struct {
 230	io.Writer
 231}
 232
 233func (a *asStringWriter) WriteString(s string) (int, error) {
 234	return a.Write([]byte(s))
 235}
 236
// sanitize tokenizes the HTML read from r and writes the policy-filtered
// result to w. It is the core loop behind every public Sanitize* method.
// The first tokenizer or write error is returned; io.EOF means success.
func (p *Policy) sanitize(r io.Reader, w io.Writer) error {
	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	// Use the writer's own WriteString when it has one; otherwise wrap it so
	// every write below can go through a single interface.
	buff, ok := w.(stringWriterWriter)
	if !ok {
		buff = &asStringWriter{w}
	}

	var (
		// skipElementContent is true while inside elements whose entire
		// content must be dropped (p.setOfElementsToSkipContent).
		skipElementContent       bool
		// skippingElementsCount tracks nesting depth of skipped elements so
		// skipping only ends when the outermost one closes.
		skippingElementsCount    int64
		// skipClosingTag is true when a start tag was suppressed (allowed
		// element, no attributes left, allowNoAttrs false) so that its end
		// tag must be suppressed too.
		skipClosingTag           bool
		// closingTagToSkipStack holds the names of suppressed start tags
		// whose matching end tags are still pending.
		closingTagToSkipStack    []string
		// mostRecentlyStartedToken remembers the last opened element so the
		// TextToken case can special-case script/style content.
		mostRecentlyStartedToken string
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return nil
			}

			// Raw tokenizer error
			return err
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:

			// DocType is not handled as there is no safe parsing mechanism
			// provided by golang.org/x/net/html for the content, and this can
			// be misused to insert HTML tags that are not then sanitized
			//
			// One might wish to recursively sanitize here using the same policy
			// but I will need to do some further testing before considering
			// this.

		case html.CommentToken:

			// Comments are ignored by default
			if p.allowComments {
				// But if allowed then write the comment out as-is
				buff.WriteString(token.String())
			}

		case html.StartTagToken:

			mostRecentlyStartedToken = normaliseElementName(token.Data)

			// script and style are never emitted unless the policy has
			// explicitly opted in to unsafe output.
			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				// No exact-name policy; fall back to the regex-based
				// element policies.
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					// Element is not allowed at all. If it is registered as
					// a skip-content element, start dropping its content.
					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
						skipElementContent = true
						skippingElementsCount++
					}
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}
			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					// All attributes were stripped and the element may not
					// appear bare: drop the tag and remember to drop its
					// closing tag as well.
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.EndTagToken:

			if mostRecentlyStartedToken == normaliseElementName(token.Data) {
				mostRecentlyStartedToken = ""
			}

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			// If this end tag matches a previously suppressed start tag,
			// suppress it too and pop the stack.
			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}
			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				match := false
				for regex := range p.elsMatchingAndAttrs {
					if regex.MatchString(token.Data) {
						// NOTE(review): this clears skipElementContent for any
						// regex-allowed end tag, regardless of nesting depth —
						// verify this is the intended interaction with
						// skippingElementsCount.
						skipElementContent = false
						match = true
						break
					}
				}
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if !match {
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.SelfClosingTagToken:

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					if p.addSpaces && !matched {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			// Unlike StartTagToken there is no closing tag to track here, so
			// a disallowed bare element is simply dropped.
			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}
			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.TextToken:

			if !skipElementContent {
				switch mostRecentlyStartedToken {
				case `script`:
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				case "style":
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				default:
					// HTML escape the text
					if _, err := buff.WriteString(token.String()); err != nil {
						return err
					}
				}
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return fmt.Errorf("unknown token: %v", token)
		}
	}
}
 490
// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute returning a set
// of html.Attributes that match the policies
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string][]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	// Work out whether any style policies apply to this element (exact name,
	// global, or regex-matched) so "style" attributes get sanitized below.
	hasStylePolicies := false
	sps, elementHasStylePolicies := p.elsAndStyles[elementName]
	if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) {
		hasStylePolicies = true
	}
	// no specific element policy found, look for a pattern match
	if !hasStylePolicies {
		for k, v := range p.elsMatchingAndStyles {
			if k.MatchString(elementName) {
				if len(v) > 0 {
					hasStylePolicies = true
					break
				}
			}
		}
	}

	// Builds a new attribute slice based on the whether the attribute has been
	// allowed explicitly or globally.
	cleanAttrs := []html.Attribute{}
attrsLoop:
	for _, htmlAttr := range attrs {
		if p.allowDataAttributes {
			// If we see a data attribute, let it through.
			if isDataAttribute(htmlAttr.Key) {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}
		// Is this a "style" attribute, and if so, do we need to sanitize it?
		if htmlAttr.Key == "style" && hasStylePolicies {
			htmlAttr = p.sanitizeStyles(htmlAttr, elementName)
			if htmlAttr.Val == "" {
				// We've sanitized away any and all styles; don't bother to
				// output the style attribute (even if it's allowed)
				continue
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}

		// Is there an element specific attribute policy that applies?
		if apl, ok := aps[htmlAttr.Key]; ok {
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
						continue attrsLoop
					}
				} else {
					// A policy without a regexp allows any value.
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue attrsLoop
				}
			}
		}

		// Is there a global attribute policy that applies?
		if apl, ok := p.globalAttrs[htmlAttr.Key]; ok {
			// NOTE(review): unlike the element-specific loop above, this does
			// not `continue attrsLoop` after appending, so an attribute that
			// matches several global policies is appended once per match —
			// verify duplicates are acceptable here.
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
					}
				} else {
					cleanAttrs = append(cleanAttrs, htmlAttr)
				}
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "base", "link":
					if htmlAttr.Key == "href" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						// An invalid URL drops the attribute entirely.
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "del", "ins", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "audio", "embed", "iframe", "img", "script", "source", "track", "video":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.requireNoReferrer ||
			p.requireNoReferrerFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "base", "link":
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						// A non-empty host means the link is fully qualified
						// (points outside the current site).
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						noReferrerFound  bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addNoReferrer := (p.requireNoReferrer ||
						externalLink && p.requireNoReferrerFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					// First pass: amend existing rel/target attributes in place.
					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) {

							if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") {
								htmlAttr.Val += " nofollow"
							}
							if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") {
								htmlAttr.Val += " noreferrer"
							}
							noFollowFound = addNoFollow
							noReferrerFound = addNoReferrer
							tmpAttrs = append(tmpAttrs, htmlAttr)
							appended = true
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || noReferrerFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					// Second pass: add a rel attribute if none existed to amend.
					if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) {
						rel := html.Attribute{}
						rel.Key = "rel"
						if addNoFollow {
							rel.Val = "nofollow"
						}
						if addNoReferrer {
							if rel.Val != "" {
								rel.Val += " "
							}
							rel.Val += "noreferrer"
						}
						cleanAttrs = append(cleanAttrs, rel)
					}

					if elementName == "a" && addTargetBlank && !targetBlankFound {
						rel := html.Attribute{}
						rel.Key = "target"
						rel.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, rel)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allow the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present.
						// rel="noopener"
						//
						// Unfortunately this is processing the rel twice (we
						// already looked at it earlier ^^) as we cannot be sure
						// of the ordering of the href and rel, and whether we
						// have fully satisfied that we need to do this. This
						// double processing only happens *if* target="_blank"
						// is true.
						var noOpenerAdded bool
						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								if strings.Contains(htmlAttr.Val, "noopener") {
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								} else {
									htmlAttr.Val += " noopener"
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								}

								appended = true
							}
							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}

					}
				}
			default:
			}
		}
	}

	if p.requireCrossOriginAnonymous && len(cleanAttrs) > 0 {
		switch elementName {
		case "audio", "img", "link", "script", "video":
			var crossOriginFound bool
			for _, htmlAttr := range cleanAttrs {
				// NOTE(review): htmlAttr is a loop copy, so this assignment
				// does not modify cleanAttrs — an existing crossorigin value
				// is left as-is; verify whether forcing "anonymous" on the
				// slice element was intended.
				if htmlAttr.Key == "crossorigin" {
					crossOriginFound = true
					htmlAttr.Val = "anonymous"
				}
			}

			if !crossOriginFound {
				crossOrigin := html.Attribute{}
				crossOrigin.Key = "crossorigin"
				crossOrigin.Val = "anonymous"
				cleanAttrs = append(cleanAttrs, crossOrigin)
			}
		}
	}

	if p.requireSandboxOnIFrame != nil && elementName == "iframe" {
		var sandboxFound bool
		for i, htmlAttr := range cleanAttrs {
			if htmlAttr.Key == "sandbox" {
				sandboxFound = true
				// Keep only allowlisted sandbox tokens, de-duplicated and in
				// their original order.
				var cleanVals []string
				cleanValsSet := make(map[string]bool)
				for _, val := range strings.Fields(htmlAttr.Val) {
					if p.requireSandboxOnIFrame[val] {
						if !cleanValsSet[val] {
							cleanVals = append(cleanVals, val)
							cleanValsSet[val] = true
						}
					}
				}
				cleanAttrs[i].Val = strings.Join(cleanVals, " ")
			}
		}

		if !sandboxFound {
			// An empty sandbox attribute applies all restrictions.
			sandbox := html.Attribute{}
			sandbox.Key = "sandbox"
			sandbox.Val = ""
			cleanAttrs = append(cleanAttrs, sandbox)
		}
	}

	return cleanAttrs
}
 837
 838func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
 839	sps := p.elsAndStyles[elementName]
 840	if len(sps) == 0 {
 841		sps = map[string][]stylePolicy{}
 842		// check for any matching elements, if we don't already have a policy found
 843		// if multiple matches are found they will be overwritten, it's best
 844		// to not have overlapping matchers
 845		for regex, policies := range p.elsMatchingAndStyles {
 846			if regex.MatchString(elementName) {
 847				for k, v := range policies {
 848					sps[k] = append(sps[k], v...)
 849				}
 850			}
 851		}
 852	}
 853
 854	//Add semi-colon to end to fix parsing issue
 855	attr.Val = strings.TrimRight(attr.Val, " ")
 856	if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' {
 857		attr.Val = attr.Val + ";"
 858	}
 859	decs, err := parser.ParseDeclarations(attr.Val)
 860	if err != nil {
 861		attr.Val = ""
 862		return attr
 863	}
 864	clean := []string{}
 865	prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}
 866
 867decLoop:
 868	for _, dec := range decs {
 869		tempProperty := strings.ToLower(dec.Property)
 870		tempValue := removeUnicode(strings.ToLower(dec.Value))
 871		for _, i := range prefixes {
 872			tempProperty = strings.TrimPrefix(tempProperty, i)
 873		}
 874		if spl, ok := sps[tempProperty]; ok {
 875			for _, sp := range spl {
 876				if sp.handler != nil {
 877					if sp.handler(tempValue) {
 878						clean = append(clean, dec.Property+": "+dec.Value)
 879						continue decLoop
 880					}
 881				} else if len(sp.enum) > 0 {
 882					if stringInSlice(tempValue, sp.enum) {
 883						clean = append(clean, dec.Property+": "+dec.Value)
 884						continue decLoop
 885					}
 886				} else if sp.regexp != nil {
 887					if sp.regexp.MatchString(tempValue) {
 888						clean = append(clean, dec.Property+": "+dec.Value)
 889						continue decLoop
 890					}
 891				}
 892			}
 893		}
 894		if spl, ok := p.globalStyles[tempProperty]; ok {
 895			for _, sp := range spl {
 896				if sp.handler != nil {
 897					if sp.handler(tempValue) {
 898						clean = append(clean, dec.Property+": "+dec.Value)
 899						continue decLoop
 900					}
 901				} else if len(sp.enum) > 0 {
 902					if stringInSlice(tempValue, sp.enum) {
 903						clean = append(clean, dec.Property+": "+dec.Value)
 904						continue decLoop
 905					}
 906				} else if sp.regexp != nil {
 907					if sp.regexp.MatchString(tempValue) {
 908						clean = append(clean, dec.Property+": "+dec.Value)
 909						continue decLoop
 910					}
 911				}
 912			}
 913		}
 914	}
 915	if len(clean) > 0 {
 916		attr.Val = strings.Join(clean, "; ")
 917	} else {
 918		attr.Val = ""
 919	}
 920	return attr
 921}
 922
 923func (p *Policy) allowNoAttrs(elementName string) bool {
 924	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
 925	if !ok {
 926		for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs {
 927			if r.MatchString(elementName) {
 928				ok = true
 929				break
 930			}
 931		}
 932	}
 933	return ok
 934}
 935
 936func (p *Policy) validURL(rawurl string) (string, bool) {
 937	if p.requireParseableURLs {
 938		// URLs are valid if when space is trimmed the URL is valid
 939		rawurl = strings.TrimSpace(rawurl)
 940
 941		// URLs cannot contain whitespace, unless it is a data-uri
 942		if strings.Contains(rawurl, " ") ||
 943			strings.Contains(rawurl, "\t") ||
 944			strings.Contains(rawurl, "\n") {
 945			if !strings.HasPrefix(rawurl, `data:`) {
 946				return "", false
 947			}
 948
 949			// Remove \r and \n from base64 encoded data to pass url.Parse.
 950			matched := dataURIbase64Prefix.FindString(rawurl)
 951			if matched != "" {
 952				rawurl = matched + strings.Replace(
 953					strings.Replace(
 954						rawurl[len(matched):],
 955						"\r",
 956						"",
 957						-1,
 958					),
 959					"\n",
 960					"",
 961					-1,
 962				)
 963			}
 964		}
 965
 966		// URLs are valid if they parse
 967		u, err := url.Parse(rawurl)
 968		if err != nil {
 969			return "", false
 970		}
 971
 972		if u.Scheme != "" {
 973			for _, r := range p.allowURLSchemeRegexps {
 974				if r.MatchString(u.Scheme) {
 975					return u.String(), true
 976				}
 977			}
 978
 979			urlPolicies, ok := p.allowURLSchemes[u.Scheme]
 980			if !ok {
 981				return "", false
 982			}
 983
 984			if len(urlPolicies) == 0 {
 985				return u.String(), true
 986			}
 987
 988			for _, urlPolicy := range urlPolicies {
 989				if urlPolicy(u) == true {
 990					return u.String(), true
 991				}
 992			}
 993
 994			return "", false
 995		}
 996
 997		if p.allowRelativeURLs {
 998			if u.String() != "" {
 999				return u.String(), true
1000			}
1001		}
1002
1003		return "", false
1004	}
1005
1006	return rawurl, true
1007}
1008
// linkable reports whether elementName carries a URL-valued attribute that
// needs URL validation: href (a, area, base, link), cite (blockquote, del,
// ins, q) or src (audio, embed, iframe, img, input, script, track, video).
func linkable(elementName string) bool {
	switch elementName {
	case
		// elements that allow .href
		"a", "area", "base", "link",
		// elements that allow .cite
		"blockquote", "del", "ins", "q",
		// elements that allow .src
		"audio", "embed", "iframe", "img", "input", "script", "track", "video":
		return true
	}
	return false
}
1024
// stringInSlice returns true if needle exists in haystack, comparing
// case-insensitively.
func stringInSlice(needle string, haystack []string) bool {
	// Lowercase the needle once rather than on every iteration.
	needle = strings.ToLower(needle)
	for _, straw := range haystack {
		if strings.ToLower(straw) == needle {
			return true
		}
	}
	return false
}
1034
1035func isDataAttribute(val string) bool {
1036	if !dataAttribute.MatchString(val) {
1037		return false
1038	}
1039	rest := strings.Split(val, "data-")
1040	if len(rest) == 1 {
1041		return false
1042	}
1043	// data-xml* is invalid.
1044	if dataAttributeXMLPrefix.MatchString(rest[1]) {
1045		return false
1046	}
1047	// no uppercase or semi-colons allowed.
1048	if dataAttributeInvalidChars.MatchString(rest[1]) {
1049		return false
1050	}
1051	return true
1052}
1053
// removeUnicode substitutes each CSS unicode escape sequence in value (e.g.
// `\26 `) with the character it denotes, so style policies match on literal
// characters. Any escape that cannot be decoded causes the whole value to be
// replaced with "" (fail closed).
func removeUnicode(value string) string {
	substitutedValue := value
	currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue)
	for currentLoc != nil {

		// Drop the leading backslash from the match, then any trailing space.
		character := substitutedValue[currentLoc[0]+1 : currentLoc[1]]
		character = strings.TrimSpace(character)
		// Normalise the hex digits to exactly 4 so they fit a \uXXXX escape.
		if len(character) < 4 {
			character = strings.Repeat("0", 4-len(character)) + character
		} else {
			// More than 4 digits only survives if the extras are leading
			// zeros; otherwise the escape is blanked, which makes the
			// strconv.Unquote below fail and the whole value is rejected.
			for len(character) > 4 {
				if character[0] != '0' {
					character = ""
					break
				} else {
					character = character[1:]
				}
			}
		}
		// Decode via strconv.Unquote on a quoted \uXXXX escape.
		character = "\\u" + character
		translatedChar, err := strconv.Unquote(`"` + character + `"`)
		translatedChar = strings.TrimSpace(translatedChar)
		if err != nil {
			return ""
		}
		// Splice the decoded character over the escape and rescan from the top.
		substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:]
		currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue)
	}
	return substitutedValue
}
1084
1085func (p *Policy) matchRegex(elementName string) (map[string][]attrPolicy, bool) {
1086	aps := make(map[string][]attrPolicy, 0)
1087	matched := false
1088	for regex, attrs := range p.elsMatchingAndAttrs {
1089		if regex.MatchString(elementName) {
1090			matched = true
1091			for k, v := range attrs {
1092				aps[k] = append(aps[k], v...)
1093			}
1094		}
1095	}
1096	return aps, matched
1097}
1098
// normaliseElementName takes a HTML element like <script> which is user input
// and returns a lower case version of it that is immune to UTF-8 to ASCII
// conversion tricks (like the use of upper case cyrillic i scrİpt which a
// strings.ToLower would convert to script). Instead this func will preserve
// all non-ASCII as their escaped equivalent, i.e. \u0130 which reveals the
// characters when lower cased
func normaliseElementName(str string) string {
	// strconv.QuoteToASCII wraps its result in double quotes; slice them off
	// (the first and last byte are always `"`).
	quoted := strconv.QuoteToASCII(str)
	return strings.ToLower(quoted[1 : len(quoted)-1])
}