upgrade to most recent bluemonday (#11007)

* upgrade to most recent bluemonday
* make vendor
* update tests for bluemonday
* update tests for bluemonday
* update tests for bluemonday
2025-07-04 04:55:44 -04:00 · 2020-04-07 16:08:47 -04:00 · 2020-04-07 16:08:47 -04:00 · d00ebf445b
commit d00ebf445b
parent 4c54477bb5
50 changed files with 4977 additions and 300 deletions
--- a/vendor/github.com/microcosm-cc/bluemonday/sanitize.go
+++ b/vendor/github.com/microcosm-cc/bluemonday/sanitize.go
@ -33,9 +33,20 @@ import (
 	"bytes"
 	"io"
 	"net/url"
+	"regexp"
+	"strconv"
 	"strings"

 	"golang.org/x/net/html"
+
+	cssparser "github.com/chris-ramon/douceur/parser"
+)
+
+var (
+	// dataAttribute matches an attribute name that starts with "data-" and
+	// has at least one further character.
+	dataAttribute             = regexp.MustCompile("^data-.+")
+	// dataAttributeXMLPrefix matches text that starts with "xml" followed by
+	// at least one further character; isDataAttribute applies it to the part
+	// of the name after "data-".
+	// NOTE(review): because of the trailing ".+", the exact name "data-xml"
+	// is not rejected by this pattern — confirm that is intended.
+	dataAttributeXMLPrefix    = regexp.MustCompile("^xml.+")
+	// dataAttributeInvalidChars matches any run of ASCII uppercase letters or
+	// semi-colons.
+	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
+	// cssUnicodeChar matches a CSS escape: a backslash, one to six lowercase
+	// hex digits, and an optional single trailing space. It is used by
+	// removeUnicode.
+	cssUnicodeChar            = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`)
 )

 // Sanitize takes a string that contains a HTML fragment or document and applies
@ -75,6 +86,98 @@ func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
 	return p.sanitize(r)
 }

+// escapedURLChars lists every byte that escapeUrlComponent rewrites as an
+// HTML character reference.
+const escapedURLChars = "'<>\"\r"
+
+// escapeUrlComponent returns val with each single quote, angle bracket,
+// double quote and carriage return replaced by its HTML character reference.
+// Every other byte is copied through unchanged.
+func escapeUrlComponent(val string) string {
+	var out bytes.Buffer
+	for {
+		pos := strings.IndexAny(val, escapedURLChars)
+		if pos == -1 {
+			break
+		}
+		// Copy the clean run that precedes the special byte.
+		out.WriteString(val[:pos])
+		switch val[pos] {
+		case '\'':
+			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
+			out.WriteString("&#39;")
+		case '<':
+			out.WriteString("&lt;")
+		case '>':
+			out.WriteString("&gt;")
+		case '"':
+			// "&#34;" is shorter than "&quot;".
+			out.WriteString("&#34;")
+		case '\r':
+			out.WriteString("&#13;")
+		default:
+			// IndexAny only reports bytes from escapedURLChars, so reaching
+			// here means the constant and this switch have drifted apart.
+			panic("unrecognized escape character")
+		}
+		val = val[pos+1:]
+	}
+	// Whatever is left contains no special bytes.
+	out.WriteString(val)
+	return out.String()
+}
+
+// sanitizedUrl parses val as a URL, rebuilds its query string from sanitized
+// keys and values, and returns the re-serialized URL. Keys are passed through
+// html.EscapeString and values through escapeUrlComponent before the query is
+// re-encoded. It returns an error if val cannot be parsed by url.Parse.
+func sanitizedUrl(val string) (string, error) {
+	u, err := url.Parse(val)
+	if err != nil {
+		return "", err
+	}
+	// sanitize the url query params
+	queryValues := u.Query()
+	sanitizedQueryValues := make(url.Values, len(queryValues))
+	for k, vals := range queryValues {
+		sk := html.EscapeString(k)
+		for _, v := range vals {
+			// Add rather than Set: Set replaces any existing values for the
+			// key, which would silently drop all but the last value of a
+			// repeated parameter such as "?a=1&a=2".
+			sanitizedQueryValues.Add(sk, escapeUrlComponent(v))
+		}
+	}
+	u.RawQuery = sanitizedQueryValues.Encode()
+	// u.String() will also sanitize host/scheme/user/pass
+	return u.String(), nil
+}
+
+// writeLinkableBuf serializes token as a start tag (with a trailing "/" when
+// it is a self-closing tag) and appends the result to buff.
+//
+// Values of "href" and "src" attributes that pass p.validURL are written via
+// sanitizedUrl, so that a URL carrying multiple query parameters is not
+// HTML-escaped as a whole. Every other attribute value, and any URL that fails
+// validation or sanitization, is written with html.EscapeString.
+func (p *Policy) writeLinkableBuf(buff *bytes.Buffer, token *html.Token) {
+	// do not escape multiple query parameters
+	tokenBuff := bytes.NewBufferString("")
+	tokenBuff.WriteString("<")
+	tokenBuff.WriteString(token.Data)
+	for _, attr := range token.Attr {
+		tokenBuff.WriteByte(' ')
+		tokenBuff.WriteString(attr.Key)
+		tokenBuff.WriteString(`="`)
+		switch attr.Key {
+		case "href", "src":
+			written := false
+			if u, ok := p.validURL(attr.Val); ok {
+				if su, err := sanitizedUrl(u); err == nil {
+					tokenBuff.WriteString(su)
+					written = true
+				}
+			}
+			if !written {
+				// Invalid or unparseable URL: fall back to the escaped raw
+				// value. This must not skip the closing quote written below,
+				// otherwise the attribute is left unterminated.
+				tokenBuff.WriteString(html.EscapeString(attr.Val))
+			}
+		default:
+			// re-apply
+			tokenBuff.WriteString(html.EscapeString(attr.Val))
+		}
+		tokenBuff.WriteByte('"')
+	}
+	if token.Type == html.SelfClosingTagToken {
+		tokenBuff.WriteString("/")
+	}
+	tokenBuff.WriteString(">")
+	buff.WriteString(tokenBuff.String())
+}
+
 // Performs the actual sanitization process.
 func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

@ -112,9 +215,13 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 		switch token.Type {
 		case html.DoctypeToken:

-			if p.allowDocType {
-				buff.WriteString(token.String())
-			}
+			// DocType is not handled as there is no safe parsing mechanism
+			// provided by golang.org/x/net/html for the content, and this can
+			// be misused to insert HTML tags that are not then sanitized
+			//
+			// One might wish to recursively sanitize here using the same policy
+			// but I will need to do some further testing before considering
+			// this.

 		case html.CommentToken:

@ -122,20 +229,23 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

 		case html.StartTagToken:

-			mostRecentlyStartedToken = token.Data
+			mostRecentlyStartedToken = strings.ToLower(token.Data)

 			aps, ok := p.elsAndAttrs[token.Data]
 			if !ok {
-				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
-					skipElementContent = true
-					skippingElementsCount++
+				aa, matched := p.matchRegex(token.Data)
+				if !matched {
+					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
+						skipElementContent = true
+						skippingElementsCount++
+					}
+					if p.addSpaces {
+						buff.WriteString(" ")
+					}
+					break
 				}
-				if p.addSpaces {
-					buff.WriteString(" ")
-				}
-				break
+				aps = aa
 			}
-
 			if len(token.Attr) != 0 {
 				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
 			}
@ -152,11 +262,20 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 			}

 			if !skipElementContent {
-				buff.WriteString(token.String())
+				// do not escape multiple query parameters
+				if linkable(token.Data) {
+					p.writeLinkableBuf(&buff, &token)
+				} else {
+					buff.WriteString(token.String())
+				}
 			}

 		case html.EndTagToken:

+			if mostRecentlyStartedToken == strings.ToLower(token.Data) {
+				mostRecentlyStartedToken = ""
+			}
+
 			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
 				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
 				if len(closingTagToSkipStack) == 0 {
@ -167,18 +286,27 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 				}
 				break
 			}
-
 			if _, ok := p.elsAndAttrs[token.Data]; !ok {
-				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
+				match := false
+				for regex := range p.elsMatchingAndAttrs {
+					if regex.MatchString(token.Data) {
+						skipElementContent = false
+						match = true
+						break
+					}
+				}
+				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match {
 					skippingElementsCount--
 					if skippingElementsCount == 0 {
 						skipElementContent = false
 					}
 				}
-				if p.addSpaces {
-					buff.WriteString(" ")
+				if !match {
+					if p.addSpaces {
+						buff.WriteString(" ")
+					}
+					break
 				}
-				break
 			}

 			if !skipElementContent {
@ -189,10 +317,14 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

 			aps, ok := p.elsAndAttrs[token.Data]
 			if !ok {
-				if p.addSpaces {
-					buff.WriteString(" ")
+				aa, matched := p.matchRegex(token.Data)
+				if !matched {
+					if p.addSpaces && !matched {
+						buff.WriteString(" ")
+					}
+					break
 				}
-				break
+				aps = aa
 			}

 			if len(token.Attr) != 0 {
@ -202,19 +334,23 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
 				if p.addSpaces {
 					buff.WriteString(" ")
+					break
 				}
-				break
 			}
-
 			if !skipElementContent {
-				buff.WriteString(token.String())
+				// do not escape multiple query parameters
+				if linkable(token.Data) {
+					p.writeLinkableBuf(&buff, &token)
+				} else {
+					buff.WriteString(token.String())
+				}
 			}

 		case html.TextToken:

 			if !skipElementContent {
-				switch strings.ToLower(mostRecentlyStartedToken) {
-				case "javascript":
+				switch mostRecentlyStartedToken {
+				case "script":
 					// not encouraged, but if a policy allows JavaScript we
 					// should not HTML escape it as that would break the output
 					buff.WriteString(token.Data)
@ -248,10 +384,47 @@ func (p *Policy) sanitizeAttrs(
 		return attrs
 	}

+	hasStylePolicies := false
+	sps, elementHasStylePolicies := p.elsAndStyles[elementName]
+	if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) {
+		hasStylePolicies = true
+	}
+	// no specific element policy found, look for a pattern match
+	if !hasStylePolicies{
+		for k, v := range p.elsMatchingAndStyles{
+			if k.MatchString(elementName) {
+				if len(v) > 0{
+					hasStylePolicies = true
+					break
+				}
+			}
+		}
+	}
+
 	// Builds a new attribute slice based on the whether the attribute has been
 	// whitelisted explicitly or globally.
 	cleanAttrs := []html.Attribute{}
 	for _, htmlAttr := range attrs {
+		if p.allowDataAttributes {
+			// If we see a data attribute, let it through.
+			if isDataAttribute(htmlAttr.Key) {
+				cleanAttrs = append(cleanAttrs, htmlAttr)
+				continue
+			}
+		}
+		// Is this a "style" attribute, and if so, do we need to sanitize it?
+		if htmlAttr.Key == "style" && hasStylePolicies {
+			htmlAttr = p.sanitizeStyles(htmlAttr, elementName)
+			if htmlAttr.Val == "" {
+				// We've sanitized away any and all styles; don't bother to
+				// output the style attribute (even if it's allowed)
+				continue
+			} else {
+				cleanAttrs = append(cleanAttrs, htmlAttr)
+				continue
+			}
+		}
+
 		// Is there an element specific attribute policy that applies?
 		if ap, ok := aps[htmlAttr.Key]; ok {
 			if ap.regexp != nil {
@ -267,6 +440,7 @@ func (p *Policy) sanitizeAttrs(

 		// Is there a global attribute policy that applies?
 		if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
+
 			if ap.regexp != nil {
 				if ap.regexp.MatchString(htmlAttr.Val) {
 					cleanAttrs = append(cleanAttrs, htmlAttr)
@ -332,6 +506,8 @@ func (p *Policy) sanitizeAttrs(

 		if (p.requireNoFollow ||
 			p.requireNoFollowFullyQualifiedLinks ||
+			p.requireNoReferrer ||
+			p.requireNoReferrerFullyQualifiedLinks ||
 			p.addTargetBlankToFullyQualifiedLinks) &&
 			len(cleanAttrs) > 0 {

@ -359,12 +535,16 @@ func (p *Policy) sanitizeAttrs(
 				if hrefFound {
 					var (
 						noFollowFound    bool
+						noReferrerFound  bool
 						targetBlankFound bool
 					)

 					addNoFollow := (p.requireNoFollow ||
 						externalLink && p.requireNoFollowFullyQualifiedLinks)

+					addNoReferrer := (p.requireNoReferrer ||
+						externalLink && p.requireNoReferrerFullyQualifiedLinks)
+
 					addTargetBlank := (externalLink &&
 						p.addTargetBlankToFullyQualifiedLinks)

@ -372,18 +552,18 @@ func (p *Policy) sanitizeAttrs(
 					for _, htmlAttr := range cleanAttrs {

 						var appended bool
-						if htmlAttr.Key == "rel" && addNoFollow {
+						if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) {

-							if strings.Contains(htmlAttr.Val, "nofollow") {
-								noFollowFound = true
-								tmpAttrs = append(tmpAttrs, htmlAttr)
-								appended = true
-							} else {
+							if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") {
 								htmlAttr.Val += " nofollow"
-								noFollowFound = true
-								tmpAttrs = append(tmpAttrs, htmlAttr)
-								appended = true
 							}
+							if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") {
+								htmlAttr.Val += " noreferrer"
+							}
+							noFollowFound = addNoFollow
+							noReferrerFound = addNoReferrer
+							tmpAttrs = append(tmpAttrs, htmlAttr)
+							appended = true
 						}

 						if elementName == "a" && htmlAttr.Key == "target" {
@ -402,14 +582,22 @@ func (p *Policy) sanitizeAttrs(
 							tmpAttrs = append(tmpAttrs, htmlAttr)
 						}
 					}
-					if noFollowFound || targetBlankFound {
+					if noFollowFound || noReferrerFound || targetBlankFound {
 						cleanAttrs = tmpAttrs
 					}

-					if addNoFollow && !noFollowFound {
+					if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) {
 						rel := html.Attribute{}
 						rel.Key = "rel"
-						rel.Val = "nofollow"
+						if addNoFollow {
+							rel.Val = "nofollow"
+						}
+						if addNoReferrer {
+							if rel.Val != "" {
+								rel.Val += " "
+							}
+							rel.Val += "noreferrer"
+						}
 						cleanAttrs = append(cleanAttrs, rel)
 					}

@ -479,20 +667,112 @@ func (p *Policy) sanitizeAttrs(
 	return cleanAttrs
 }

+// sanitizeStyles filters the CSS declarations held in attr.Val (the value of
+// a "style" attribute on elementName) and returns attr with Val replaced by
+// the declarations that passed, joined with "; ". Val is set to "" when the
+// declarations cannot be parsed or when none of them pass.
+//
+// A declaration is checked first against the element's style policies and
+// then, if it was not already accepted, against p.globalStyles. A policy
+// accepts a value through its handler, its enum list, or its regexp, tried in
+// that order of precedence.
+func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
+	sps := p.elsAndStyles[elementName]
+	if len(sps) == 0{
+		// Build a fresh map so the merged result never writes into the
+		// policy's own maps.
+		sps = map[string]stylePolicy{}
+		// check for any matching elements, if we don't already have a policy found
+		// if multiple matches are found they will be overwritten, it's best
+		// to not have overlapping matchers
+		for regex, policies :=range p.elsMatchingAndStyles{
+			if regex.MatchString(elementName){
+				for k, v := range policies{
+					sps[k] = v
+				}
+			}
+		}
+	}
+
+	//Add semi-colon to end to fix parsing issue
+	if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' {
+		attr.Val = attr.Val + ";"
+	}
+	decs, err := cssparser.ParseDeclarations(attr.Val)
+	if err != nil {
+		// Unparseable CSS is dropped entirely.
+		attr.Val = ""
+		return attr
+	}
+	clean := []string{}
+	// Vendor prefixes stripped from a property name before the policy lookup.
+	prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}
+
+	for _, dec := range decs {
+		addedProperty := false
+		// Policies are matched against a lowercased property name and a
+		// lowercased value with CSS escapes decoded; the original spelling of
+		// dec.Property and dec.Value is what gets emitted.
+		tempProperty := strings.ToLower(dec.Property)
+		tempValue := removeUnicode(strings.ToLower(dec.Value))
+		for _, i := range prefixes {
+			tempProperty = strings.TrimPrefix(tempProperty, i)
+		}
+		if sp, ok := sps[tempProperty]; ok {
+			if sp.handler != nil {
+				if sp.handler(tempValue) {
+					clean = append(clean, dec.Property+": "+dec.Value)
+					addedProperty = true
+				}
+			} else if len(sp.enum) > 0 {
+				if stringInSlice(tempValue, sp.enum) {
+					clean = append(clean, dec.Property+": "+dec.Value)
+					addedProperty = true
+				}
+			} else if sp.regexp != nil {
+				if sp.regexp.MatchString(tempValue) {
+					clean = append(clean, dec.Property+": "+dec.Value)
+					addedProperty = true
+				}
+				// NOTE(review): this continue runs whether or not the regexp
+				// matched, so a regexp-based element policy never falls
+				// through to the global style check below, whereas a failed
+				// handler or enum policy does — confirm this asymmetry is
+				// intended.
+				continue
+			}
+		}
+		// Fall back to the global style policies when the element-specific
+		// policy did not accept the declaration.
+		if sp, ok := p.globalStyles[tempProperty]; ok && !addedProperty {
+			if sp.handler != nil {
+				if sp.handler(tempValue) {
+					clean = append(clean, dec.Property+": "+dec.Value)
+				}
+			} else if len(sp.enum) > 0 {
+				if stringInSlice(tempValue, sp.enum) {
+					clean = append(clean, dec.Property+": "+dec.Value)
+				}
+			} else if sp.regexp != nil {
+				if sp.regexp.MatchString(tempValue) {
+					clean = append(clean, dec.Property+": "+dec.Value)
+				}
+				continue
+			}
+		}
+	}
+	if len(clean) > 0 {
+		attr.Val = strings.Join(clean, "; ")
+	} else {
+		attr.Val = ""
+	}
+	return attr
+}
+
+// allowNoAttrs reports whether elementName is listed as allowed without
+// attributes: either it is a key of p.setOfElementsAllowedWithoutAttrs or it
+// matches one of the patterns in p.setOfElementsMatchingAllowedWithoutAttrs.
 func (p *Policy) allowNoAttrs(elementName string) bool {
 	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
+	if !ok {
+		// No exact entry; fall back to the pattern-based allow list and stop
+		// at the first pattern that matches.
+		for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs {
+			if r.MatchString(elementName) {
+				ok = true
+				break
+			}
+		}
+	}
 	return ok
 }

 func (p *Policy) validURL(rawurl string) (string, bool) {
 	if p.requireParseableURLs {
-		// URLs do not contain whitespace
-		if strings.Contains(rawurl, " ") ||
+		// URLs are valid if when space is trimmed the URL is valid
+		rawurl = strings.TrimSpace(rawurl)
+
+		// URLs cannot contain whitespace, unless it is a data-uri
+		if (strings.Contains(rawurl, " ") ||
 			strings.Contains(rawurl, "\t") ||
-			strings.Contains(rawurl, "\n") {
+			strings.Contains(rawurl, "\n")) &&
+			!strings.HasPrefix(rawurl, `data:`) {
 			return "", false
 		}

+		// URLs are valid if they parse
 		u, err := url.Parse(rawurl)
 		if err != nil {
 			return "", false
@ -533,3 +813,77 @@ func linkable(elementName string) bool {
 		return false
 	}
 }
+
+// stringInSlice reports whether needle equals any element of haystack. The
+// comparison is case-insensitive: both sides are lowercased with
+// strings.ToLower before being compared. An empty or nil haystack yields
+// false.
+func stringInSlice(needle string, haystack []string) bool {
+	// Lowercase the needle once instead of on every iteration.
+	lowered := strings.ToLower(needle)
+	for _, straw := range haystack {
+		if strings.ToLower(straw) == lowered {
+			return true
+		}
+	}
+	return false
+}
+
+// isDataAttribute reports whether val is an acceptable data-* attribute name:
+// it must match dataAttribute (start with "data-" and have at least one more
+// character), and the text after that prefix must match neither
+// dataAttributeXMLPrefix nor dataAttributeInvalidChars.
+func isDataAttribute(val string) bool {
+	if !dataAttribute.MatchString(val) {
+		return false
+	}
+	// Validate everything after the leading "data-". Splitting on every
+	// occurrence of "data-" would only inspect the text up to the next
+	// "data-", letting a name such as "data-a-data-;x" slip past the
+	// invalid-character check.
+	rest := strings.TrimPrefix(val, "data-")
+	// data-xml* is invalid.
+	if dataAttributeXMLPrefix.MatchString(rest) {
+		return false
+	}
+	// no uppercase or semi-colons allowed.
+	if dataAttributeInvalidChars.MatchString(rest) {
+		return false
+	}
+	return true
+}
+
+// removeUnicode replaces every CSS escape sequence in value that matches
+// cssUnicodeChar with the character it denotes and returns the result. It
+// returns "" if any sequence cannot be decoded.
+//
+// After each substitution the search restarts from the beginning of the
+// string, so text produced by decoding is itself scanned again.
+func removeUnicode(value string) string {
+	substitutedValue := value
+	currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue)
+	for currentLoc != nil {
+
+		// The hex digits of the match, without the leading backslash and
+		// without the optional trailing space.
+		character := substitutedValue[currentLoc[0]+1 : currentLoc[1]]
+		character = strings.TrimSpace(character)
+		// Normalize to exactly four hex digits so it can be written as \uXXXX.
+		if len(character) < 4 {
+			character = strings.Repeat("0", 4-len(character)) + character
+		} else {
+			for len(character) > 4 {
+				if character[0] != '0' {
+					// More than four significant digits: blank the digits so
+					// the Unquote call below fails and "" is returned.
+					character = ""
+					break
+				} else {
+					character = character[1:]
+				}
+			}
+		}
+		character = "\\u" + character
+		translatedChar, err := strconv.Unquote(`"` + character + `"`)
+		translatedChar = strings.TrimSpace(translatedChar)
+		if err != nil {
+			return ""
+		}
+		// Splice the decoded (and whitespace-trimmed) character in place of
+		// the escape sequence, then look for the next one.
+		substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:]
+		currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue)
+	}
+	return substitutedValue
+}
+
+// matchRegex gathers the attribute policies of every pattern in
+// p.elsMatchingAndAttrs that matches elementName. The returned map is always
+// non-nil; the boolean is true when at least one pattern matched. When several
+// patterns match and define the same attribute, the policy kept is the one
+// from whichever pattern the map iteration visits last.
+func (p *Policy) matchRegex(elementName string) (map[string]attrPolicy, bool) {
+	merged := make(map[string]attrPolicy, 0)
+	found := false
+	for pattern, policies := range p.elsMatchingAndAttrs {
+		if !pattern.MatchString(elementName) {
+			continue
+		}
+		found = true
+		for name, policy := range policies {
+			merged[name] = policy
+		}
+	}
+	return merged, found
+}