upgrade to most recent bluemonday (#11007)

* upgrade to most recent bluemonday

* make vendor

* update tests for bluemonday

* update tests for bluemonday

* update tests for bluemonday
This commit is contained in:
techknowlogick 2020-04-07 16:08:47 -04:00 committed by GitHub
parent 4c54477bb5
commit d00ebf445b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
50 changed files with 4977 additions and 300 deletions

View file

@ -33,9 +33,20 @@ import (
"bytes"
"io"
"net/url"
"regexp"
"strconv"
"strings"
"golang.org/x/net/html"
cssparser "github.com/chris-ramon/douceur/parser"
)
var (
dataAttribute = regexp.MustCompile("^data-.+")
dataAttributeXMLPrefix = regexp.MustCompile("^xml.+")
dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
cssUnicodeChar = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`)
)
// Sanitize takes a string that contains a HTML fragment or document and applies
@ -75,6 +86,98 @@ func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
return p.sanitize(r)
}
const escapedURLChars = "'<>\"\r"
func escapeUrlComponent(val string) string {
w := bytes.NewBufferString("")
i := strings.IndexAny(val, escapedURLChars)
for i != -1 {
if _, err := w.WriteString(val[:i]); err != nil {
return w.String()
}
var esc string
switch val[i] {
case '\'':
// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
esc = "&#39;"
case '<':
esc = "&lt;"
case '>':
esc = "&gt;"
case '"':
// "&#34;" is shorter than "&quot;".
esc = "&#34;"
case '\r':
esc = "&#13;"
default:
panic("unrecognized escape character")
}
val = val[i+1:]
if _, err := w.WriteString(esc); err != nil {
return w.String()
}
i = strings.IndexAny(val, escapedURLChars)
}
w.WriteString(val)
return w.String()
}
func sanitizedUrl(val string) (string, error) {
u, err := url.Parse(val)
if err != nil {
return "", err
}
// sanitize the url query params
sanitizedQueryValues := make(url.Values, 0)
queryValues := u.Query()
for k, vals := range queryValues {
sk := html.EscapeString(k)
for _, v := range vals {
sv := escapeUrlComponent(v)
sanitizedQueryValues.Set(sk, sv)
}
}
u.RawQuery = sanitizedQueryValues.Encode()
// u.String() will also sanitize host/scheme/user/pass
return u.String(), nil
}
func (p *Policy) writeLinkableBuf(buff *bytes.Buffer, token *html.Token) {
// do not escape multiple query parameters
tokenBuff := bytes.NewBufferString("")
tokenBuff.WriteString("<")
tokenBuff.WriteString(token.Data)
for _, attr := range token.Attr {
tokenBuff.WriteByte(' ')
tokenBuff.WriteString(attr.Key)
tokenBuff.WriteString(`="`)
switch attr.Key {
case "href", "src":
u, ok := p.validURL(attr.Val)
if !ok {
tokenBuff.WriteString(html.EscapeString(attr.Val))
continue
}
u, err := sanitizedUrl(u)
if err == nil {
tokenBuff.WriteString(u)
} else {
// fallthrough
tokenBuff.WriteString(html.EscapeString(attr.Val))
}
default:
// re-apply
tokenBuff.WriteString(html.EscapeString(attr.Val))
}
tokenBuff.WriteByte('"')
}
if token.Type == html.SelfClosingTagToken {
tokenBuff.WriteString("/")
}
tokenBuff.WriteString(">")
buff.WriteString(tokenBuff.String())
}
// Performs the actual sanitization process.
func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
@ -112,9 +215,13 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
switch token.Type {
case html.DoctypeToken:
if p.allowDocType {
buff.WriteString(token.String())
}
// DocType is not handled as there is no safe parsing mechanism
// provided by golang.org/x/net/html for the content, and this can
// be misused to insert HTML tags that are not then sanitized
//
// One might wish to recursively sanitize here using the same policy
// but I will need to do some further testing before considering
// this.
case html.CommentToken:
@ -122,20 +229,23 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
case html.StartTagToken:
mostRecentlyStartedToken = token.Data
mostRecentlyStartedToken = strings.ToLower(token.Data)
aps, ok := p.elsAndAttrs[token.Data]
if !ok {
if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
skipElementContent = true
skippingElementsCount++
aa, matched := p.matchRegex(token.Data)
if !matched {
if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
skipElementContent = true
skippingElementsCount++
}
if p.addSpaces {
buff.WriteString(" ")
}
break
}
if p.addSpaces {
buff.WriteString(" ")
}
break
aps = aa
}
if len(token.Attr) != 0 {
token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
}
@ -152,11 +262,20 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
}
if !skipElementContent {
buff.WriteString(token.String())
// do not escape multiple query parameters
if linkable(token.Data) {
p.writeLinkableBuf(&buff, &token)
} else {
buff.WriteString(token.String())
}
}
case html.EndTagToken:
if mostRecentlyStartedToken == strings.ToLower(token.Data) {
mostRecentlyStartedToken = ""
}
if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
if len(closingTagToSkipStack) == 0 {
@ -167,18 +286,27 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
}
break
}
if _, ok := p.elsAndAttrs[token.Data]; !ok {
if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
match := false
for regex := range p.elsMatchingAndAttrs {
if regex.MatchString(token.Data) {
skipElementContent = false
match = true
break
}
}
if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match {
skippingElementsCount--
if skippingElementsCount == 0 {
skipElementContent = false
}
}
if p.addSpaces {
buff.WriteString(" ")
if !match {
if p.addSpaces {
buff.WriteString(" ")
}
break
}
break
}
if !skipElementContent {
@ -189,10 +317,14 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
aps, ok := p.elsAndAttrs[token.Data]
if !ok {
if p.addSpaces {
buff.WriteString(" ")
aa, matched := p.matchRegex(token.Data)
if !matched {
if p.addSpaces && !matched {
buff.WriteString(" ")
}
break
}
break
aps = aa
}
if len(token.Attr) != 0 {
@ -202,19 +334,23 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
if p.addSpaces {
buff.WriteString(" ")
break
}
break
}
if !skipElementContent {
buff.WriteString(token.String())
// do not escape multiple query parameters
if linkable(token.Data) {
p.writeLinkableBuf(&buff, &token)
} else {
buff.WriteString(token.String())
}
}
case html.TextToken:
if !skipElementContent {
switch strings.ToLower(mostRecentlyStartedToken) {
case "javascript":
switch mostRecentlyStartedToken {
case "script":
// not encouraged, but if a policy allows JavaScript we
// should not HTML escape it as that would break the output
buff.WriteString(token.Data)
@ -248,10 +384,47 @@ func (p *Policy) sanitizeAttrs(
return attrs
}
hasStylePolicies := false
sps, elementHasStylePolicies := p.elsAndStyles[elementName]
if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) {
hasStylePolicies = true
}
// no specific element policy found, look for a pattern match
if !hasStylePolicies{
for k, v := range p.elsMatchingAndStyles{
if k.MatchString(elementName) {
if len(v) > 0{
hasStylePolicies = true
break
}
}
}
}
// Builds a new attribute slice based on the whether the attribute has been
// whitelisted explicitly or globally.
cleanAttrs := []html.Attribute{}
for _, htmlAttr := range attrs {
if p.allowDataAttributes {
// If we see a data attribute, let it through.
if isDataAttribute(htmlAttr.Key) {
cleanAttrs = append(cleanAttrs, htmlAttr)
continue
}
}
// Is this a "style" attribute, and if so, do we need to sanitize it?
if htmlAttr.Key == "style" && hasStylePolicies {
htmlAttr = p.sanitizeStyles(htmlAttr, elementName)
if htmlAttr.Val == "" {
// We've sanitized away any and all styles; don't bother to
// output the style attribute (even if it's allowed)
continue
} else {
cleanAttrs = append(cleanAttrs, htmlAttr)
continue
}
}
// Is there an element specific attribute policy that applies?
if ap, ok := aps[htmlAttr.Key]; ok {
if ap.regexp != nil {
@ -267,6 +440,7 @@ func (p *Policy) sanitizeAttrs(
// Is there a global attribute policy that applies?
if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
if ap.regexp != nil {
if ap.regexp.MatchString(htmlAttr.Val) {
cleanAttrs = append(cleanAttrs, htmlAttr)
@ -332,6 +506,8 @@ func (p *Policy) sanitizeAttrs(
if (p.requireNoFollow ||
p.requireNoFollowFullyQualifiedLinks ||
p.requireNoReferrer ||
p.requireNoReferrerFullyQualifiedLinks ||
p.addTargetBlankToFullyQualifiedLinks) &&
len(cleanAttrs) > 0 {
@ -359,12 +535,16 @@ func (p *Policy) sanitizeAttrs(
if hrefFound {
var (
noFollowFound bool
noReferrerFound bool
targetBlankFound bool
)
addNoFollow := (p.requireNoFollow ||
externalLink && p.requireNoFollowFullyQualifiedLinks)
addNoReferrer := (p.requireNoReferrer ||
externalLink && p.requireNoReferrerFullyQualifiedLinks)
addTargetBlank := (externalLink &&
p.addTargetBlankToFullyQualifiedLinks)
@ -372,18 +552,18 @@ func (p *Policy) sanitizeAttrs(
for _, htmlAttr := range cleanAttrs {
var appended bool
if htmlAttr.Key == "rel" && addNoFollow {
if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) {
if strings.Contains(htmlAttr.Val, "nofollow") {
noFollowFound = true
tmpAttrs = append(tmpAttrs, htmlAttr)
appended = true
} else {
if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") {
htmlAttr.Val += " nofollow"
noFollowFound = true
tmpAttrs = append(tmpAttrs, htmlAttr)
appended = true
}
if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") {
htmlAttr.Val += " noreferrer"
}
noFollowFound = addNoFollow
noReferrerFound = addNoReferrer
tmpAttrs = append(tmpAttrs, htmlAttr)
appended = true
}
if elementName == "a" && htmlAttr.Key == "target" {
@ -402,14 +582,22 @@ func (p *Policy) sanitizeAttrs(
tmpAttrs = append(tmpAttrs, htmlAttr)
}
}
if noFollowFound || targetBlankFound {
if noFollowFound || noReferrerFound || targetBlankFound {
cleanAttrs = tmpAttrs
}
if addNoFollow && !noFollowFound {
if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) {
rel := html.Attribute{}
rel.Key = "rel"
rel.Val = "nofollow"
if addNoFollow {
rel.Val = "nofollow"
}
if addNoReferrer {
if rel.Val != "" {
rel.Val += " "
}
rel.Val += "noreferrer"
}
cleanAttrs = append(cleanAttrs, rel)
}
@ -479,20 +667,112 @@ func (p *Policy) sanitizeAttrs(
return cleanAttrs
}
func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
sps := p.elsAndStyles[elementName]
if len(sps) == 0{
sps = map[string]stylePolicy{}
// check for any matching elements, if we don't already have a policy found
// if multiple matches are found they will be overwritten, it's best
// to not have overlapping matchers
for regex, policies :=range p.elsMatchingAndStyles{
if regex.MatchString(elementName){
for k, v := range policies{
sps[k] = v
}
}
}
}
//Add semi-colon to end to fix parsing issue
if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' {
attr.Val = attr.Val + ";"
}
decs, err := cssparser.ParseDeclarations(attr.Val)
if err != nil {
attr.Val = ""
return attr
}
clean := []string{}
prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}
for _, dec := range decs {
addedProperty := false
tempProperty := strings.ToLower(dec.Property)
tempValue := removeUnicode(strings.ToLower(dec.Value))
for _, i := range prefixes {
tempProperty = strings.TrimPrefix(tempProperty, i)
}
if sp, ok := sps[tempProperty]; ok {
if sp.handler != nil {
if sp.handler(tempValue) {
clean = append(clean, dec.Property+": "+dec.Value)
addedProperty = true
}
} else if len(sp.enum) > 0 {
if stringInSlice(tempValue, sp.enum) {
clean = append(clean, dec.Property+": "+dec.Value)
addedProperty = true
}
} else if sp.regexp != nil {
if sp.regexp.MatchString(tempValue) {
clean = append(clean, dec.Property+": "+dec.Value)
addedProperty = true
}
continue
}
}
if sp, ok := p.globalStyles[tempProperty]; ok && !addedProperty {
if sp.handler != nil {
if sp.handler(tempValue) {
clean = append(clean, dec.Property+": "+dec.Value)
}
} else if len(sp.enum) > 0 {
if stringInSlice(tempValue, sp.enum) {
clean = append(clean, dec.Property+": "+dec.Value)
}
} else if sp.regexp != nil {
if sp.regexp.MatchString(tempValue) {
clean = append(clean, dec.Property+": "+dec.Value)
}
continue
}
}
}
if len(clean) > 0 {
attr.Val = strings.Join(clean, "; ")
} else {
attr.Val = ""
}
return attr
}
func (p *Policy) allowNoAttrs(elementName string) bool {
_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
if !ok {
for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs {
if r.MatchString(elementName) {
ok = true
break
}
}
}
return ok
}
func (p *Policy) validURL(rawurl string) (string, bool) {
if p.requireParseableURLs {
// URLs do not contain whitespace
if strings.Contains(rawurl, " ") ||
// URLs are valid if when space is trimmed the URL is valid
rawurl = strings.TrimSpace(rawurl)
// URLs cannot contain whitespace, unless it is a data-uri
if (strings.Contains(rawurl, " ") ||
strings.Contains(rawurl, "\t") ||
strings.Contains(rawurl, "\n") {
strings.Contains(rawurl, "\n")) &&
!strings.HasPrefix(rawurl, `data:`) {
return "", false
}
// URLs are valid if they parse
u, err := url.Parse(rawurl)
if err != nil {
return "", false
@ -533,3 +813,77 @@ func linkable(elementName string) bool {
return false
}
}
// stringInSlice returns true if needle exists in haystack
func stringInSlice(needle string, haystack []string) bool {
for _, straw := range haystack {
if strings.ToLower(straw) == strings.ToLower(needle) {
return true
}
}
return false
}
func isDataAttribute(val string) bool {
if !dataAttribute.MatchString(val) {
return false
}
rest := strings.Split(val, "data-")
if len(rest) == 1 {
return false
}
// data-xml* is invalid.
if dataAttributeXMLPrefix.MatchString(rest[1]) {
return false
}
// no uppercase or semi-colons allowed.
if dataAttributeInvalidChars.MatchString(rest[1]) {
return false
}
return true
}
func removeUnicode(value string) string {
substitutedValue := value
currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue)
for currentLoc != nil {
character := substitutedValue[currentLoc[0]+1 : currentLoc[1]]
character = strings.TrimSpace(character)
if len(character) < 4 {
character = strings.Repeat("0", 4-len(character)) + character
} else {
for len(character) > 4 {
if character[0] != '0' {
character = ""
break
} else {
character = character[1:]
}
}
}
character = "\\u" + character
translatedChar, err := strconv.Unquote(`"` + character + `"`)
translatedChar = strings.TrimSpace(translatedChar)
if err != nil {
return ""
}
substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:]
currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue)
}
return substitutedValue
}
func (p *Policy) matchRegex(elementName string ) (map[string]attrPolicy, bool) {
aps := make(map[string]attrPolicy, 0)
matched := false
for regex, attrs := range p.elsMatchingAndAttrs {
if regex.MatchString(elementName) {
matched = true
for k, v := range attrs {
aps[k] = v
}
}
}
return aps, matched
}