// Copyright (c) 2014, David Kitchen <david@buro9.com>
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
//   list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// * Neither the name of the organisation (Microcosm) nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package bluemonday

import (
	"bytes"
	"io"
	"net/url"
	"strings"

	"golang.org/x/net/html"
)

// Sanitize takes a string that contains an HTML fragment or document and
// applies the given policy whitelist.
//
// It returns an HTML string that has been sanitized by the policy, or an empty
// string if an error has occurred (most likely as a consequence of extremely
// malformed input).
func (p *Policy) Sanitize(s string) string {
	if strings.TrimSpace(s) == "" {
		return s
	}

	return p.sanitize(strings.NewReader(s)).String()
}
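
// A usage sketch (UGCPolicy is one of the package's policy constructors; the
// output shown here is illustrative rather than normative):
//
//	p := bluemonday.UGCPolicy()
//	out := p.Sanitize(`<a onblur="alert(secret)" href="http://www.google.com">Google</a>`)
//	// out: <a href="http://www.google.com" rel="nofollow">Google</a>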

// SanitizeBytes takes a []byte that contains an HTML fragment or document and
// applies the given policy whitelist.
//
// It returns a []byte containing the HTML that has been sanitized by the policy,
// or an empty []byte if an error has occurred (most likely as a consequence of
// extremely malformed input).
func (p *Policy) SanitizeBytes(b []byte) []byte {
	if len(bytes.TrimSpace(b)) == 0 {
		return b
	}

	return p.sanitize(bytes.NewReader(b)).Bytes()
}

// SanitizeReader takes an io.Reader that contains an HTML fragment or document
// and applies the given policy whitelist.
//
// It returns a bytes.Buffer containing the HTML that has been sanitized by the
// policy. Errors during sanitization will merely return an empty result.
func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
	return p.sanitize(r)
}
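
// A usage sketch (the file name and the use of os.Open are hypothetical; any
// io.Reader can be passed):
//
//	f, err := os.Open("comment.html")
//	if err != nil {
//		// handle the error
//	}
//	defer f.Close()
//	buf := p.SanitizeReader(f)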

// Performs the actual sanitization process.
func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	var (
		buff                     bytes.Buffer
		skipElementContent       bool
		skippingElementsCount    int64
		skipClosingTag           bool
		closingTagToSkipStack    []string
		mostRecentlyStartedToken string
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return &buff
			}

			// Raw tokenizer error
			return &bytes.Buffer{}
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:

			if p.allowDocType {
				buff.WriteString(token.String())
			}

		case html.CommentToken:

			// Comments are ignored by default

		case html.StartTagToken:

			mostRecentlyStartedToken = token.Data

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skipElementContent = true
					skippingElementsCount++
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						buff.WriteString(" ")
					}
					break
				}
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.EndTagToken:

			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.SelfClosingTagToken:

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.TextToken:

			if !skipElementContent {
				switch strings.ToLower(mostRecentlyStartedToken) {
				case "script":
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				case "style":
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				default:
					// HTML escape the text
					buff.WriteString(token.String())
				}
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return &bytes.Buffer{}
		}
	}
}
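
// An illustrative note on the space handling above (AddSpaceWhenStrippingTag
// is the policy option that sets addSpaces; this is a sketch, not a test case):
//
//	p.Sanitize("foo<div>bar</div>baz")
//	// "foobarbaz"   when div is not allowed and addSpaces is false
//	// "foo bar baz" when div is not allowed and addSpaces is true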

// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute, returning a set
// of html.Attributes that match the policies.
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	// Builds a new attribute slice based on whether the attribute has been
	// whitelisted explicitly or globally.
	cleanAttrs := []html.Attribute{}
	for _, htmlAttr := range attrs {
		// Is there an element specific attribute policy that applies?
		if ap, ok := aps[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}

		// Is there a global attribute policy that applies?
		if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "link":
					if htmlAttr.Key == "href" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "img", "script":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "link":
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && addNoFollow {

							if strings.Contains(htmlAttr.Val, "nofollow") {
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							} else {
								htmlAttr.Val += " nofollow"
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					if addNoFollow && !noFollowFound {
						rel := html.Attribute{}
						rel.Key = "rel"
						rel.Val = "nofollow"
						cleanAttrs = append(cleanAttrs, rel)
					}

					if elementName == "a" && addTargetBlank && !targetBlankFound {
						rel := html.Attribute{}
						rel.Key = "target"
						rel.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, rel)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allows the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present.
						// rel="noopener"
						//
						// Unfortunately this is processing the rel twice (we
						// already looked at it earlier ^^) as we cannot be sure
						// of the ordering of the href and rel, and whether we
						// have fully satisfied that we need to do this. This
						// double processing only happens *if* target="_blank"
						// is true.
						var noOpenerAdded bool
						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								if strings.Contains(htmlAttr.Val, "noopener") {
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								} else {
									htmlAttr.Val += " noopener"
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								}

								appended = true
							}
							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}

					}
				}
			default:
			}
		}
	}

	return cleanAttrs
}
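
// An illustrative sketch of how element-level and global attribute policies
// combine (AllowAttrs, OnElements and Globally are the package's policy
// builders; the resulting markup is shown only as an approximation):
//
//	p := bluemonday.NewPolicy()
//	p.AllowAttrs("href").OnElements("a") // element-specific attribute policy
//	p.AllowAttrs("class").Globally()     // global attribute policy
//	// <a href="/x" class="c" style="color:red"> → <a href="/x" class="c">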

func (p *Policy) allowNoAttrs(elementName string) bool {
	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
	return ok
}

func (p *Policy) validURL(rawurl string) (string, bool) {
	if p.requireParseableURLs {
		// URLs do not contain whitespace
		if strings.Contains(rawurl, " ") ||
			strings.Contains(rawurl, "\t") ||
			strings.Contains(rawurl, "\n") {
			return "", false
		}

		u, err := url.Parse(rawurl)
		if err != nil {
			return "", false
		}

		if u.Scheme != "" {

			urlPolicy, ok := p.allowURLSchemes[u.Scheme]
			if !ok {
				return "", false
			}

			if urlPolicy == nil || urlPolicy(u) {
				return u.String(), true
			}

			return "", false
		}

		if p.allowRelativeURLs {
			if u.String() != "" {
				return u.String(), true
			}
		}

		return "", false
	}

	return rawurl, true
}
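
// An illustrative sketch of validURL behaviour (assuming a policy configured
// via AllowStandardURLs, which requires parseable URLs, allows relative URLs
// and whitelists the mailto, http and https schemes):
//
//	p.validURL("https://example.org/") // "https://example.org/", true
//	p.validURL("javascript:alert(1)")  // "", false (scheme not whitelisted)
//	p.validURL("/relative/path")       // "/relative/path", true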

func linkable(elementName string) bool {
	switch elementName {
	case "a", "area", "blockquote", "img", "link", "script":
		return true
	default:
		return false
	}
}