// Copyright (c) 2014, David Kitchen <david@buro9.com>
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
//   list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// * Neither the name of the organisation (Microcosm) nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package bluemonday

import (
	"bytes"
	"io"
	"net/url"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

var (
	// dataAttribute matches attribute names that begin with "data-"
	dataAttribute = regexp.MustCompile("^data-.+")
	// dataAttributeXMLPrefix matches a "data-" suffix that begins with "xml"
	dataAttributeXMLPrefix = regexp.MustCompile("^xml.+")
	// dataAttributeInvalidChars matches uppercase letters and semicolons
	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
)

// Sanitize takes a string that contains an HTML fragment or document and
// applies the given policy whitelist.
//
// It returns an HTML string that has been sanitized by the policy, or an empty
// string if an error has occurred (most likely as a consequence of extremely
// malformed input).
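//
// A minimal usage sketch (UGCPolicy is one of this package's built-in policy
// constructors; the exact output depends on the policy you build):
//
//	p := bluemonday.UGCPolicy()
//	safe := p.Sanitize(`<a onblur="evil()" href="https://example.com/">example</a>`)
//	// the onblur handler is dropped; href survives and UGCPolicy appends rel="nofollow"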
func (p *Policy) Sanitize(s string) string {
	if strings.TrimSpace(s) == "" {
		return s
	}

	return p.sanitize(strings.NewReader(s)).String()
}

// SanitizeBytes takes a []byte that contains an HTML fragment or document and
// applies the given policy whitelist.
//
// It returns a []byte containing the HTML that has been sanitized by the
// policy, or an empty []byte if an error has occurred (most likely as a
// consequence of extremely malformed input).
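//
// A sketch of equivalent usage on byte slices (assuming a policy such as
// UGCPolicy that allows img elements):
//
//	safe := p.SanitizeBytes([]byte(`<img src="logo.png" onerror="evil()">`))
//	// the onerror handler is dropped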
func (p *Policy) SanitizeBytes(b []byte) []byte {
	if len(bytes.TrimSpace(b)) == 0 {
		return b
	}

	return p.sanitize(bytes.NewReader(b)).Bytes()
}

// SanitizeReader takes an io.Reader that contains an HTML fragment or document
// and applies the given policy whitelist.
//
// It returns a bytes.Buffer containing the HTML that has been sanitized by the
// policy. Errors during sanitization merely result in an empty buffer.
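//
// A sketch of streaming use, for example with a file (error handling omitted;
// the file name is illustrative):
//
//	f, _ := os.Open("comment.html")
//	defer f.Close()
//	fmt.Println(p.SanitizeReader(f).String())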
func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
	return p.sanitize(r)
}

// Performs the actual sanitization process.
func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	var (
		buff                     bytes.Buffer
		skipElementContent       bool
		skippingElementsCount    int64
		skipClosingTag           bool
		closingTagToSkipStack    []string
		mostRecentlyStartedToken string
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return &buff
			}

			// Raw tokenizer error
			return &bytes.Buffer{}
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:

			// DocType is not handled as there is no safe parsing mechanism
			// provided by golang.org/x/net/html for the content, and this can
			// be misused to insert HTML tags that are not then sanitized
			//
			// One might wish to recursively sanitize here using the same policy
			// but I will need to do some further testing before considering
			// this.

		case html.CommentToken:

			// Comments are ignored by default

		case html.StartTagToken:

			mostRecentlyStartedToken = token.Data

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skipElementContent = true
					skippingElementsCount++
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						buff.WriteString(" ")
					}
					break
				}
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.EndTagToken:

			if mostRecentlyStartedToken == token.Data {
				mostRecentlyStartedToken = ""
			}

			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.SelfClosingTagToken:

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.TextToken:

			if !skipElementContent {
				switch mostRecentlyStartedToken {
				case "script":
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				case "style":
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				default:
					// HTML escape the text
					buff.WriteString(token.String())
				}
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return &bytes.Buffer{}
		}
	}
}

// sanitizeAttrs takes a set of element-specific attribute policies and the
// global attribute policies, applies them to the []html.Attribute, and returns
// only the attributes that match the policies.
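//
// For example, under a policy built with AllowAttrs("href").OnElements("a")
// (the public builder API), an "a" element carrying both href and onclick
// attributes comes back with only the href attribute.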
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	// Builds a new attribute slice based on whether the attribute has been
	// whitelisted explicitly or globally.
	cleanAttrs := []html.Attribute{}
	for _, htmlAttr := range attrs {
		if p.allowDataAttributes {
			// If we see a data attribute, let it through.
			if isDataAttribute(htmlAttr.Key) {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}
		// Is there an element specific attribute policy that applies?
		if ap, ok := aps[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}

		// Is there a global attribute policy that applies?
		if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "link":
					if htmlAttr.Key == "href" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "img", "script":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "link":
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && addNoFollow {

							if strings.Contains(htmlAttr.Val, "nofollow") {
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							} else {
								htmlAttr.Val += " nofollow"
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					if addNoFollow && !noFollowFound {
						rel := html.Attribute{}
						rel.Key = "rel"
						rel.Val = "nofollow"
						cleanAttrs = append(cleanAttrs, rel)
					}

					if elementName == "a" && addTargetBlank && !targetBlankFound {
						rel := html.Attribute{}
						rel.Key = "target"
						rel.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, rel)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allows the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present.
						// rel="noopener"
						//
						// Unfortunately this is processing the rel twice (we
						// already looked at it earlier ^^) as we cannot be sure
						// of the ordering of the href and rel, and whether we
						// have fully established that we need to do this. This
						// double processing only happens *if* target="_blank"
						// is true.
						var noOpenerAdded bool
						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								if strings.Contains(htmlAttr.Val, "noopener") {
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								} else {
									htmlAttr.Val += " noopener"
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								}

								appended = true
							}

							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}
					}
				}
			default:
			}
		}
	}

	return cleanAttrs
}

// allowNoAttrs reports whether the policy permits elementName to be rendered
// even when it ends up with no attributes.
func (p *Policy) allowNoAttrs(elementName string) bool {
	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
	return ok
}

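// validURL applies the policy's URL rules to a raw attribute value. When
// requireParseableURLs is set, the value is trimmed of surrounding whitespace,
// rejected if it contains internal whitespace (unless it is a data-uri),
// required to parse as a URL, and then checked against the allowed URL schemes
// (relative URLs are accepted only when allowRelativeURLs is set). It returns
// the URL to use and whether the value was acceptable. When
// requireParseableURLs is not set, the value is returned unchanged.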
func (p *Policy) validURL(rawurl string) (string, bool) {
	if p.requireParseableURLs {
		// URLs are valid if, once surrounding space is trimmed, the URL is valid
		rawurl = strings.TrimSpace(rawurl)

		// URLs cannot contain whitespace, unless it is a data-uri
		if (strings.Contains(rawurl, " ") ||
			strings.Contains(rawurl, "\t") ||
			strings.Contains(rawurl, "\n")) &&
			!strings.HasPrefix(rawurl, `data:`) {
			return "", false
		}

		// URLs are valid if they parse
		u, err := url.Parse(rawurl)
		if err != nil {
			return "", false
		}

		if u.Scheme != "" {
			urlPolicy, ok := p.allowURLSchemes[u.Scheme]
			if !ok {
				return "", false
			}

			if urlPolicy == nil || urlPolicy(u) {
				return u.String(), true
			}

			return "", false
		}

		if p.allowRelativeURLs {
			if u.String() != "" {
				return u.String(), true
			}
		}

		return "", false
	}

	return rawurl, true
}

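// linkable reports whether elementName is one of the elements this file treats
// as carrying URL attributes (href, cite or src) and therefore subject to the
// URL and rel/target handling in sanitizeAttrs.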
func linkable(elementName string) bool {
	switch elementName {
	case "a", "area", "blockquote", "img", "link", "script":
		return true
	default:
		return false
	}
}

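// isDataAttribute reports whether an attribute key is an acceptable HTML5
// custom data attribute name: it must begin with "data-", the remainder must
// not begin with "xml", and it must not contain uppercase letters or
// semicolons. For instance (illustrative keys), "data-user-id" passes while
// "data-XML" and "data-foo;bar" do not.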
func isDataAttribute(val string) bool {
	if !dataAttribute.MatchString(val) {
		return false
	}
	rest := strings.Split(val, "data-")
	if len(rest) == 1 {
		return false
	}
	// data-xml* is invalid.
	if dataAttributeXMLPrefix.MatchString(rest[1]) {
		return false
	}
	// no uppercase or semi-colons allowed.
	if dataAttributeInvalidChars.MatchString(rest[1]) {
		return false
	}
	return true
}