[IMP] html_sanitize: now uses the builtin cleaner of lxml.html. We also allows style attribute; the cleaner ensures no javascript or malicious code goes through those attributes. Updated and added tests for the sanitizer, especially about malicious code, and EDI-like html code.

bzr revid: tde@openerp.com-20121226162505-8rq060p375z1k9il
2012-12-26 17:25:05 +01:00 · 2012-12-26 17:25:05 +01:00 · d7185be38c
parent 980e61f7f6
commit d7185be38c
2 changed files with 118 additions and 71 deletions
--- a/openerp/tests/test_mail.py
+++ b/openerp/tests/test_mail.py
@ -43,6 +43,47 @@ test12</font></div><div><font color="#1f1f1f" face="monospace" size="2"><br></fo
 <a href="javascript:alert('malicious code')">test link</a>
 """

+EDI_LIKE_HTML_SOURCE = """<div style="font-family: 'Lucica Grande', Ubuntu, Arial, Verdana, sans-serif; font-size: 12px; color: rgb(34, 34, 34); background-color: #FFF; ">
+    <p>Hello ${object.partner_id.name},</p>
+    <p>A new invoice is available for you: </p>
+    <p style="border-left: 1px solid #8e0000; margin-left: 30px;">
+       &nbsp;&nbsp;<strong>REFERENCES</strong><br />
+       &nbsp;&nbsp;Invoice number: <strong>${object.number}</strong><br />
+       &nbsp;&nbsp;Invoice total: <strong>${object.amount_total} ${object.currency_id.name}</strong><br />
+       &nbsp;&nbsp;Invoice date: ${object.date_invoice}<br />
+       &nbsp;&nbsp;Order reference: ${object.origin}<br />
+       &nbsp;&nbsp;Your contact: <a href="mailto:${object.user_id.email or ''}?subject=Invoice%20${object.number}">${object.user_id.name}</a>
+    </p>
+    <br/>
+    <p>It is also possible to directly pay with Paypal:</p>
+    <a style="margin-left: 120px;" href="${object.paypal_url}">
+        <img class="oe_edi_paypal_button" src="https://www.paypal.com/en_US/i/btn/btn_paynowCC_LG.gif"/>
+    </a>
+    <br/>
+    <p>If you have any question, do not hesitate to contact us.</p>
+    <p>Thank you for choosing ${object.company_id.name or 'us'}!</p>
+    <br/>
+    <br/>
+    <div style="width: 375px; margin: 0px; padding: 0px; background-color: #8E0000; border-top-left-radius: 5px 5px; border-top-right-radius: 5px 5px; background-repeat: repeat no-repeat;">
+        <h3 style="margin: 0px; padding: 2px 14px; font-size: 12px; color: #DDD;">
+            <strong style="text-transform:uppercase;">${object.company_id.name}</strong></h3>
+    </div>
+    <div style="width: 347px; margin: 0px; padding: 5px 14px; line-height: 16px; background-color: #F2F2F2;">
+        <span style="color: #222; margin-bottom: 5px; display: block; ">
+        ${object.company_id.street}<br/>
+        ${object.company_id.street2}<br/>
+        ${object.company_id.zip} ${object.company_id.city}<br/>
+        ${object.company_id.state_id and ('%s, ' % object.company_id.state_id.name) or ''} ${object.company_id.country_id.name or ''}<br/>
+        </span>
+        <div style="margin-top: 0px; margin-right: 0px; margin-bottom: 0px; margin-left: 0px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px; padding-left: 0px; ">
+            Phone:&nbsp; ${object.company_id.phone}
+        </div>
+        <div>
+            Web :&nbsp;<a href="${object.company_id.website}">${object.company_id.website}</a>
+        </div>
+    </div>
+</div></body></html>"""
+
 TEXT_MAIL1 = """I contact you about our meeting for tomorrow. Here is the schedule I propose:
 9 AM: brainstorming about our new amazing business app</span></li>
 9.45 AM: summary
@ -126,23 +167,76 @@ bert.tartopoils@miam.miam
 class TestSanitizer(unittest2.TestCase):
    """ Test the html sanitizer that filters html to remove unwanted attributes """

-    def test_simple(self):
-        x = "yop"
-        self.assertEqual(x, html_sanitize(x))
+    def test_basic_sanitizer(self):
+        cases = [
+            ("yop", "<p>yop</p>"),  # simple
+            ("lala<p>yop</p>xxx", "<div><p>lala</p><p>yop</p>xxx</div>"),  # trailing text
+            ("Merci à l'intérêt pour notre produit.nous vous contacterons bientôt. Merci",
+                u"<p>Merci à l'intérêt pour notre produit.nous vous contacterons bientôt. Merci</p>"),  # unicode
+        ]
+        for content, expected in cases:
+            html = html_sanitize(content)
+            self.assertEqual(html, expected, 'html_sanitize is broken')

-    def test_trailing_text(self):
-        x = 'lala<p>yop</p>xxx'
-        self.assertEqual(x, html_sanitize(x))
+    def test_evil_malicious_code(self):
+        cases = [
+            ("<IMG SRC=javascript:alert('XSS')>"),  # no quotes and semicolons
+            ("<IMG SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>"),  # UTF-8 Unicode encoding
+            ("<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>"),  # hex encoding
+            ("<IMG SRC=\"jav&#x0D;ascript:alert('XSS');\">"),  # embedded carriage return
+            ("<IMG SRC=\"jav&#x0A;ascript:alert('XSS');\">"),  # embedded newline
+            ("<IMG SRC=\"jav   ascript:alert('XSS');\">"),  # embedded tab
+            ("<IMG SRC=\"jav&#x09;ascript:alert('XSS');\">"),  # embedded encoded tab
+            ("<IMG SRC=\" &#14;  javascript:alert('XSS');\">"),  # spaces and meta-characters
+            ("<IMG SRC=\"javascript:alert('XSS')\""),  # half-open html
+            ("<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\">"),  # malformed tag
+            ("<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>"),  # non-alpha-non-digits
+            ("<SCRIPT/SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>"),  # non-alpha-non-digits
+            ("<<SCRIPT>alert(\"XSS\");//<</SCRIPT>"),  # extraneous open brackets
+            ("<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >"),  # non-closing script tags
+            ("<INPUT TYPE=\"IMAGE\" SRC=\"javascript:alert('XSS');\">"),  # input image
+            ("<BODY BACKGROUND=\"javascript:alert('XSS')\">"),  # body image
+            ("<IMG DYNSRC=\"javascript:alert('XSS')\">"),  # img dynsrc
+            ("<IMG LOWSRC=\"javascript:alert('XSS')\">"),  # img lowsrc
+            ("<TABLE BACKGROUND=\"javascript:alert('XSS')\">"),  # table
+            ("<TABLE><TD BACKGROUND=\"javascript:alert('XSS')\">"),  # td
+            ("<DIV STYLE=\"background-image: url(javascript:alert('XSS'))\">"),  # div background
+            ("<DIV STYLE=\"background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029\">"),  # div background with unicoded exploit
+            ("<DIV STYLE=\"background-image: url(&#1;javascript:alert('XSS'))\">"),  # div background + extra characters
+            ("<IMG SRC='vbscript:msgbox(\"XSS\")'>"),  # VBscrip in an image
+            ("<BODY ONLOAD=alert('XSS')>"),  # event handler
+            ("<BR SIZE=\"&{alert('XSS')}\>"),  # & javascript includes
+            ("<LINK REL=\"stylesheet\" HREF=\"javascript:alert('XSS');\">"),  # style sheet
+            ("<LINK REL=\"stylesheet\" HREF=\"http://ha.ckers.org/xss.css\">"),  # remote style sheet
+            ("<STYLE>@import'http://ha.ckers.org/xss.css';</STYLE>"),  # remote style sheet 2
+            ("<META HTTP-EQUIV=\"Link\" Content=\"<http://ha.ckers.org/xss.css>; REL=stylesheet\">"),  # remote style sheet 3
+            ("<STYLE>BODY{-moz-binding:url(\"http://ha.ckers.org/xssmoz.xml#xss\")}</STYLE>"),  # remote style sheet 4
+            ("<IMG STYLE=\"xss:expr/*XSS*/ession(alert('XSS'))\">"),  # style attribute using a comment to break up expression
+            ("""<!--[if gte IE 4]>
+                <SCRIPT>alert('XSS');</SCRIPT>
+                <![endif]-->"""),  # down-level hidden block
+        ]
+        for content in cases:
+            html = html_sanitize(content)
+            self.assertNotIn('javascript', html, 'html_sanitize did not remove a malicious javascript')
+            self.assertTrue('ha.ckers.org' not in html or 'http://ha.ckers.org/xss.css' in html, 'html_sanitize did not remove a malicious code in %s (%s)' % (content, html))

    def test_html(self):
        sanitized_html = html_sanitize(HTML_SOURCE)
-        for tag in ['<font>', '<div>', '<b>', '<i>', '<u>', '<strike>', '<li>', '<blockquote>', '<a href']:
+        for tag in ['<div', '<b', '<i', '<u', '<strike', '<li', '<blockquote', '<a href']:
            self.assertIn(tag, sanitized_html, 'html_sanitize stripped too much of original html')
-        for attr in ['style', 'javascript']:
+        for attr in ['javascript']:
            self.assertNotIn(attr, sanitized_html, 'html_sanitize did not remove enough unwanted attributes')

-    def test_unicode(self):
-        html_sanitize("Merci à l'intérêt pour notre produit.nous vous contacterons bientôt. Merci")
+    def test_edi_source(self):
+        html = html_sanitize(EDI_LIKE_HTML_SOURCE)
+        self.assertIn('div style="font-family: \'Lucica Grande\', Ubuntu, Arial, Verdana, sans-serif; font-size: 12px; color: rgb(34, 34, 34); background-color: #FFF;', html,
+            'html_sanitize removed valid style attribute')
+        self.assertIn('<span style="color: #222; margin-bottom: 5px; display: block; ">', html,
+            'html_sanitize removed valid style attribute')
+        self.assertIn('img class="oe_edi_paypal_button" src="https://www.paypal.com/en_US/i/btn/btn_paynowCC_LG.gif"', html,
+            'html_sanitize removed valid img')
+        self.assertNotIn('</body></html>', html, 'html_sanitize did not remove extra closing tags')


 class TestCleaner(unittest2.TestCase):
@ -181,6 +275,7 @@ class TestCleaner(unittest2.TestCase):
        new_html = html_email_clean(u'<?xml version="1.0" encoding="iso-8859-1"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n         "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n <head>\n  <title>404 - Not Found</title>\n </head>\n <body>\n  <h1>404 - Not Found</h1>\n </body>\n</html>\n')
        self.assertNotIn('encoding', new_html, 'html_email_cleaner did not remove correctly encoding attributes')

+
 class TestHtmlTools(unittest2.TestCase):
    """ Test some of our generic utility functions about html """

--- a/openerp/tools/mail.py
+++ b/openerp/tools/mail.py
@ -23,8 +23,8 @@ from lxml import etree
 import cgi
 import logging
 import lxml.html
+import lxml.html.clean as clean
 import openerp.pooler as pooler
-import operator
 import random
 import re
 import socket
@ -40,71 +40,23 @@ _logger = logging.getLogger(__name__)
 # HTML Sanitizer
 #----------------------------------------------------------

+# FIXME: shouldn't this be a whitelist rather than a blacklist?!
+tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
+tags_to_remove = ['html', 'body', 'font']
+
+
 def html_sanitize(src):
    if not src:
        return src
    src = ustr(src, errors='replace')
-    root = lxml.html.fromstring(u"<div>%s</div>" % src)
-    result = handle_element(root)
-    res = []
-    for element in children(result[0]):
-        if isinstance(element, basestring):
-            res.append(element)
-        else:
-            element.tail = ""
-            res.append(lxml.html.tostring(element))
-    return ''.join(res)

-# FIXME: shouldn't this be a whitelist rather than a blacklist?!
-to_remove = set(["script", "head", "meta", "title", "link", "img"])
-to_unwrap = set(["html", "body"])
-
-javascript_regex = re.compile(r"^\s*javascript\s*:.*$", re.IGNORECASE)
-
-def handle_a(el, new):
-    href = el.get("href", "#")
-    if javascript_regex.search(href):
-        href = "#"
-    new.set("href", href)
-
-special = {
-    "a": handle_a,
-}
-
-def handle_element(element):
-    if isinstance(element, basestring):
-        return [element]
-    if element.tag in to_remove:
-        return []
-    if element.tag in to_unwrap:
-        return reduce(operator.add, [handle_element(x) for x in children(element)])
-    result = lxml.html.fromstring("<%s />" % element.tag)
-    for c in children(element):
-        append_to(handle_element(c), result)
-    if element.tag in special:
-        special[element.tag](element, result)
-    return [result]
-
-def children(node):
-    res = []
-    if node.text is not None:
-        res.append(node.text)
-    for child_node in node.getchildren():
-        res.append(child_node)
-        if child_node.tail is not None:
-            res.append(child_node.tail)
-    return res
-
-def append_to(elements, dest_node):
-    for element in elements:
-        if isinstance(element, basestring):
-            children = dest_node.getchildren()
-            if len(children) == 0:
-                dest_node.text = element
-            else:
-                children[-1].tail = element
-        else:
-            dest_node.append(element)
+    # some cases make the parser crash (such as SCRIPT/XSS in test_mail)
+    try:
+        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
+        cleaned = cleaner.clean_html(src)
+    except:
+        cleaned = 'Impossible to parse'
+    return cleaned


 #----------------------------------------------------------