[IMP] html_sanitize: attempt to make slightly more pythonic/readable + re-enable test. Proper review + test still needed.

bzr revid: odo@openerp.com-20120905225746-npjjzy1w00k05vtc
2012-09-06 00:57:46 +02:00 · 2012-09-06 00:57:46 +02:00 · efa0f9b263
parent 8e96d71bf3
commit efa0f9b263
2 changed files with 69 additions and 43 deletions
--- a/openerp/tests/__init__.py
+++ b/openerp/tests/__init__.py
@@ -17,6 +17,7 @@ fast_suite = [

 checks = [
    test_expression,
+    test_html_sanitize,
    test_orm,
    test_view_validation,
    test_misc,
--- a/openerp/tools/html_sanitize.py
+++ b/openerp/tools/html_sanitize.py
@@ -1,67 +1,92 @@
+# -*- coding: utf-8 -*-
+##############################################################################
+#
+#    OpenERP, Open Source Business Applications
+#    Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU Affero General Public License as
+#    published by the Free Software Foundation, either version 3 of the
+#    License, or (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU Affero General Public License for more details.
+#
+#    You should have received a copy of the GNU Affero General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+##############################################################################

 import lxml.html
+import operator
 import re

-def html_sanitize(x):
-    if not x:
-        return x
-    if type(x) == str:
-        x = unicode(x, "utf8", "replace")
-    root = lxml.html.fromstring("<div>%s</div>" % x)
-    result = handle_element(root)
-    res = ""
-    for el in children(result[0]):
-        if type(el) == str or type(el) == unicode:
-            res += el
-        else:
-            el.tail = ""
-            res += lxml.html.tostring(el)
-    return res
+from openerp.loglevels import ustr

+def html_sanitize(src):
+    if not src:
+        return src
+    src = ustr(src, errors='replace')
+    root = lxml.html.fromstring(u"<div>%s</div>" % src)
+    result = handle_element(root)
+    res = []
+    for element in children(result[0]):
+        if isinstance(element, basestring):
+            res.append(element)
+        else:
+            element.tail = ""
+            res.append(lxml.html.tostring(element))
+    return ''.join(res)
+
+# FIXME: shouldn't this be a whitelist rather than a blacklist?!
 to_remove = set(["script", "head", "meta", "title", "link", "img"])
 to_unwrap = set(["html", "body"])

-javascript_regex = re.compile("""^\s*javascript\s*\:.*$""")
+javascript_regex = re.compile(r"^\s*javascript\s*:.*$", re.IGNORECASE)
+
 def handle_a(el, new):
    href = el.get("href", "#")
    if javascript_regex.search(href):
        href = "#"
    new.set("href", href)
+
 special = {
    "a": handle_a,
 }

-def handle_element(el):
-    if type(el) == str or type(el) == unicode:
-        return [el]
-    if el.tag in to_remove:
+def handle_element(element):
+    if isinstance(element, basestring):
+        return [element]
+    if element.tag in to_remove:
        return []
-    if el.tag in to_unwrap:
-        return reduce(lambda x,y: x+y, [handle_element(x) for x in children(el)])
-    new = lxml.html.fromstring("<%s />" % el.tag)
-    for i in children(el):
-        append_to(handle_element(i), new)
-    if el.tag in special:
-        special[el.tag](el, new)
-    return [new]
-    
-def children(el):
+    if element.tag in to_unwrap:
+        return reduce(operator.add, [handle_element(x) for x in children(element)])
+    result = lxml.html.fromstring("<%s />" % element.tag)
+    for c in children(element):
+        append_to(handle_element(c), result)
+    if element.tag in special:
+        special[element.tag](element, result)
+    return [result]
+
+def children(node):
    res = []
-    if el.text is not None:
-        res.append(el.text)
-    for i in el.getchildren():
-        res.append(i)
-        if i.tail is not None:
-            res.append(i.tail)
+    if node.text is not None:
+        res.append(node.text)
+    for child_node in node.getchildren():
+        res.append(child_node)
+        if child_node.tail is not None:
+            res.append(child_node.tail)
    return res

-def append_to(new_ones, el):
-    for i in new_ones:
-        if type(i) == str or type(i) == unicode:
-            children = el.getchildren()
+def append_to(elements, dest_node):
+    for element in elements:
+        if isinstance(element, basestring):
+            children = dest_node.getchildren()
            if len(children) == 0:
-                el.text = i
+                dest_node.text = element
            else:
-                children[-1].tail = i
+                children[-1].tail = element
        else:
-            el.append(i)
+            dest_node.append(element)