[IMP] html_sanitize: attempt to make slightly more pythonic/readable + re-enable test. Proper review + test still needed.

bzr revid: odo@openerp.com-20120905225746-npjjzy1w00k05vtc
This commit is contained in:
Olivier Dony 2012-09-06 00:57:46 +02:00
parent 8e96d71bf3
commit efa0f9b263
2 changed files with 69 additions and 43 deletions

View File

@ -17,6 +17,7 @@ fast_suite = [
checks = [
test_expression,
test_html_sanitize,
test_orm,
test_view_validation,
test_misc,

View File

@ -1,67 +1,92 @@
# -*- coding: utf-8 -*-
##############################################################################
#
# OpenERP, Open Source Business Applications
# Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
##############################################################################
import lxml.html
import operator
import re
def html_sanitize(x):
    """Return a sanitized version of the HTML fragment ``x``.

    The fragment is wrapped in a ``<div>``, parsed with lxml, rebuilt through
    ``handle_element`` and the content of the rebuilt wrapper is serialized
    back to a string. Falsy input (``None``, ``""``) is returned unchanged.

    Note: this file also contains a later ``html_sanitize(src)`` definition
    which rebinds the same name.

    :param x: HTML fragment, as a byte string (decoded as UTF-8 with
              replacement of invalid sequences) or as unicode
    :return: the sanitized markup
    """
    if not x:
        return x
    # Local import: only this function needs it.
    from xml.sax.saxutils import escape

    if isinstance(x, bytes):
        x = x.decode("utf8", "replace")
    root = lxml.html.fromstring("<div>%s</div>" % x)
    result = handle_element(root)
    pieces = []
    for el in children(result[0]):
        if isinstance(el, basestring):
            # .text/.tail values are already entity-decoded by the parser;
            # they must be re-escaped or "&lt;script&gt;" in the input would
            # come out as a live "<script>" tag.
            pieces.append(escape(el))
        else:
            # The tail is yielded by children() as a separate item, so it is
            # cleared here to avoid serializing it twice.
            el.tail = ""
            pieces.append(lxml.html.tostring(el))
    return "".join(pieces)
from openerp.loglevels import ustr
def html_sanitize(src):
    """Return a sanitized version of the HTML fragment ``src``.

    The fragment is converted to unicode with ``ustr``, wrapped in a
    ``<div>``, parsed with lxml and rebuilt through ``handle_element``. The
    content of the rebuilt wrapper (not the wrapper itself) is then
    serialized. Falsy input (``None``, ``""``) is returned unchanged.

    :param src: HTML fragment to sanitize
    :return: the sanitized markup
    """
    if not src:
        return src
    # Local import: only this function needs it.
    from xml.sax.saxutils import escape

    src = ustr(src, errors='replace')
    root = lxml.html.fromstring(u"<div>%s</div>" % src)
    result = handle_element(root)
    res = []
    for element in children(result[0]):
        if isinstance(element, basestring):
            # .text/.tail values are already entity-decoded by the parser;
            # they must be re-escaped or "&lt;script&gt;" in the input would
            # come out as a live "<script>" tag.
            res.append(escape(element))
        else:
            # The tail is yielded by children() as a separate item, so it is
            # cleared here to avoid serializing it twice.
            element.tail = ""
            res.append(lxml.html.tostring(element))
    return ''.join(res)
# FIXME: shouldn't this be a whitelist rather than a blacklist?!
# Elements dropped together with everything they contain.
to_remove = set(["script", "head", "meta", "title", "link", "img"])
# Elements whose own tag is dropped while their content is kept.
to_unwrap = set(["html", "body"])

# Matches URLs using the ``javascript:`` pseudo-scheme, in any letter case.
# DOTALL is required: without it ".*$" cannot cross a newline, so a value
# such as "javascript:alert(1)\nfoo" would not be detected.
javascript_regex = re.compile(r"^\s*javascript\s*:.*$", re.IGNORECASE | re.DOTALL)
def handle_a(el, new):
    """Copy the ``href`` of anchor ``el`` onto ``new``, neutralizing script URLs.

    A missing ``href`` defaults to ``"#"``. A ``href`` recognized by
    ``javascript_regex`` is replaced by ``"#"``.

    :param el: source element; read through ``el.get``
    :param new: replacement element; written through ``new.set``
    """
    href = el.get("href", "#")
    # Test a copy with control characters and whitespace removed, so that
    # values like "java\tscript:..." or a URL containing a newline cannot
    # slip past the pattern. The value actually stored is left untouched
    # when it is considered safe.
    probe = re.sub(r"[\x00-\x20]+", "", href)
    if javascript_regex.search(probe):
        href = "#"
    new.set("href", href)
# Per-tag post-processing hooks: maps a tag name to a callable invoked as
# hook(source_element, rebuilt_element) once the rebuilt element is populated.
special = dict(a=handle_a)
def handle_element(element):
    """Return the sanitized replacement(s) for ``element`` as a list.

    * a text string is returned as-is, in a one-item list;
    * comments and processing instructions are dropped;
    * an element whose tag is in ``to_remove`` is dropped with its content;
    * an element whose tag is in ``to_unwrap`` is replaced by its sanitized
      content;
    * any other element is rebuilt from its tag name only (attributes are
      not copied), filled with its sanitized content, then passed to the
      matching hook of ``special`` if there is one.

    :param element: a text string or a parsed element
    :return: list of strings and/or elements (possibly empty)
    """
    if isinstance(element, basestring):
        return [element]
    if not isinstance(element.tag, basestring):
        # Comments / processing instructions do not have a string tag, so
        # they cannot be rebuilt from "<tag />" markup; discard them.
        return []
    if element.tag in to_remove:
        return []
    if element.tag in to_unwrap:
        # Flatten explicitly: reduce() without an initial value raises
        # TypeError when the element has no content at all.
        unwrapped = []
        for child in children(element):
            unwrapped.extend(handle_element(child))
        return unwrapped
    result = lxml.html.fromstring("<%s />" % element.tag)
    for child in children(element):
        append_to(handle_element(child), result)
    if element.tag in special:
        special[element.tag](element, result)
    return [result]
def children(node):
    """Return the content of ``node`` as a flat list, in document order.

    The list interleaves text strings and child elements: the node's leading
    text (if any), then each child element followed by its tail text (if
    any). ``None`` text/tail values are skipped.

    :param node: element exposing ``.text`` and iteration over its children
    :return: list of strings and elements
    """
    res = []
    if node.text is not None:
        res.append(node.text)
    # Iterate the element directly instead of calling the deprecated
    # getchildren().
    for child_node in node:
        res.append(child_node)
        if child_node.tail is not None:
            res.append(child_node.tail)
    return res
def append_to(elements, dest_node):
    """Append a mixed list of text strings and elements to ``dest_node``.

    Elements are appended as children. A text string is added after the
    current content: to the tail of the last child when there is one,
    otherwise to the text of ``dest_node`` itself.

    :param elements: list of strings and/or elements, in document order
    :param dest_node: element receiving the content (modified in place)
    """
    for element in elements:
        if isinstance(element, basestring):
            # Concatenate rather than assign: two text fragments can end up
            # adjacent (e.g. the text before and after a removed element),
            # and plain assignment would silently drop the first one.
            if len(dest_node) == 0:
                dest_node.text = (dest_node.text or "") + element
            else:
                last_child = dest_node[-1]
                last_child.tail = (last_child.tail or "") + element
        else:
            dest_node.append(element)