[IMP] html_sanitize: attempt to make slightly more pythonic/readable + re-enable test. Proper review + test still needed.
bzr revid: odo@openerp.com-20120905225746-npjjzy1w00k05vtc
This commit is contained in:
parent
8e96d71bf3
commit
efa0f9b263
|
@ -17,6 +17,7 @@ fast_suite = [
|
|||
|
||||
checks = [
|
||||
test_expression,
|
||||
test_html_sanitize,
|
||||
test_orm,
|
||||
test_view_validation,
|
||||
test_misc,
|
||||
|
|
|
@ -1,67 +1,92 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
##############################################################################
|
||||
#
|
||||
# OpenERP, Open Source Business Applications
|
||||
# Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as
|
||||
# published by the Free Software Foundation, either version 3 of the
|
||||
# License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
##############################################################################
|
||||
|
||||
import lxml.html
|
||||
import operator
|
||||
import re
|
||||
|
||||
def html_sanitize(x):
    """Return a sanitized copy of the HTML fragment ``x``.

    The fragment is wrapped in a ``<div>``, parsed, rebuilt through
    ``handle_element()`` and serialized back, one top-level node at a time.

    :param x: HTML fragment (byte string or unicode); falsy values are
              returned unchanged.
    :return: the sanitized markup as a single string.
    """
    if not x:
        return x
    # Function-scope import: this is the only place that needs it.
    from xml.sax.saxutils import escape

    if isinstance(x, str):
        x = unicode(x, "utf8", "replace")
    root = lxml.html.fromstring("<div>%s</div>" % x)
    result = handle_element(root)
    chunks = []
    for el in children(result[0]):
        if isinstance(el, basestring):
            # Parsed text is entity-decoded ("&lt;" has become "<"); it must
            # be escaped again or encoded markup would come back to life.
            chunks.append(escape(el))
        else:
            # children() already returned the tail as a separate chunk, so
            # blank it to keep tostring() from emitting it a second time.
            el.tail = ""
            chunks.append(lxml.html.tostring(el))
    return "".join(chunks)
|
||||
from openerp.loglevels import ustr
|
||||
|
||||
def html_sanitize(src):
    """Return a sanitized copy of the HTML fragment ``src``.

    The fragment is wrapped in a ``<div>``, parsed, rebuilt through
    ``handle_element()`` and serialized back, one top-level node at a time.

    :param src: HTML fragment; falsy values are returned unchanged.
    :return: the sanitized markup as a single string.
    """
    if not src:
        return src
    # Function-scope import: this is the only place that needs it.
    from xml.sax.saxutils import escape

    # NOTE(review): ustr() presumably coerces to unicode, replacing
    # undecodable bytes as requested by errors='replace' -- confirm.
    src = ustr(src, errors='replace')
    root = lxml.html.fromstring(u"<div>%s</div>" % src)
    result = handle_element(root)
    res = []
    for element in children(result[0]):
        if isinstance(element, basestring):
            # Parsed text is entity-decoded ("&lt;" has become "<"); it must
            # be escaped again or encoded markup would come back to life.
            res.append(escape(element))
        else:
            # children() already returned the tail as a separate chunk, so
            # blank it to keep tostring() from emitting it a second time.
            element.tail = ""
            res.append(lxml.html.tostring(element))
    return ''.join(res)
|
||||
|
||||
# FIXME: shouldn't this be a whitelist rather than a blacklist?!
|
||||
# Tags for which handle_element() returns nothing: the element and everything
# nested inside it are dropped.
to_remove = set(["script", "head", "meta", "title", "link", "img"])
|
||||
# Tags whose own markup is discarded while their sanitized children are kept.
to_unwrap = set(["html", "body"])
|
||||
|
||||
# Matches a value starting with a "javascript:" scheme, allowing whitespace
# before the scheme and before the colon. This variant is case-sensitive.
javascript_regex = re.compile("""^\s*javascript\s*\:.*$""")
|
||||
# Same pattern written as a raw string and compiled with re.IGNORECASE, so
# mixed-case spellings of the scheme match as well.
javascript_regex = re.compile(r"^\s*javascript\s*:.*$", re.IGNORECASE)
|
||||
|
||||
def handle_a(el, new):
    """Copy a script-free ``href`` from the anchor ``el`` onto ``new``.

    :param el: original anchor node; only its ``href`` attribute is read
               (a missing attribute counts as ``"#"``).
    :param new: rebuilt anchor node that receives the ``href``.
    """
    href = el.get("href", "#")
    # Browsers discard tabs/newlines inside a URL and leading control
    # characters before resolving its scheme, so "jav\nascript:..." still
    # runs script. Test a copy with those characters removed; a harmless
    # href is stored exactly as it was written.
    compact = re.sub(r"[\x00-\x20]+", "", href)
    if javascript_regex.search(compact):
        href = "#"
    new.set("href", href)
|
||||
|
||||
# Per-tag hooks, called by handle_element() as hook(original, rebuilt) after
# the children of the rebuilt element have been filled in.
special = {
|
||||
"a": handle_a,
|
||||
}
|
||||
|
||||
def handle_element(element):
    """Return the sanitized replacement for ``element`` as a list of nodes.

    ``element`` is one item of a ``children()`` result: either a text chunk
    or a parsed node. A list is returned because a node can map to nothing
    (blacklisted tag), to several nodes (unwrapped container) or to exactly
    one rebuilt element.
    """
    if isinstance(element, basestring):
        # Text chunks pass through untouched.
        return [element]
    if not isinstance(element.tag, basestring):
        # Comments and processing instructions carry a factory function as
        # their ``tag`` instead of a name; there is no tag to rebuild them
        # from, so they are dropped.
        return []
    if element.tag in to_remove:
        return []
    if element.tag in to_unwrap:
        # Splice the sanitized children in place of the container. A plain
        # loop also copes with an empty container, where reduce() without
        # an initial value would raise TypeError.
        unwrapped = []
        for child in children(element):
            unwrapped.extend(handle_element(child))
        return unwrapped
    # Rebuild from the tag name alone: no original attribute is carried
    # over unless a ``special`` hook copies it explicitly.
    result = lxml.html.fromstring("<%s />" % element.tag)
    for child in children(element):
        append_to(handle_element(child), result)
    if element.tag in special:
        special[element.tag](element, result)
    return [result]
|
||||
|
||||
def children(node):
    """Return the direct content of ``node`` in document order.

    The result mixes text chunks and child nodes: the node's leading text
    (if any), then each child followed by that child's tail text (if any).
    ``None`` texts and tails are skipped.
    """
    res = []
    if node.text is not None:
        res.append(node.text)
    # Iterate the node directly: getchildren() is deprecated in lxml in
    # favour of plain iteration, which yields the same children.
    for child_node in node:
        res.append(child_node)
        if child_node.tail is not None:
            res.append(child_node.tail)
    return res
|
||||
|
||||
def append_to(elements, dest_node):
    """Append a mix of text chunks and nodes to ``dest_node``, in order.

    :param elements: iterable of text chunks and/or nodes, as returned by
                     ``handle_element()``.
    :param dest_node: node receiving the content. Text lands in its
                      ``text`` while it has no children yet, and in the
                      ``tail`` of its last child afterwards.
    """
    for element in elements:
        if isinstance(element, basestring):
            # Concatenate instead of overwriting: two text chunks can follow
            # each other (e.g. around a removed element) and the earlier one
            # must not be lost.
            if len(dest_node) == 0:
                dest_node.text = (dest_node.text or "") + element
            else:
                last_child = dest_node[-1]
                last_child.tail = (last_child.tail or "") + element
        else:
            dest_node.append(element)
|
||||
|
|
Loading…
Reference in New Issue