2012-09-05 22:57:46 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
##############################################################################
|
|
|
|
#
|
|
|
|
# OpenERP, Open Source Business Applications
|
|
|
|
# Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU Affero General Public License as
|
|
|
|
# published by the Free Software Foundation, either version 3 of the
|
|
|
|
# License, or (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU Affero General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU Affero General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
##############################################################################
|
2012-08-13 12:53:07 +00:00
|
|
|
|
2012-09-05 15:32:12 +00:00
|
|
|
import lxml.html
|
2012-09-05 22:57:46 +00:00
|
|
|
import operator
|
2012-08-13 15:52:05 +00:00
|
|
|
import re
|
2012-08-13 12:53:07 +00:00
|
|
|
|
2012-09-05 22:57:46 +00:00
|
|
|
from openerp.loglevels import ustr
|
|
|
|
|
|
|
|
def html_sanitize(src):
    """Return a sanitized version of the HTML fragment ``src``.

    Falsy input is handed back untouched. Anything else is passed through
    ``ustr`` (with ``errors='replace'``), wrapped in a ``<div>`` so that it
    parses as a single tree, filtered by ``handle_element`` and serialized
    again without the wrapping ``<div>``.

    :param src: HTML source to clean
    :return: ``src`` itself when it is falsy, otherwise the cleaned markup
    """
    if not src:
        return src

    wrapped = u"<div>%s</div>" % ustr(src, errors='replace')
    cleaned_root = handle_element(lxml.html.fromstring(wrapped))[0]

    parts = []
    for node in children(cleaned_root):
        if not isinstance(node, basestring):
            # children() already returned the tail as a separate item; blank
            # it so that tostring() does not serialize it a second time.
            node.tail = ""
            node = lxml.html.tostring(node)
        parts.append(node)
    return ''.join(parts)
|
2012-08-13 14:22:32 +00:00
|
|
|
|
2012-09-05 22:57:46 +00:00
|
|
|
# FIXME: shouldn't this be a whitelist rather than a blacklist?!
# Tags that handle_element() drops entirely, content included.
to_remove = set(["script", "head", "meta", "title", "link", "img"])
# Tags that handle_element() replaces by their (sanitized) content.
to_unwrap = set(["html", "body"])

# Detects a ``javascript:`` scheme at the start of a URL.  The pattern is
# deliberately left open at the end: "." does not match newlines and "$" only
# matches at the very end of the string, so a trailing ``.*$`` would let
# values such as "javascript:foo()\nbar()" through unnoticed.
javascript_regex = re.compile(r"^\s*javascript\s*:", re.IGNORECASE)
|
|
|
|
|
2012-08-13 14:37:55 +00:00
|
|
|
def handle_a(el, new):
    """Copy the ``href`` of the anchor ``el`` onto ``new``, minus scripts.

    :param el: original ``<a>`` element; a missing ``href`` defaults to "#"
    :param new: sanitized element that receives the resulting ``href``

    A ``javascript:`` target is replaced by "#".
    """
    href = el.get("href", "#")
    # URL parsers discard tabs and newlines found anywhere in a URL as well
    # as leading control characters, so the scheme is checked on a copy with
    # all of those removed.  Testing the raw value would miss things like
    # "java\tscript:..." or "javascript:...\n...".
    compact = re.sub(r"[\x00-\x20]+", "", href)
    if javascript_regex.search(compact):
        href = "#"
    new.set("href", href)
|
2012-09-05 22:57:46 +00:00
|
|
|
|
2012-08-13 14:37:55 +00:00
|
|
|
# Per-tag hooks.  handle_element() calls the hook registered for a tag as
# ``hook(original_element, sanitized_element)`` once the sanitized copy of
# the element has been built.
special = {
    "a": handle_a,
}
|
|
|
|
|
2012-09-05 22:57:46 +00:00
|
|
|
def handle_element(element):
    """Sanitize one item of a parsed HTML tree.

    :param element: either a text string or an lxml element
    :return: list of the sanitized items (strings and/or newly built
             elements) that stand in for ``element``; it may be empty

    Strings are returned as they are. Elements listed in ``to_remove`` are
    dropped, those listed in ``to_unwrap`` are replaced by their sanitized
    content, and every other element is rebuilt from its tag name alone (so
    without attributes) around its sanitized content, then handed to the
    matching hook in ``special``, if any.
    """
    if isinstance(element, basestring):
        return [element]
    if not isinstance(element.tag, basestring):
        # Comments and processing instructions expose a factory function
        # rather than a tag name: nothing sensible can be rebuilt from that,
        # and their content must not end up as visible text.
        return []
    if element.tag in to_remove:
        return []
    if element.tag in to_unwrap:
        # Accumulate by hand: reduce(operator.add, ...) without an initial
        # value raises TypeError when the element has no content at all.
        unwrapped = []
        for child in children(element):
            unwrapped.extend(handle_element(child))
        return unwrapped
    result = lxml.html.fromstring("<%s />" % element.tag)
    for c in children(element):
        append_to(handle_element(c), result)
    if element.tag in special:
        special[element.tag](element, result)
    return [result]
|
|
|
|
|
|
|
|
def children(node):
    """Return the content of ``node`` as a flat list, in document order.

    :param node: element exposing ``text``, and children exposing ``tail``
    :return: list made of the leading text of ``node`` (when it is not
             None), then each child element, each one followed by its tail
             text (when it is not None)
    """
    res = []
    if node.text is not None:
        res.append(node.text)
    # Iterate over the element itself: getchildren() is deprecated in lxml.
    for child_node in node:
        res.append(child_node)
        if child_node.tail is not None:
            res.append(child_node.tail)
    return res
|
|
|
|
|
2012-09-05 22:57:46 +00:00
|
|
|
def append_to(elements, dest_node):
    """Append sanitized items to ``dest_node``, in order.

    :param elements: list of strings and/or elements
    :param dest_node: element receiving the items

    Elements are appended as children. Strings go to the text slot that
    follows the current content of ``dest_node``: its ``text`` while it has
    no child, the ``tail`` of its last child afterwards. They are added to
    whatever that slot already holds, so that consecutive strings (e.g. the
    text found on each side of an element that produced no output) are all
    kept instead of the last one overwriting the others.
    """
    for element in elements:
        if not isinstance(element, basestring):
            dest_node.append(element)
        elif len(dest_node):
            last_child = dest_node[-1]
            last_child.tail = (last_child.tail or "") + element
        else:
            dest_node.text = (dest_node.text or "") + element
|