[IMP] tools.misc: cleanup and fallback mechanism for html2plaintext + other minor cleanup

bzr revid: odo@openerp.com-20100628181244-s3djk9c7ycbba9vk
Authored by Olivier Dony and Christophe Simonis on 2010-06-28 20:12:44 +02:00; committed by Olivier Dony
parent 573b05d6b6
commit c6bbbab4c8
1 changed file with 34 additions and 40 deletions
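The "fallback mechanism" in the title is the parser selection added in the first hunk: html2plaintext now prefers lxml's BeautifulSoup-backed soupparser and falls back to lxml's own HTMLParser when BeautifulSoup is not installed, instead of giving up and returning the raw HTML. A minimal standalone sketch of that import pattern (the logger name and sample markup are illustrative, not from the commit):

    import logging

    _logger = logging.getLogger(__name__)

    try:
        # Preferred: lenient BeautifulSoup-backed parser, if installed
        from lxml.html.soupparser import fromstring
        kwargs = {}
    except ImportError:
        # Fallback: lxml's built-in HTML parser
        _logger.debug('cannot use BeautifulSoup, falling back to lxml.etree.HTMLParser')
        from lxml.etree import fromstring, HTMLParser
        kwargs = dict(parser=HTMLParser())

    tree = fromstring('<p>Hello <a href="http://example.com">world</a></p>', **kwargs)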


@@ -349,35 +349,39 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
     """
     html = ustr(html)
-    try:
-        from BeautifulSoup import BeautifulSoup, SoupStrainer, Comment
-    except:
-        return html
-    urls = []
+    from lxml.etree import Element, tostring
+    try:
+        from lxml.html.soupparser import fromstring
+        kwargs = {}
+    except ImportError:
+        _logger.debug('tools.misc.html2plaintext: cannot use BeautifulSoup, fallback to lxml.etree.HTMLParser')
+        from lxml.etree import fromstring, HTMLParser
+        kwargs = dict(parser=HTMLParser())
+    tree = fromstring(html, **kwargs)
     if body_id is not None:
-        strainer = SoupStrainer(id=body_id)
+        source = tree.xpath('//*[@id=%s]'%(body_id,))
     else:
-        strainer = SoupStrainer('body')
-    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
-    for link in soup.findAll('a'):
-        title = link.renderContents()
-        for url in [x[1] for x in link.attrs if x[0]=='href']:
-            urls.append(dict(url=ustr(url), tag=ustr(link), title=ustr(title)))
-    html = ustr(soup.__str__())
+        source = tree.xpath('//body')
+    if len(source):
+        tree = source[0]
     url_index = []
     i = 0
-    for d in urls:
-        if d['title'] == d['url'] or 'http://'+d['title'] == d['url']:
-            html = html.replace(d['tag'], d['url'])
-        else:
+    for link in tree.findall('.//a'):
+        title = link.text
+        url = link.get('href')
+        if url:
             i += 1
-            html = html.replace(d['tag'], '%s [%s]' % (d['title'], i))
-            url_index.append(d['url'])
+            link.tag = 'span'
+            link.text = '%s [%s]' % (link.text, i)
+            url_index.append(url)
+    html = ustr(tostring(tree, encoding=encoding))
     html = html.replace('<strong>','*').replace('</strong>','*')
     html = html.replace('<b>','*').replace('</b>','*')
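For reference, the new loop above rewrites each anchor in place (the tag is renamed to span and the text gains a [n] marker) while the targets are collected in url_index, which the end of the function appends as a footnote-style list. A condensed, self-contained illustration of the same approach (the sample markup is invented):

    from lxml.html import fromstring
    from lxml.etree import tostring

    tree = fromstring('<div>See <a href="http://example.com">the docs</a>.</div>')
    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'                       # keep the text, drop the link
            link.text = '%s [%s]' % (link.text, i)  # e.g. "the docs [1]"
            url_index.append(url)
    print(tostring(tree, encoding='unicode'))
    # <div>See <span>the docs [1]</span>.</div>
    for n, url in enumerate(url_index):
        print('[%s] %s' % (n + 1, url))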
@@ -385,25 +389,15 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
     html = html.replace('<h2>','**').replace('</h2>','**')
     html = html.replace('<h1>','**').replace('</h1>','**')
     html = html.replace('<em>','/').replace('</em>','/')
-    html = html.replace('<br>', '\n')
     html = html.replace('<tr>', '\n')
-    html = html.replace('</p>', '\n\n')
-    html = re.sub('<br\s*/>', '\n', html)
+    html = html.replace('</p>', '\n')
+    html = re.sub('<br\s*/?>', '\n', html)
+    html = re.sub('<.*?>', ' ', html)
+    html = html.replace(' ' * 2, ' ')
 
-    # for all other tags we failed to clean up, just remove then and
-    # complain about them on the stderr
-    def desperate_fixer(g):
-        #print >>sys.stderr, "failed to clean up %s" % str(g.group())
-        return ' '
-    html = re.sub('<.*?>', desperate_fixer, html)
 
-    # lstrip all lines
-    html = '\n'.join([x.lstrip() for x in html.splitlines()])
+    # strip all lines
+    html = '\n'.join([x.strip() for x in html.splitlines()])
     html = html.replace('\n' * 2, '\n')
     for i, url in enumerate(url_index):
         if i == 0:
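The second hunk also folds the two previous <br> rules into a single regex that matches both the plain and the self-closing form, and replaces the desperate_fixer callback with a plain re.sub for whatever tags remain. A quick check of the consolidated pattern:

    import re

    html = 'one<br>two<br/>three<br />four'
    print(re.sub(r'<br\s*/?>', '\n', html))
    # one
    # two
    # three
    # four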
@@ -862,13 +856,13 @@ def ustr(value):
     try:
         return unicode(value)
-    except:
+    except Exception:
         pass
     for ln in get_encodings():
         try:
             return unicode(value, ln)
-        except:
+        except Exception:
             pass
     raise UnicodeError('unable to convert %r' % (orig,))
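The ustr hunk narrows the bare except: clauses to except Exception:, so KeyboardInterrupt and SystemExit (which no longer derive from Exception as of Python 2.5) are not swallowed by the conversion attempts. A hypothetical Python 2 helper mirroring the resulting pattern (the name and encoding list are illustrative, not from the commit):

    def to_unicode(value, encodings=('utf8', 'latin1')):
        # Only Exception subclasses are caught, so KeyboardInterrupt and
        # SystemExit propagate out of the fallback loop.
        try:
            return unicode(value)
        except Exception:
            pass
        for ln in encodings:
            try:
                return unicode(value, ln)
            except Exception:
                pass
        raise UnicodeError('unable to convert %r' % (value,))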