from lxml import html, etree
import re
# Template-tag registry; `cleanup_html` below is registered on it through
# the `@register.simple_tag` decorator.
# NOTE(review): `Library` is not imported in the visible part of this file --
# presumably django.template.Library; confirm the import exists.
register = Library()
# Matches the inline CSS declarations that get stripped from `style`
# attributes: every font*, padding*, margin* and line-height declaration,
# value included.
# - Raw string: `\s` inside a plain literal is an invalid escape sequence.
# - `(?<![\w-])` only lets a match start at the beginning of a property name,
#   so longer names such as `scroll-margin` are not cut in half.
# - `(?:;|$)` also removes the last declaration when it has no trailing ';'.
css_cleanup_regex = re.compile(
    r'(?<![\w-])((font|padding|margin)(-[^:]+)?|line-height):\s*[^;]+(?:;|$)'
)
def _cleanup_elements(elem):
    """
    Recursively strips a parsed HTML tree of empty elements and unwanted CSS.

    An element whose text content is empty or whitespace-only is dropped from
    the tree together with everything inside it; this includes elements that
    never carry text, such as a lone <img> or <br>. Any other element is
    kept: the declarations matched by css_cleanup_regex are removed from its
    'style' attribute (if it has one) and each of its children is cleaned the
    same way.

    elem is an lxml.html element. The tree is modified in place and nothing
    is returned.
    """
    if elem.text_content().strip() == '':
        elem.drop_tree()
        return

    # `in` replaces the Python 2 era `has_key()`.
    if 'style' in elem.attrib:
        elem.attrib['style'] = css_cleanup_regex.sub('', elem.attrib['style'])

    # Iterate over a snapshot: the recursive call may drop `sub` from `elem`,
    # and the child list should not change underneath the loop.
    for sub in list(elem):
        _cleanup_elements(sub)
@register.simple_tag
def cleanup_html(string):
    """
    Makes generated HTML (i.e. output from the WYSIWYG editor) look almost
    decent.

    The markup is parsed with lxml, run through _cleanup_elements and
    serialized again with blank lines and trailing whitespace removed.

    string is the HTML source. The cleaned markup is returned as text. If
    lxml cannot parse string (which includes empty or whitespace-only input),
    string is returned unchanged.
    """
    try:
        elem = html.fromstring(string)
    except (etree.XMLSyntaxError, etree.ParserError):
        # lxml.html raises ParserError for an empty document (e.g. an empty
        # WYSIWYG field); it is not a subclass of XMLSyntaxError.
        return string

    _cleanup_elements(elem)

    # tostring() returns an ASCII byte string by default (non-ASCII characters
    # become character references). Decode it so the lines can be joined with
    # a text '\n' on Python 3 as well.
    html_string = html.tostring(elem).decode('ascii')

    stripped = (line.rstrip() for line in html_string.splitlines())
    return '\n'.join(line for line in stripped if line != '')