import re

# Tags whose markers are removed while the text between them is kept.
# ``\b`` stops a short name such as ``tr`` from also matching ``<track>``.
_UNWRAP_TAG_RE = re.compile(
    r"</?(?:font|div|span|img|hr|table|td|tr)\b[^>]*>"
)

# Applied in order on every pass; each match is deleted outright.
# NOTE(review): the text this was rebuilt from had two additional leading
# patterns that arrived as empty strings (their content was lost before
# review). An empty pattern substituted with '' is a no-op, so they are
# omitted here -- restore them from the upstream source if it is available.
_JUNK_RES = tuple(
    re.compile(pattern, re.DOTALL)
    for pattern in (
        # Namespaced element together with its content, e.g. <o:p>...</o:p>.
        # Non-greedy so that text *between* two such elements survives.
        r"<(\w:[^>]*?)>.*?</\1>",
        r'class=".*?"',
        r"<.--.*?-->",
        r"<!--.*?-->",
        r"""align=["'][^"']*["']""",
        r"""style=["'][^"']*["']""",
        r"\{mso-[^}]*\}",
        # Element containing only whitespace / &nbsp;. The closing tag must
        # carry the same name as the opening one, so adjacent closing tags
        # such as "</b></p>" are never mistaken for an empty element.
        r"<(\w+)\b[^>]*>(?:&nbsp;|\s)*</\1\s*>",
    )
)

# NOTE(review): the final statement of the function was truncated in the
# received text; collapsing runs of <br> tags into a single "<br />" is a
# best-effort reconstruction -- confirm against the upstream source.
_BR_RUN_RE = re.compile(r"(?:<br\b[^>]*>\s*)+")


def _strip_once(txt: str) -> str:
    """Run a single cleaning pass over *txt* and return the result."""
    txt = _UNWRAP_TAG_RE.sub("", txt)
    for junk_re in _JUNK_RES:
        txt = junk_re.sub("", txt)
    return txt


def clean_word(txt: str, its: int) -> str:
    """Strip word-processor style markup from an HTML fragment.

    Each pass removes font/div/span/img/hr/table/td/tr tags (keeping their
    text), namespaced elements with their content, ``class``/``align``/
    ``style`` attributes, HTML comments, ``{mso-...}`` blocks and elements
    that contain only whitespace or ``&nbsp;``. Removing an empty element
    can leave its parent empty, which is why several passes are useful.

    Args:
        txt: The HTML text to clean.
        its: Number of *additional* passes after the first one; values
            of zero or less mean a single pass.

    Returns:
        The cleaned text, with consecutive ``<br>`` tags collapsed into a
        single ``<br />``.
    """
    remaining = its
    while True:
        cleaned = _strip_once(txt)
        # A pass is a pure function of the text, so once nothing changes
        # the remaining passes cannot change anything either.
        converged = cleaned == txt
        txt = cleaned
        if converged or not remaining > 0:
            break
        remaining -= 1
    return _BR_RUN_RE.sub("<br />", txt)