@register.filter
def truncatehtml_at_word(s, chars):
    """
    Truncate a string to the nearest word boundary less than the given
    number of characters.

    Whitespace is not included in the character count. If the string
    contains HTML, tags and comments are also not included in the character
    count. Closes opened HTML tags whose closing tags might have been
    truncated.

    ``s`` is the text (optionally containing HTML) to truncate. ``chars`` is
    the maximum number of word characters to keep; it is passed through
    ``int()``, so a non-numeric value raises ``ValueError``/``TypeError``.

    Returns an empty string when ``chars`` is zero or negative, and ``s``
    unchanged when every word in it fits within the limit. Otherwise returns
    the truncated text followed by closing tags for any elements still open
    at the cut, plus an ellipsis marker when the result is shorter than the
    input.
    """
    length = int(chars)
    if length <= 0:
        return u''

    re_words = truncatehtml_at_word.re_words
    re_tag = truncatehtml_at_word.re_tag
    html4_singlets = truncatehtml_at_word.html4_singlets

    # Count non-HTML characters and keep note of open tags.
    open_tags = []  # innermost (most recently opened) element first
    count = 0
    pos = 0
    truncate_at = 0
    while count < length:
        m = re_words.search(s, pos)
        if not m:
            # No more words, entities or tags: the whole string fits within
            # the limit, so nothing needs truncating. Returning here keeps
            # trailing punctuation/entities instead of cutting them off and
            # appending a spurious ellipsis.
            return s
        pos = m.end(0)

        word = m.group(1)
        if word:
            # It's an actual non-HTML word. If adding this word would exceed
            # our length threshold, then we're done.
            count += len(word)
            if count > length:
                break
            # Otherwise, update our truncation point to include the word.
            truncate_at = pos
            continue

        # Not a word, so it is either an entity or a tag. Entities do not
        # match re_tag and are skipped without moving the truncation point.
        tag = re_tag.match(m.group(0))
        if not tag:
            continue

        closing_tag, tagname, self_closing = tag.groups()
        tagname = tagname.lower()  # Element names are always case-insensitive
        if self_closing or tagname in html4_singlets:
            # Nothing to track: the element never needs a closing tag.
            pass
        elif closing_tag:
            # Check for match in open tags list.
            try:
                i = open_tags.index(tagname)
            except ValueError:
                # Stray closing tag with no matching opener; ignore it.
                pass
            else:
                # SGML: An end tag closes, back to the matching start tag,
                # all unclosed intervening start tags with omitted end tags.
                open_tags = open_tags[i + 1:]
        else:
            # Add it to the start of the open tags list.
            open_tags.insert(0, tagname)
        # Tags are free (not counted), so always keep them in the output.
        truncate_at = pos

    # Don't bother closing tags if we didn't need to truncate.
    if truncate_at >= len(s):
        return s

    out = s[:truncate_at]
    # Close the elements left open at the cut, innermost first.
    for tag in open_tags:
        out += '</%s>' % tag
    # Only flag truncation when visible content was actually dropped; if the
    # cut removed nothing but closing tags we have just re-added, the result
    # is as long as the input and no marker is appended.
    if len(out) < len(s):
        out += ' …'
    return out


# Matches, in order of preference: an entity (``&...;``), a tag (``<...>``),
# or a word. Only words are captured in group 1.
truncatehtml_at_word.re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)

# Groups: (1) the "/" of a closing tag, (2) the tag name, (3) the "/" of a
# self-closing tag written with a space before it, e.g. ``<br />``.
truncatehtml_at_word.re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')

# <p> is included here despite not being a true singlet to avoid adding
# incorrect closing tags to something like "<p>para 1<p>para 2".
truncatehtml_at_word.html4_singlets = (
    'br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input', 'p',
)