truncatehtml_at_word

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
@register.filter
def truncatehtml_at_word(s, chars):
    """
    Truncate a string at the last word boundary that fits in ``chars``.

    Only the characters of words are counted: whitespace, punctuation, HTML
    tags, comments/declarations and character entities do not count towards
    the limit.  Elements that are still open at the truncation point are
    closed (innermost first) so the result stays well nested, and
    ``&nbsp;&hellip;`` is appended when visible text was dropped.

    Arguments:
        s: the (possibly HTML) string to truncate.
        chars: the maximum number of word characters to keep; anything
            accepted by ``int()``.

    Returns:
        ``u''`` when ``chars`` is zero or negative, ``s`` itself when
        everything fits, otherwise the truncated markup.

    Raises:
        ValueError / TypeError: propagated from ``int(chars)``.
    """

    length = int(chars)
    if length <= 0:
        return u''
    re_words = truncatehtml_at_word.re_words
    re_tag = truncatehtml_at_word.re_tag
    re_markup = truncatehtml_at_word.re_markup
    html4_singlets = truncatehtml_at_word.html4_singlets

    # Count non-HTML characters and keep note of open tags.  The most
    # recently opened element is kept at index 0.
    open_tags = []
    count = 0
    pos = 0
    truncate_at = 0
    while count < length:
        m = re_words.search(s, pos)
        if not m:
            # No more words in the string.
            break
        pos = m.end(0)
        word = m.group(1)
        if word:
            # It's an actual non-HTML word.  If adding this word would exceed
            # our length threshold, then we're done.
            count += len(word)
            if count > length:
                break
            # Otherwise, update our truncation point to include the word.
            truncate_at = pos
            continue
        token = m.group(0)
        if token.startswith(('<!', '<?')):
            # Comments, doctypes and processing instructions are kept in the
            # output but are not elements, so they never need a closing tag.
            truncate_at = pos
            continue
        # Check for tag.
        tag = re_tag.match(token)
        if not tag:
            # A character entity: not counted, and it does not move the
            # truncation point on its own.
            continue
        closing_tag, tagname, self_closing = tag.groups()
        tagname = tagname.lower()  # Element names are always case-insensitive
        if self_closing or tagname in html4_singlets:
            pass
        elif closing_tag:
            # Check for match in open tags list
            try:
                i = open_tags.index(tagname)
            except ValueError:
                pass
            else:
                # SGML: An end tag closes, back to the matching start tag, all
                # unclosed intervening start tags with omitted end tags
                open_tags = open_tags[i + 1:]
        else:
            # Add it to the start of the open tags list
            open_tags.insert(0, tagname)
        truncate_at = pos
    # Don't bother closing tags if we didn't need to truncate.
    if truncate_at >= len(s):
        return s
    out = s[:truncate_at] + ''.join(['</%s>' % name for name in open_tags])
    # Only announce a truncation when visible text was really dropped.  The
    # decision is based on the discarded remainder, not on the output length,
    # because appended closing tags make the output length meaningless.
    if re_markup.sub('', s[truncate_at:]).strip():
        out += '&nbsp;&hellip;'
    return out
# Tokens, in priority order: a well-formed character entity, a comment, any
# other tag (re.S lets tags wrap over several lines), or a word (group 1).
truncatehtml_at_word.re_words = re.compile(
    r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);'
    r'|<!--.*?-->|<.*?>|(\w[\w-]*)', re.U | re.S)
# Groups: closing slash, element name, self-closing slash.  Any whitespace may
# separate the name from what follows, and "<br/>" needs no space at all.
truncatehtml_at_word.re_tag = re.compile(
    r'<(/)?(\S+?)(?:(\s*/)|\s.*?)?>', re.S)
# Markup stripped from the discarded remainder before checking whether any
# visible text was lost.
truncatehtml_at_word.re_markup = re.compile(r'<!--.*?-->|<.*?>', re.S)
# <p> is included here despite not being a true singlet to avoid adding
# incorrect closing tags to something like "para 1 <p> para 2 <p>".
truncatehtml_at_word.html4_singlets = ('br', 'col', 'link', 'base', 'img',
                                       'param', 'area', 'hr', 'input', 'p')

More like this

  1. Truncate string after a given number of chars keeping whole words by rix 5 years, 3 months ago
  2. Truncate words by characters by trodrigues 5 years, 11 months ago
  3. wordbreak filter by soniiic 5 years ago
  4. Precise truncate chars filter by davmuz 2 years, 2 months ago
  5. Audit word filter by shinyzhu 4 years, 2 months ago

Comments

(Forgotten your password?)