Tags & filters for rendering search results

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
from django import template
from django.conf import settings
from django.template import Node, TemplateSyntaxError
from django.utils.safestring import mark_safe
from django.utils.datastructures import SortedDict
from itertools import ifilter, takewhile
import re

register = template.Library()

SETTINGS_PREFIX = 'SEARCH_'
SETTINGS_DEFAULTS = {
    'CONTEXT_WORDS': 10,
    'IGNORE_CASE': True,
    'WORD_BOUNDARY': False,
    'HIGHLIGHT_CLASS': "highlight"
}

def get_setting(name):
    return getattr(settings, SETTINGS_PREFIX + name, SETTINGS_DEFAULTS[name])

def searchexcerpt(text, phrases, context_words=None, ignore_case=None, word_boundary=None):
    if isinstance(phrases, basestring):
        phrases = [phrases]
    if context_words is None:
        context_words = get_setting('CONTEXT_WORDS')
    if ignore_case is None:
        ignore_case = get_setting('IGNORE_CASE')
    if word_boundary is None:
        word_boundary = get_setting('WORD_BOUNDARY')
    
    phrases = map(re.escape, phrases)
    flags = ignore_case and re.I or 0
    exprs = [re.compile(r"^%s$" % p, flags) for p in phrases]
    whitespace = re.compile(r'\s+')
    
    re_template = word_boundary and r"\b(%s)\b" or r"(%s)"
    pieces = re.compile(re_template % "|".join(phrases), flags).split(text)
    matches = {}
    word_lists = []
    index = {}
    for i, piece in enumerate(pieces):
        word_lists.append(whitespace.split(piece))
        if i % 2:
            index[i] = expr = ifilter(lambda e: e.match(piece), exprs).next()
            matches.setdefault(expr, []).append(i)
    
    def merge(lists):
        merged = []
        for words in lists:
            if merged:
                merged[-1] += words[0]
                del words[0]
            merged.extend(words)
        return merged
    
    i = 0
    merged = []
    for j in map(min, matches.itervalues()):
        merged.append(merge(word_lists[i:j]))
        merged.append(word_lists[j])
        i = j + 1
    merged.append(merge(word_lists[i:]))
    
    output = []
    for i, words in enumerate(merged):
        omit = None
        if i == len(merged) - 1:
            omit = slice(max(1, 2 - i) * context_words + 1, None)
        elif i == 0:
            omit = slice(-context_words - 1)
        elif not i % 2:
            omit = slice(context_words + 1, -context_words - 1)
        if omit and words[omit]:
            words[omit] = ["..."]
        output.append(" ".join(words))
    
    return dict(original=text, excerpt="".join(output), hits=len(index))

class FunctionProxyNode(Node):
    def __init__(self, nodelist, args, variable_name=None):
        self.nodelist = nodelist
        self.args = args
        self.variable_name = variable_name
    
    def render(self, context):
        args = [arg.resolve(context) for arg in self.args]
        text = self.nodelist.render(context)
        value = self.get_value(text, *args)
        if self.variable_name:
            context[self.variable_name] = value
            return ""
        else:
            return self.string_value(value)
    
    def get_value(self, *args):
        raise NotImplementedError
    
    def string_value(self, value):
        return value

class SearchContextNode(FunctionProxyNode):
    def get_value(self, *args):
        return searchexcerpt(*args)
    
    def string_value(self, value):
        return value['excerpt']

@register.tag(name='searchexcerpt')
def searchexcerpt_tag(parser, token):
    """
        {% searchexcerpt search_terms [context_words] [ignore_case] [word_boundary] [as name] %}
        ...text...
        {% endsearchexcerpt %}
    """
    bits = list(token.split_contents())
    if not 3 <= len(bits) <= 8:
        usage = searchexcerpt_tag.__doc__.strip()
        raise TemplateSyntaxError("%r expected usage: %s" % (bits[0], usage))
    
    if len(bits) > 4 and bits[-2] == "as":
        args, name = bits[1:-2], bits[-1]
    else:
        args, name = bits[1:], None
    
    nodelist = parser.parse(('endsearchexcerpt',))
    parser.delete_first_token()
    return SearchContextNode(nodelist, map(parser.compile_filter, args), name)

@register.filter(name='searchexcerpt')
def searchexcerpt_filter(value, arg):
    return searchexcerpt(value, arg)['excerpt']
searchexcerpt_filter.is_safe = True

def highlight(text, phrases, ignore_case=None, word_boundary=None, class_name=None):
    if isinstance(phrases, basestring):
        phrases = [phrases]
    if ignore_case is None:
        ignore_case = get_setting('IGNORE_CASE')
    if word_boundary is None:
        word_boundary = get_setting('WORD_BOUNDARY')
    if class_name is None:
        class_name = get_setting('HIGHLIGHT_CLASS')
        
    phrases = map(re.escape, phrases)
    flags = ignore_case and re.I or 0
    re_template = word_boundary and r"\b(%s)\b" or r"(%s)"
    expr = re.compile(re_template % "|".join(phrases), flags)
    template = '<span class="%s">%%s</span>' % class_name
    matches = []
    
    def replace(match):
        matches.append(match)
        return template % match.group(0)
    
    highlighted = mark_safe(expr.sub(replace, text))
    count = len(matches)
    return dict(original=text, highlighted=highlighted, hits=count)

class HighlightNode(FunctionProxyNode):
    def get_value(self, *args):
        return highlight(*args)
    
    def string_value(self, value):
        return value['highlighted']

@register.tag(name='highlight')
def highlight_tag(parser, token):
    """
        {% highlight search_terms [ignore_case] [word_boundary] [class_name] [as name] %}
        ...text...
        {% endhighlight %}
    """
    bits = list(token.split_contents())
    if not 2 <= len(bits) <= 7:
        usage = highlight_tag.__doc__.strip()
        raise TemplateSyntaxError("%r expected usage: %s" % (bits[0], usage))
    
    if len(bits) > 3 and bits[-2] == "as":
        args, name = bits[1:-2], bits[-1]
    else:
        args, name = bits[1:], None
    
    nodelist = parser.parse(('endhighlight',))
    parser.delete_first_token()
    return HighlightNode(nodelist, map(parser.compile_filter, args), name)

@register.filter(name='highlight')
def highlight_filter(value, arg):
    return highlight(value, arg)['highlighted']

def hits(text, phrases, ignore_case=None, word_boundary=None):
    if isinstance(phrases, basestring):
        phrases = [phrases]
    if ignore_case is None:
        ignore_case = get_setting('IGNORE_CASE')
    if word_boundary is None:
        word_boundary = get_setting('WORD_BOUNDARY')    

    phrases = map(re.escape, phrases)
    flags = ignore_case and re.I or 0
    re_template = word_boundary and r"\b(%s)\b" or r"(%s)"
    expr = re.compile(re_template % "|".join(phrases), flags)
    return len(expr.findall(text))

class HitsNode(FunctionProxyNode):
    def get_value(self, *args):
        return hits(*args)
    
    def string_value(self, value):
        return "%d" % value

@register.tag(name='hits')
def hits_tag(parser, token):
    """
        {% hits search_terms [ignore_case] [word_boundary] [as name] %}
        ...text...
        {% endhits %}
    """
    bits = list(token.split_contents())
    if not 2 <= len(bits) <= 6:
        usage = hits_tag.__doc__.strip()
        raise TemplateSyntaxError("%r expected usage: %s" % (bits[0], usage))
    
    if len(bits) > 3 and bits[-2] == "as":
        args, name = bits[1:-2], bits[-1]
    else:
        args, name = bits[1:], None
    
    nodelist = parser.parse(('endhits',))
    parser.delete_first_token()
    return HitsNode(nodelist, map(parser.compile_filter, args), name)

@register.filter(name='hits')
def hits_filter(value, arg):
    return hits(value, arg)
hits.is_safe = True

More like this

  1. Text Highlighting Filter by girasquid 5 years, 4 months ago
  2. Template filter that divides a list into exact columns by davmuz 2 years, 2 months ago
  3. Yet another SQL debugging facility by miracle2k 6 years, 8 months ago
  4. Sphinx Search ORM by zeeg 6 years, 11 months ago
  5. Paginator TemplateTag by trbs 6 years ago

Comments

richardh (on March 29, 2008):

Very nice. I particularly like the excerpts.

One question. Is there a simple way of ordering the search results in the html page by the number of hits (highest first)?

Richard

#

nikhil (on June 7, 2008):

Thanks. Great work.

#

UloPe (on March 23, 2009):

Very useful!

I added some code to the highlight function to avoid highlighting strings inside of href attributes:

def highlight(text, phrases, ignore_case=None, word_boundary=None, class_name=None):
    if isinstance(phrases, basestring):
        phrases = [phrases]
    if ignore_case is None:
        ignore_case = get_setting('IGNORE_CASE')
    if word_boundary is None:
        word_boundary = get_setting('WORD_BOUNDARY')
    if class_name is None:
        class_name = get_setting('HIGHLIGHT_CLASS')

    phrases = map(re.escape, phrases)
    flags = ignore_case and re.I or 0
    re_template = word_boundary and r"\b(%s)\b" or r"(%s)"
    expr = re.compile(re_template % "|".join(phrases), flags)
    inner_expr = re.compile('<a href="[^>]*?(%s)$' % "|".join(phrases), flags)
    template = '<span class="%s">%%s</span>' % class_name
    matches = []

    def replace(match):
        import pdb
        pdb.set_trace()
        if not word_boundary:
            span = match.span()
            if inner_expr.search(text, span[0]-100, span[1]):
                return match.group(0)
        matches.append(match)
        return template % match.group(0)

    highlighted = mark_safe(expr.sub(replace, text))
    count = len(matches)
    return dict(original=text, highlighted=highlighted, hits=count)

#

UloPe (on March 23, 2009):

Argh... please ignore the code in the previous post. That was my devel version.

Here is the correct one:

def highlight(text, phrases, ignore_case=None, word_boundary=None, class_name=None):
    if isinstance(phrases, basestring):
        phrases = [phrases]
    if ignore_case is None:
        ignore_case = get_setting('IGNORE_CASE')
    if word_boundary is None:
        word_boundary = get_setting('WORD_BOUNDARY')
    if class_name is None:
        class_name = get_setting('HIGHLIGHT_CLASS')

    phrases = map(re.escape, phrases)
    flags = ignore_case and re.I or 0
    re_template = word_boundary and r"\b(%s)\b" or r"(%s)"
    expr = re.compile(re_template % "|".join(phrases), flags)
    inner_expr = re.compile('<a[^>]+?href="[^>]*?(%s)$' % "|".join(phrases), flags)
    template = '<span class="%s">%%s</span>' % class_name
    matches = []

    def replace(match):
        if not word_boundary:
            span = match.span()
            if inner_expr.search(text, span[0]-100, span[1]):
                return match.group(0)
        matches.append(match)
        return template % match.group(0)

    highlighted = mark_safe(expr.sub(replace, text))
    count = len(matches)
    return dict(original=text, highlighted=highlighted, hits=count)

#

(Forgotten your password?)