root/projects/whoosh/trunk/src/whoosh/qparser/default.py @ 415

Revision 415, 15.0 KB (checked in by matt, 7 months ago)

Work on NUMERIC, DATETIME, and BOOLEAN field types.
Changes instances of test_index to testindex.

Line 
1"""
This module contains the default search query parser.

This uses the excellent Pyparsing module
(http://pyparsing.sourceforge.net/) to parse search query strings
into nodes from the query module.

This parser handles:

* 'AND', 'OR', 'NOT'
* grouping with parentheses
* quoted phrase searching
* wildcards, e.g. help*
* ranges, e.g. [a TO b]
* fields, e.g. title:whoosh

This parser was originally based on the searchparser example code available at:

http://pyparsing.wikispaces.com/space/showimage/searchparser.py
20"""
21
22# The code upon which this parser was based was made available by the authors under
23# the following copyright and conditions:
24
25# Copyright (c) 2006, Estrate, the Netherlands
26# All rights reserved.
27#
28# Redistribution and use in source and binary forms, with or without modification,
29# are permitted provided that the following conditions are met:
30#
31# * Redistributions of source code must retain the above copyright notice, this
32#   list of conditions and the following disclaimer.
33# * Redistributions in binary form must reproduce the above copyright notice,
34#   this list of conditions and the following disclaimer in the documentation
35#   and/or other materials provided with the distribution.
36# * Neither the name of Estrate nor the names of its contributors may be used
37#   to endorse or promote products derived from this software without specific
38#   prior written permission.
39#
40# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
41# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
42# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
44# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
45# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
47# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
48# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
49# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50#
51# CONTRIBUTORS:
52# - Steven Mooij
53# - Rudolph Froger
54# - Paul McGuire
55
56from whoosh.support.pyparsing import (printables, alphanums, OneOrMore,
57                                      Group, Combine, Suppress, Optional, FollowedBy,
58                                      Literal, CharsNotIn, Word, Keyword,
59                                      Empty, White, Forward, QuotedString,
60                                      StringEnd)
61from whoosh.query import *
62
63
def _make_default_parser():
    """Build the pyparsing grammar for the default query language.

    Every syntactic construct is wrapped in a ``Group`` carrying a results
    name ("Word", "Wildcard", "Range", "Quotes", "Group", "Boost", "Field",
    "Not", "And", "Or", "AndNot" or "Toplevel").  ``PyparsingBasedParser._eval``
    reads that name from each node and calls the handler method named
    ``_<name>``.

    :returns: the ``parseString`` method of the top-level grammar element.
    """
    escapechar = "\\"

    # A run of characters that have no special meaning in the query syntax.
    wordtext = CharsNotIn('\\*?^():"{}[] ')
    # A backslash followed by exactly one printable or whitespace character;
    # the backslash itself is suppressed from the output.
    escape = Suppress(escapechar) + (Word(printables, exact=1) | White(exact=1))
    wordtoken = Combine(OneOrMore(wordtext | escape))

    # A plain old word.
    plainWord = Group(wordtoken).setResultsName("Word")

    # A wildcard word containing * or ?.
    wildchars = Word("?*")
    # Start with word chars and then have wild chars mixed in
    wildmixed = wordtoken + OneOrMore(wildchars + Optional(wordtoken))
    # Or, start with wildchars, and then either a mixture of word and wild
    # chars, or the next token
    wildstart = wildchars + (OneOrMore(wordtoken + Optional(wildchars))
                             | FollowedBy(White() | StringEnd()))
    wildcard = Group(Combine(wildmixed | wildstart)).setResultsName("Wildcard")

    # A range of terms.  The fence characters are kept in the output so the
    # evaluator can tell inclusive ([ ]) from exclusive ({ }) ends.
    startfence = Literal("[") | Literal("{")
    endfence = Literal("]") | Literal("}")
    rangeitem = QuotedString('"') | wordtoken
    # An open end is represented by an empty group, so a range node always
    # has the shape (startfence, start, end, endfence).
    openstartrange = (Group(Empty())
                      + Suppress(Keyword("TO") + White())
                      + Group(rangeitem))
    openendrange = (Group(rangeitem)
                    + Suppress(White() + Keyword("TO"))
                    + Group(Empty()))
    normalrange = (Group(rangeitem)
                   + Suppress(White() + Keyword("TO") + White())
                   + Group(rangeitem))
    # Named ``termrange`` rather than ``range`` to avoid shadowing the builtin.
    termrange = Group(startfence
                      + (normalrange | openstartrange | openendrange)
                      + endfence).setResultsName("Range")

    # A word-like thing
    generalWord = termrange | wildcard | plainWord

    # A quoted phrase
    quotedPhrase = Group(QuotedString('"')).setResultsName("Quotes")

    expression = Forward()

    # Parentheses can enclose (group) any expression
    parenthetical = Group(
        (Suppress("(") + expression + Suppress(")"))
    ).setResultsName("Group")

    boostableUnit = generalWord | quotedPhrase
    boostedUnit = Group(
        boostableUnit + Suppress("^") + Word("0123456789", ".0123456789")
    ).setResultsName("Boost")

    # The user can flag that a parenthetical group, quoted phrase, or word
    # should be searched in a particular field by prepending 'fn:', where fn is
    # the name of the field.
    fieldableUnit = parenthetical | boostedUnit | boostableUnit
    fieldedUnit = Group(
        Word(alphanums + "_") + Suppress(':') + fieldableUnit
    ).setResultsName("Field")

    # Units of content
    unit = fieldedUnit | fieldableUnit

    # A unit may be "not"-ed.
    operatorNot = Group(
        Suppress(Keyword("not", caseless=True)) + Suppress(White()) + unit
    ).setResultsName("Not")
    generalUnit = operatorNot | unit

    andToken = Keyword("AND", caseless=False)
    orToken = Keyword("OR", caseless=False)
    andNotToken = Keyword("ANDNOT", caseless=False)

    operatorAnd = Group(
        generalUnit + Suppress(White()) + Suppress(andToken)
        + Suppress(White()) + expression
    ).setResultsName("And")
    operatorOr = Group(
        generalUnit + Suppress(White()) + Suppress(orToken)
        + Suppress(White()) + expression
    ).setResultsName("Or")
    operatorAndNot = Group(
        unit + Suppress(White()) + Suppress(andNotToken)
        + Suppress(White()) + unit
    ).setResultsName("AndNot")

    expression << (OneOrMore(operatorAnd | operatorOr | operatorAndNot
                             | generalUnit | Suppress(White()))
                   | Empty())

    toplevel = Group(expression).setResultsName("Toplevel") + StringEnd()

    return toplevel.parseString


DEFAULT_PARSER = _make_default_parser()
145
146
147# Query parser objects
148
class PyparsingBasedParser(object):
    """Base class that turns a pyparsing result tree into query objects.

    This class defines no ``__init__``; its methods read the following
    instance attributes, which a subclass is expected to set:

    * ``schema`` -- indexed by field name to get a field object; a false
      value means "no schema".
    * ``parser`` -- called with the query string; the first item of its
      result is used as the root node.
    * ``default_field`` -- the field name handed to the root node's handler.
    * ``termclass`` -- called as ``termclass(fieldname, text)`` to build
      bare terms.

    For every node name the grammar can produce, a subclass must also
    provide a handler method named ``_<name>(node, fieldname)``.
    """

    def _field(self, fieldname):
        """Return the schema's field object for ``fieldname``, or None when
        the parser has no schema.
        """
        if self.schema:
            return self.schema[fieldname]
        return None

    def parse(self, input, normalize=True):
        """Parses the input string and returns a Query object/tree.

        This method may return None if the input string does not result in any
        valid queries. It may also raise a variety of exceptions if the input
        string is malformed.

        :param input: the unicode string to parse.
        :param normalize: whether to call normalize() on the query object/tree
            before returning it. This should be left on unless you're trying to
            debug the parser output.
        :rtype: :class:`whoosh.query.Query`
        """

        ast = self.parser(input)[0]
        q = self._eval(ast, self.default_field)
        if q and normalize:
            q = q.normalize()
        return q

    # These methods are called by the parsing code to generate query
    # objects. They are useful for subclassing.

    def _eval(self, node, fieldname):
        """Evaluate one AST node by dispatching on its results name.

        A node named ``X`` is handled by ``self._X(node, fieldname)``.
        """
        name = node.getName()
        return getattr(self, "_" + name)(node, fieldname)

    def get_term_text(self, field, text, **kwargs):
        """Return the first token ``field.process_text`` yields for ``text``
        in query mode, or None if it yields nothing.

        Extra keyword arguments are passed through to ``process_text``.
        """
        # Just take the first token
        for t in field.process_text(text, mode="query", **kwargs):
            return t
        return None

    def make_term(self, fieldname, text):
        """Build the query object for a bare word.

        If the field supplies its own ``parse_query``, that is used; otherwise
        the text is run through the field (when there is one) and wrapped in
        ``self.termclass``.  Returns ``NullQuery`` if the field produced no
        token.
        """
        field = self._field(fieldname)
        if field:
            if field.parse_query:
                return field.parse_query(fieldname, text)
            text = self.get_term_text(field, text)

        if text is None:
            return NullQuery
        return self.termclass(fieldname, text)

    def make_phrase(self, fieldname, text):
        """Build the query object for a quoted phrase.

        With a schema field, the text is tokenized by the field: no tokens
        gives an empty term, one token gives a single term, several give a
        ``Phrase``.  Without a schema the text is split on whitespace.
        """
        field = self._field(fieldname)
        if not field:
            # Split on runs of whitespace: splitting on a single " " turned
            # repeated spaces into empty-string phrase terms.
            words = text.split()
            if not words:
                # Same result the schema branch gives for "no tokens".
                return self.termclass(fieldname, u'')
            return Phrase(fieldname, words)

        if field.parse_query:
            return field.parse_query(fieldname, text)

        texts = list(field.process_text(text, mode="query"))
        if not texts:
            return self.termclass(fieldname, u'')
        if len(texts) == 1:
            return self.termclass(fieldname, texts[0])
        return Phrase(fieldname, texts)

    def make_wildcard(self, fieldname, text):
        """Build a ``Wildcard`` query, first passing the pattern through the
        field (untokenized, stop words kept) when a schema is available.

        Returns ``NullQuery`` if the field produced no token, matching what
        ``make_term`` does in that case.
        """
        field = self._field(fieldname)
        if field:
            text = self.get_term_text(field, text, tokenize=False,
                                      removestops=False)
            if text is None:
                return NullQuery
        return Wildcard(fieldname, text)

    def make_range(self, fieldname, start, end, startexcl, endexcl):
        """Build a ``TermRange`` query.

        :param start: lower bound text, or a false value for an open start.
        :param end: upper bound text, or a false value for an open end.
        :param startexcl: passed to ``TermRange`` as the start-exclusive flag.
        :param endexcl: passed to ``TermRange`` as the end-exclusive flag.
        :raises QueryError: if both ends are missing.
        """
        field = self._field(fieldname)
        if field:
            if start:
                start = self.get_term_text(field, start, tokenize=False,
                                           removestops=False)
            if end:
                end = self.get_term_text(field, end, tokenize=False,
                                         removestops=False)

        if not start and not end:
            raise QueryError("TermRange must have start and/or end")
        # Open ends are replaced by the extreme values u'' and u'\uFFFF'.
        if not start:
            start = u''
        if not end:
            end = u'\uFFFF'
        return TermRange(fieldname, start, end, startexcl, endexcl)

    def make_and(self, qs):
        """Join the list of queries ``qs`` with ``And``."""
        return And(qs)

    def make_or(self, qs):
        """Join the list of queries ``qs`` with ``Or``."""
        return Or(qs)

    def make_andnot(self, positive, negative):
        """Build an ``AndNot`` from a positive and a negative query."""
        return AndNot(positive, negative)

    def make_not(self, q):
        """Wrap ``q`` in ``Not``."""
        return Not(q)
249
250
class QueryParser(PyparsingBasedParser):
    """The default parser for Whoosh, implementing a powerful fielded
    query language similar to Lucene's.
    """

    __inittypes__ = dict(default_field=str, schema="whoosh.fields.Schema",
                         conjunction="whoosh.query.Query",
                         termclass="whoosh.query.Query")

    def __init__(self, default_field, schema=None,
                 conjunction=And, termclass=Term):
        """
        :param default_field: Use this as the field for any terms without
            an explicit field. For example, if the query string is
            "hello f1:there" and the default field is "f2", the parsed
            query will be as if the user had entered "f2:hello f1:there".
            This argument is required.
        :param conjunction: Use this query.Query class to join together clauses
            where the user has not explicitly specified a join. For example,
            if this is query.And, the query string "a b c" will be parsed as
            "a AND b AND c". If this is query.Or, the string will be parsed as
            "a OR b OR c".
        :param termclass: Use this query.Query class for bare terms. For example,
            query.Term or query.Variations.
        :param schema: An optional fields.Schema object. If this argument is present,
            the appropriate field will be used to tokenize terms/phrases before
            they are turned into query objects.
        """

        self.default_field = default_field
        self.conjunction = conjunction
        self.termclass = termclass
        self.schema = schema
        self.parser = DEFAULT_PARSER

    # These methods take the AST from pyparsing, extract the
    # relevant data, and call the appropriate make_* methods to
    # create query objects.

    def _eval_all(self, node, fieldname):
        """Evaluate every child of ``node`` and return the results as a list."""
        return [self._eval(child, fieldname) for child in node]

    def _Toplevel(self, node, fieldname):
        """Join the top-level clauses with the configured conjunction."""
        return self.conjunction(self._eval_all(node, fieldname))

    def _Word(self, node, fieldname):
        """A bare word: ``node[0]`` is the word text."""
        return self.make_term(fieldname, node[0])

    def _Quotes(self, node, fieldname):
        """A quoted phrase: ``node[0]`` is the text between the quotes."""
        return self.make_phrase(fieldname, node[0])

    def _Range(self, node, fieldname):
        """A term range: the node is (start fence, start, end, end fence).

        A "{" or "}" fence marks that end as exclusive.  An empty start or
        end group means that end is open and is passed on as None.
        """
        startchar, start, end, endchar = node
        startexcl = startchar == "{"
        endexcl = endchar == "}"
        starttext = endtext = None
        if start:
            starttext = start[0]
        if end:
            endtext = end[0]
        return self.make_range(fieldname, starttext, endtext,
                               startexcl, endexcl)

    def _Wildcard(self, node, fieldname):
        """A wildcard word: ``node[0]`` is the pattern text."""
        return self.make_wildcard(fieldname, node[0])

    def _And(self, node, fieldname):
        """An explicit AND: evaluate the children and join with make_and."""
        return self.make_and(self._eval_all(node, fieldname))

    def _Or(self, node, fieldname):
        """An explicit OR: evaluate the children and join with make_or."""
        return self.make_or(self._eval_all(node, fieldname))

    def _AndNot(self, node, fieldname):
        """An ANDNOT: ``node[0]`` is the positive side, ``node[1]`` the
        negative side.
        """
        return self.make_andnot(self._eval(node[0], fieldname),
                                self._eval(node[1], fieldname))

    def _Not(self, node, fieldname):
        """A negated unit: ``node[0]`` is the unit being negated."""
        return self.make_not(self._eval(node[0], fieldname))

    def _Group(self, node, fieldname):
        """A parenthesized group, joined with the configured conjunction."""
        return self.conjunction(self._eval_all(node, fieldname))

    def _Field(self, node, fieldname):
        """A fielded unit: ``node[0]`` is the field name, which replaces the
        inherited ``fieldname`` when evaluating ``node[1]``.
        """
        return self._eval(node[1], node[0])

    def _Boost(self, node, fieldname):
        """A boosted unit: evaluate ``node[0]`` and set its ``boost``
        attribute to ``float(node[1])``.
        """
        obj = self._eval(node[0], fieldname)
        # NOTE(review): this mutates the evaluated query in place. make_term
        # can return the module-level NullQuery; confirm that setting boost
        # on it is harmless.
        obj.boost = float(node[1])
        return obj
335
336
class MultifieldParser(QueryParser):
    """A subclass of QueryParser. Instead of assigning unfielded clauses
    to a default field, this class transforms them into an OR clause that
    searches a list of fields. For example, if the list of multi-fields
    is "f1", "f2" and the query string is "hello there", the class will
    parse it as "(f1:hello OR f2:hello) (f1:there OR f2:there)". This is
    very useful when you have two textual fields (e.g. "title" and
    "content") you want to search by default.
    """

    __inittypes__ = dict(fieldnames=list, schema="whoosh.fields.Schema",
                         conjunction="whoosh.query.Query",
                         termclass="whoosh.query.Query")

    def __init__(self, fieldnames, schema=None, conjunction=And,
                 termclass=Term):
        """
        :param fieldnames: the field names to search for any clause that has
            no explicit field.
        :param schema: passed through to QueryParser.
        :param conjunction: passed through to QueryParser.
        :param termclass: passed through to QueryParser.
        """
        # The default field is None; _make() treats a None field name as
        # "search every field in self.fieldnames".
        super(MultifieldParser, self).__init__(None, schema=schema,
                                               conjunction=conjunction,
                                               termclass=termclass)
        self.fieldnames = fieldnames

    def _make(self, methodname, fieldname, *args):
        """Call the inherited ``make_*`` method named ``methodname``.

        With an explicit ``fieldname`` this is a plain pass-through.  With
        ``fieldname`` None, the method is called once per entry in
        ``self.fieldnames`` and the results are joined with ``make_or``.
        """
        method = getattr(super(MultifieldParser, self), methodname)
        if fieldname is None:
            # Go through the make_or hook rather than Or directly, so a
            # subclass that overrides make_or is honoured here as well.
            return self.make_or([method(fn, *args) for fn in self.fieldnames])
        return method(fieldname, *args)

    def make_term(self, fieldname, text):
        return self._make("make_term", fieldname, text)

    def make_range(self, fieldname, start, end, startexcl, endexcl):
        return self._make("make_range", fieldname, start, end,
                          startexcl, endexcl)

    def make_wildcard(self, fieldname, text):
        return self._make("make_wildcard", fieldname, text)

    def make_phrase(self, fieldname, text):
        return self._make("make_phrase", fieldname, text)
373
374
375
376
377
378
Note: See TracBrowser for help on using the browser.