| 1 | """ |
|---|
| 2 | This module contains the default search query parser. |
|---|
| 3 | |
|---|
| 4 | This uses the excellent Pyparsing module |
|---|
| 5 | (http://pyparsing.sourceforge.net/) to parse search query strings |
|---|
| 6 | into nodes from the query module. |
|---|
| 7 | |
|---|
| 8 | This parser handles: |
|---|
| 9 | |
|---|
| 10 | * 'AND', 'OR', 'NOT' |
|---|
| 11 | * grouping with parentheses |
|---|
| 12 | * quoted phrase searching |
|---|
| 13 | * wildcards, e.g. help* |
|---|
| 14 | * ranges, e.g. [a TO b] |
|---|
| 15 | * fields, e.g. title:whoosh |
|---|
| 16 | |
|---|
| 17 | This parser was originally based on the searchparser example code available at: |
|---|
| 18 | |
|---|
| 19 | http://pyparsing.wikispaces.com/space/showimage/searchparser.py |
|---|
| 20 | """ |
|---|
| 21 | |
|---|
| 22 | |
|---|
| 23 | |
|---|
| 24 | |
|---|
| 25 | |
|---|
| 26 | |
|---|
| 27 | |
|---|
| 28 | |
|---|
| 29 | |
|---|
| 30 | |
|---|
| 31 | |
|---|
| 32 | |
|---|
| 33 | |
|---|
| 34 | |
|---|
| 35 | |
|---|
| 36 | |
|---|
| 37 | |
|---|
| 38 | |
|---|
| 39 | |
|---|
| 40 | |
|---|
| 41 | |
|---|
| 42 | |
|---|
| 43 | |
|---|
| 44 | |
|---|
| 45 | |
|---|
| 46 | |
|---|
| 47 | |
|---|
| 48 | |
|---|
| 49 | |
|---|
| 50 | |
|---|
| 51 | |
|---|
| 52 | |
|---|
| 53 | |
|---|
| 54 | |
|---|
| 55 | |
|---|
| 56 | from whoosh.support.pyparsing import (printables, alphanums, OneOrMore, |
|---|
| 57 | Group, Combine, Suppress, Optional, FollowedBy, |
|---|
| 58 | Literal, CharsNotIn, Word, Keyword, |
|---|
| 59 | Empty, White, Forward, QuotedString, |
|---|
| 60 | StringEnd) |
|---|
| 61 | from whoosh.query import * |
|---|
| 62 | |
|---|
| 63 | |
|---|
def _make_default_parser():
    """Build the pyparsing grammar for the default query language.

    Returns the ``parseString`` method of the top-level grammar element.
    The first item of a successful parse is the group named "Toplevel";
    the groups nested inside it carry the result names "Word", "Wildcard",
    "Range", "Quotes", "Group", "Boost", "Field", "Not", "And", "Or" and
    "AndNot".
    """
    backslash = "\\"

    # Term text: runs of characters that have no special meaning, where a
    # backslash (itself dropped from the result) lets exactly one following
    # printable or whitespace character through as part of the term.
    plain_run = CharsNotIn('\\*?^():"{}[] ')
    escaped = Suppress(backslash) + (Word(printables, exact=1) | White(exact=1))
    term_text = Combine(OneOrMore(plain_run | escaped))

    # An ordinary word.
    word = Group(term_text).setResultsName("Word")

    # A wildcard term contains runs of "?" and/or "*". It either starts
    # with term text followed by at least one wild run, or starts with a
    # wild run that is followed by more text or by whitespace/end of input.
    wild_run = Word("?*")
    text_then_wild = term_text + OneOrMore(wild_run + Optional(term_text))
    after_wild = OneOrMore(term_text + Optional(wild_run)) | FollowedBy(White() | StringEnd())
    wild_first = wild_run + after_wild
    wildcard_term = Group(Combine(text_then_wild | wild_first)).setResultsName("Wildcard")

    # A range: an opening "[" or "{", one or two bounds around the keyword
    # TO, and a closing "]" or "}". A missing bound is kept as an empty
    # group so a Range node always has four items.
    fence_open = Literal("[") | Literal("{")
    fence_close = Literal("]") | Literal("}")
    bound = QuotedString('"') | term_text
    missing_start = Group(Empty()) + Suppress(Keyword("TO") + White()) + Group(bound)
    missing_end = Group(bound) + Suppress(White() + Keyword("TO")) + Group(Empty())
    two_sided = Group(bound) + Suppress(White() + Keyword("TO") + White()) + Group(bound)
    range_body = two_sided | missing_start | missing_end
    term_range = Group(fence_open + range_body + fence_close).setResultsName("Range")

    # Anything word-like; ranges and wildcards are tried before plain words.
    word_like = term_range | wildcard_term | word

    # A double-quoted phrase.
    phrase = Group(QuotedString('"')).setResultsName("Quotes")

    # Forward declaration: parenthesised groups and the AND/OR operators
    # refer back to the full expression.
    expr = Forward()

    # Parentheses may enclose any expression.
    paren_group = Group((Suppress("(") + expr + Suppress(")"))).setResultsName("Group")

    # A word-like thing or phrase may be followed by "^" and a number that
    # starts with a digit and continues with digits or dots.
    boostable = word_like | phrase
    boosted = Group(boostable + Suppress("^") + Word("0123456789", ".0123456789")).setResultsName("Boost")

    # A group, boosted unit or plain unit may be prefixed with "name:",
    # where the name is made of alphanumerics and underscores.
    fieldable = paren_group | boosted | boostable
    fielded = Group(Word(alphanums + "_") + Suppress(':') + fieldable).setResultsName("Field")

    # A unit of content, with or without a field prefix.
    unit = fielded | fieldable

    # "not" (matched case-insensitively) followed by whitespace negates
    # the unit after it.
    negated = Group(Suppress(Keyword("not", caseless=True)) + Suppress(White()) + unit).setResultsName("Not")
    clause = negated | unit

    # The binary operator keywords are matched case-sensitively.
    and_kw = Keyword("AND", caseless=False)
    or_kw = Keyword("OR", caseless=False)
    andnot_kw = Keyword("ANDNOT", caseless=False)

    def infix(left, keyword, right):
        # left <whitespace> KEYWORD <whitespace> right, keeping only the
        # two operands in the parse result.
        return left + Suppress(White()) + Suppress(keyword) + Suppress(White()) + right

    # AND and OR take a whole expression on their right-hand side; ANDNOT
    # takes a single un-negated unit on each side.
    and_clause = Group(infix(clause, and_kw, expr)).setResultsName("And")
    or_clause = Group(infix(clause, or_kw, expr)).setResultsName("Or")
    andnot_clause = Group(infix(unit, andnot_kw, unit)).setResultsName("AndNot")

    # An expression is any mix of operator clauses, bare clauses and
    # (discarded) whitespace, or nothing at all.
    expr << (OneOrMore(and_clause | or_clause | andnot_clause | clause | Suppress(White())) | Empty())

    # The whole input must be consumed.
    grammar = Group(expr).setResultsName("Toplevel") + StringEnd()

    return grammar.parseString


# Built once at import time and used as the parser of every QueryParser.
DEFAULT_PARSER = _make_default_parser()
|---|
| 145 | |
|---|
| 146 | |
|---|
| 147 | |
|---|
| 148 | |
|---|
class PyparsingBasedParser(object):
    """Shared machinery for parsers that walk a pyparsing result tree.

    The methods below read ``self.schema``, ``self.parser``,
    ``self.default_field`` and ``self.termclass`` but never assign them;
    they have to be set elsewhere. Every named node of the parse tree is
    dispatched to a method called ``_<result name>``, which this class
    does not define either.
    """

    def _field(self, fieldname):
        """Return ``self.schema[fieldname]``, or None when the parser's
        schema is unset or otherwise falsy.
        """
        if not self.schema:
            return None
        return self.schema[fieldname]

    def parse(self, input, normalize=True):
        """Parses the input string and returns a Query object/tree.

        The result may be None when the input does not produce any valid
        query, and malformed input may raise a variety of exceptions.

        :param input: the unicode string to parse.
        :param normalize: whether to call normalize() on the query
            object/tree before returning it. Leave this on unless you are
            debugging the parser output.
        :rtype: :class:`whoosh.query.Query`
        """

        # The parser's result holds the top-level group as its first item.
        tree = self.parser(input)[0]
        query = self._eval(tree, self.default_field)
        if not (query and normalize):
            return query
        return query.normalize()

    def _eval(self, node, fieldname):
        """Dispatch ``node`` to the method named after its result name,
        prefixed with an underscore, and return that method's result.
        """
        handler = getattr(self, "_" + node.getName())
        return handler(node, fieldname)

    def get_term_text(self, field, text, **kwargs):
        """Return the first item ``field.process_text`` produces for
        ``text`` in "query" mode, or None when it produces nothing.

        Extra keyword arguments are passed through to ``process_text``.
        """
        for first in field.process_text(text, mode="query", **kwargs):
            return first
        return None

    def make_term(self, fieldname, text):
        """Build the query for a single bare term.

        A field that has a truthy ``parse_query`` gets to build the query
        itself. Otherwise the text is run through the field (when there is
        one) and wrapped in ``self.termclass``; ``NullQuery`` is returned
        when no text is left.
        """
        field = self._field(fieldname)
        if field:
            if field.parse_query:
                return field.parse_query(fieldname, text)
            text = self.get_term_text(field, text)

        return NullQuery if text is None else self.termclass(fieldname, text)

    def make_phrase(self, fieldname, text):
        """Build the query for a quoted phrase.

        Without a field object the text is simply split on single spaces.
        With one, the field's ``parse_query`` is used when truthy;
        otherwise the text is processed by the field, and zero or one
        resulting words give a ``self.termclass`` query instead of a
        Phrase.
        """
        field = self._field(fieldname)
        if not field:
            return Phrase(fieldname, text.split(" "))

        if field.parse_query:
            return field.parse_query(fieldname, text)

        words = list(field.process_text(text, mode="query"))
        if len(words) > 1:
            return Phrase(fieldname, words)
        # No words at all falls back to a term with empty text.
        return self.termclass(fieldname, words[0] if words else u'')

    def make_wildcard(self, fieldname, text):
        """Build a Wildcard query, first passing the pattern through the
        field (untokenized, stop words kept) when a field object exists.
        """
        field = self._field(fieldname)
        if field:
            options = dict(tokenize=False, removestops=False)
            text = self.get_term_text(field, text, **options)
        return Wildcard(fieldname, text)

    def make_range(self, fieldname, start, end, startexcl, endexcl):
        """Build a TermRange query.

        Non-empty bounds are passed through the field (untokenized, stop
        words kept) when a field object exists. A missing start becomes
        the empty string and a missing end becomes ``u'\\uFFFF'``.

        :raises QueryError: if neither bound is present.
        """
        field = self._field(fieldname)
        if field:
            options = dict(tokenize=False, removestops=False)
            if start:
                start = self.get_term_text(field, start, **options)
            if end:
                end = self.get_term_text(field, end, **options)

        if not (start or end):
            raise QueryError("TermRange must have start and/or end")
        return TermRange(fieldname, start or u'', end or u'\uFFFF',
                         startexcl, endexcl)

    def make_and(self, qs):
        """Combine the queries in ``qs`` with And."""
        return And(qs)

    def make_or(self, qs):
        """Combine the queries in ``qs`` with Or."""
        return Or(qs)

    def make_andnot(self, positive, negative):
        """Build an AndNot query from ``positive`` and ``negative``."""
        return AndNot(positive, negative)

    def make_not(self, q):
        """Wrap ``q`` in Not."""
        return Not(q)
|---|
| 249 | |
|---|
| 250 | |
|---|
class QueryParser(PyparsingBasedParser):
    """The default parser for Whoosh, implementing a powerful fielded
    query language similar to Lucene's.

    Each ``_<Name>`` method below handles the parse-tree node whose result
    name is ``<Name>`` and returns the query built for it.
    """

    __inittypes__ = dict(default_field=str, schema="whoosh.fields.Schema",
                         conjunction="whoosh.query.Query", termclass="whoosh.query.Query")

    def __init__(self, default_field, schema=None,
                 conjunction=And, termclass=Term):
        """
        :param default_field: Use this as the field for any terms without
            an explicit field. For example, if the query string is
            "hello f1:there" and the default field is "f2", the parsed
            query will be as if the user had entered "f2:hello f1:there".
            This argument is required.
        :param conjunction: Use this query.Query class to join together clauses
            where the user has not explicitly specified a join. For example,
            if this is query.And, the query string "a b c" will be parsed as
            "a AND b AND c". If this is query.Or, the string will be parsed as
            "a OR b OR c".
        :param termclass: Use this query.Query class for bare terms. For example,
            query.Term or query.Variations.
        :param schema: An optional fields.Schema object. If this argument is present,
            the appropriate field will be used to tokenize terms/phrases before
            they are turned into query objects.
        """

        self.default_field = default_field
        self.conjunction = conjunction
        self.termclass = termclass
        self.schema = schema
        # Every instance uses the grammar built once at import time.
        self.parser = DEFAULT_PARSER

    def _Toplevel(self, node, fieldname):
        # Clauses at the top level are joined with the configured
        # conjunction class.
        return self.conjunction([self._eval(s, fieldname) for s in node])

    def _Word(self, node, fieldname):
        return self.make_term(fieldname, node[0])

    def _Quotes(self, node, fieldname):
        return self.make_phrase(fieldname, node[0])

    def _Range(self, node, fieldname):
        # A Range node is (opening fence, start group, end group, closing
        # fence). Curly fences mark that end of the range as exclusive.
        startchar, start, end, endchar = node
        startexcl = startchar == "{"
        endexcl = endchar == "}"
        # An empty group means that bound was left out of the query.
        starttext = endtext = None
        if start:
            starttext = start[0]
        if end:
            endtext = end[0]
        return self.make_range(fieldname, starttext, endtext, startexcl, endexcl)

    def _Wildcard(self, node, fieldname):
        return self.make_wildcard(fieldname, node[0])

    def _And(self, node, fieldname):
        return self.make_and([self._eval(s, fieldname) for s in node])

    def _Or(self, node, fieldname):
        return self.make_or([self._eval(s, fieldname) for s in node])

    def _AndNot(self, node, fieldname):
        return self.make_andnot(self._eval(node[0], fieldname),
                                self._eval(node[1], fieldname))

    def _Not(self, node, fieldname):
        return self.make_not(self._eval(node[0], fieldname))

    def _Group(self, node, fieldname):
        # Clauses inside parentheses are joined the same way as at the
        # top level.
        return self.conjunction([self._eval(s, fieldname) for s in node])

    def _Field(self, node, fieldname):
        # node[0] is the field name the user typed, node[1] the unit it
        # applies to; the inherited field name is overridden for that unit.
        return self._eval(node[1], node[0])

    def _Boost(self, node, fieldname):
        obj = self._eval(node[0], fieldname)
        # Convert first so a malformed number still raises, whatever the
        # evaluated query turned out to be.
        boost = float(node[1])
        # make_term() returns the module-level NullQuery object itself when
        # a term is analysed away. Assigning to it would change that shared
        # object for every later query, so it is left untouched.
        if obj is not NullQuery:
            obj.boost = boost
        return obj
|---|
| 335 | |
|---|
| 336 | |
|---|
class MultifieldParser(QueryParser):
    """A QueryParser variant for searching several fields by default.

    Clauses without an explicit field are not assigned to one default
    field; each is expanded into an OR over a list of field names. With
    the fields "f1" and "f2", the query string "hello there" is parsed as
    "(f1:hello OR f2:hello) (f1:there OR f2:there)". This is handy when
    two textual fields (e.g. "title" and "content") should both be
    searched by default.
    """

    __inittypes__ = dict(fieldnames=list, schema="whoosh.fields.Schema",
                         conjunction="whoosh.query.Query", termclass="whoosh.query.Query")

    def __init__(self, fieldnames, schema=None, conjunction=And, termclass=Term):
        """
        :param fieldnames: the field names that unfielded clauses are
            searched in.

        The remaining arguments are passed on to QueryParser unchanged.
        """
        # The default field is None; _make() treats None as "expand over
        # self.fieldnames".
        super(MultifieldParser, self).__init__(
            None,
            schema=schema,
            conjunction=conjunction,
            termclass=termclass,
        )
        self.fieldnames = fieldnames

    def _make(self, methodname, fieldname, *args):
        """Call the parent class's ``methodname`` for ``fieldname``, or
        once per configured field name (joined with Or) when ``fieldname``
        is None.
        """
        build = getattr(super(MultifieldParser, self), methodname)
        if fieldname is not None:
            return build(fieldname, *args)

        alternatives = []
        for name in self.fieldnames:
            alternatives.append(build(name, *args))
        return Or(alternatives)

    def make_term(self, fieldname, text):
        """Build a term query, expanded over all fields if unfielded."""
        return self._make("make_term", fieldname, text)

    def make_range(self, fieldname, start, end, startexcl, endexcl):
        """Build a range query, expanded over all fields if unfielded."""
        return self._make("make_range", fieldname, start, end, startexcl, endexcl)

    def make_wildcard(self, fieldname, text):
        """Build a wildcard query, expanded over all fields if unfielded."""
        return self._make("make_wildcard", fieldname, text)

    def make_phrase(self, fieldname, text):
        """Build a phrase query, expanded over all fields if unfielded."""
        return self._make("make_phrase", fieldname, text)
|---|
| 373 | |
|---|
| 374 | |
|---|
| 375 | |
|---|
| 376 | |
|---|
| 377 | |
|---|
| 378 | |
|---|