| 1 | |
|---|
| 2 | |
|---|
| 3 | |
|---|
| 4 | |
|---|
| 5 | |
|---|
| 6 | |
|---|
| 7 | |
|---|
| 8 | |
|---|
| 9 | |
|---|
| 10 | |
|---|
| 11 | |
|---|
| 12 | |
|---|
| 13 | |
|---|
| 14 | |
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | from whoosh.query import Term, DisjunctionMax, And, Or, AndMaybe, AndNot, Phrase |
|---|
| 18 | |
|---|
| 19 | |
|---|
class SimpleParser(object):
    """Parse a simple search string into a query tree for a single field.

    The input is split on whitespace. A word that starts with ``+`` is
    required, a word that starts with ``-`` is excluded, and text enclosed in
    double quotes at the start of a word is kept together as one clause.
    Every other word is optional.
    """

    def __init__(self, fieldname, schema=None, termclass=Term, phraseclass=Phrase,
                 minmatch=0, minpercent=0.75,
                 phrasefields=None):
        """Store the parser configuration.

        :param fieldname: name of the field the generated clauses search.
        :param schema: optional mapping from field name to field object; when
            given, clause text is passed through ``field.format.analyze``.
        :param termclass: callable used to build a single-token clause.
        :param phraseclass: callable used to build a multi-token clause.
        :param minmatch: lower bound on matching clauses; the number of
            required clauses is subtracted before it is handed to ``Or``.
        :param minpercent: fraction of all positive (required + optional)
            clauses used to derive the ``Or`` ``minmatch`` value.
        :param phrasefields: stored on the instance; not read by this class.
        """
        self.fieldname = fieldname
        self.schema = schema
        self.termclass = termclass
        self.phraseclass = phraseclass
        self.minmatch = minmatch
        self.minpercent = minpercent
        self.phrasefields = phrasefields

    def _split(self, input):
        """Break the input string into a flat list of markers and text.

        :returns: a list whose items are either strings (a bare word, or the
            contents of a double-quoted span) or the markers ``True`` /
            ``False`` emitted for a ``+`` / ``-`` found at the start of a
            word.
        """
        clauses = []
        length = len(input)
        pos = 0
        start = 0  # index at which the word currently being scanned began
        while pos < length:
            c = input[pos]
            wordstart = start == pos
            if wordstart and c == '"':
                end = input.find('"', pos + 1)
                if end > pos + 1:
                    clauses.append(input[pos + 1:end])
                    pos = end + 1
                else:
                    # Unterminated quote or empty "" pair: skip this quote
                    # character and keep scanning.
                    pos += 1
                start = pos
            elif wordstart and c in "+-":
                # True marks the next clause as required, False as excluded.
                clauses.append(c == "+")
                pos += 1
                start = pos
            elif c.isspace():
                if not wordstart:
                    clauses.append(input[start:pos])
                pos += 1
                start = pos
            else:
                pos += 1

        # Flush the word still open when the input ends. Comparing against
        # len(input) (not len(input) - 1) keeps a trailing one-character word.
        if start < length:
            clauses.append(input[start:])

        return clauses

    def _sort(self, parts):
        """Distribute the output of ``_split`` into clause categories.

        A ``True`` / ``False`` marker routes only the next text item to the
        required / excluded list; afterwards routing falls back to optional.

        :returns: a tuple ``(reqs, opts, nots, phrase)`` where ``phrase``
            holds every text item in its original order.
        """
        reqs = []
        opts = []
        nots = []
        phrase = []
        nextlist = opts
        for part in parts:
            if part is True:
                nextlist = reqs
            elif part is False:
                nextlist = nots
            else:
                nextlist.append(part)
                phrase.append(part)
                nextlist = opts

        return (reqs, opts, nots, phrase)

    def get_term_text(self, fieldname, text, **kwargs):
        """Return the list of token texts for ``text`` in the given field.

        Without a schema the text is returned unchanged as a one-item list.
        With a schema, the field's ``format.analyze`` is called with
        ``mode="query"`` plus any extra keyword arguments.

        :raises Exception: if the schema field has no format.
        """
        if not self.schema:
            return [text]

        field = self.schema[fieldname]
        if not field.format:
            raise Exception("%s field has no format" % field)
        return [token.text
                for token in field.format.analyze(text, mode="query", **kwargs)]

    def make_basic_clause(self, fieldname, text, boost=1.0):
        """Build a term clause (one token) or phrase clause (several tokens)."""
        parts = self.get_term_text(fieldname, text)
        if len(parts) > 1:
            return self.phraseclass(fieldname, parts, boost=boost)
        # NOTE(review): if the analyzer yields no tokens, parts[0] raises
        # IndexError; decide whether an empty clause should be skipped instead.
        return self.termclass(fieldname, parts[0], boost=boost)

    def make_clause(self, text, boost=1.0):
        """Build a scoring clause for ``text`` in this parser's field."""
        return self.make_basic_clause(self.fieldname, text, boost=boost)

    def make_filter_clause(self, text):
        """Build a clause for excluded (``-``) text in this parser's field."""
        return self.make_basic_clause(self.fieldname, text)

    def parse(self, input, normalize=True):
        """Parse ``input`` and return the resulting query object.

        Optional clauses are combined with ``Or``; required clauses, if any,
        wrap it as ``AndMaybe(And(reqs), ...)``; excluded clauses, if any,
        wrap the result as ``AndNot(..., Or(nots))``.

        :param input: the search string.
        :param normalize: when true, return ``q.normalize()`` instead of the
            raw query tree.
        """
        reqs, opts, nots, _ = self._sort(self._split(input))

        reqs = [self.make_clause(text) for text in reqs]
        opts = [self.make_clause(text) for text in opts]
        nots = [self.make_filter_clause(text) for text in nots]

        # Required clauses already count toward both thresholds, so they are
        # subtracted from what the optional clauses still have to satisfy.
        pctmatch = int((len(reqs) + len(opts)) * self.minpercent) - len(reqs)
        minmatch = max(pctmatch, self.minmatch - len(reqs), 0)

        q = Or(opts, minmatch=minmatch)
        if reqs:
            q = AndMaybe(And(reqs), q)
        if nots:
            q = AndNot(q, Or(nots))

        if normalize:
            q = q.normalize()
        return q
|---|
| 128 | |
|---|
| 129 | |
|---|
class DisMaxParser(SimpleParser):
    """SimpleParser variant that looks for each clause in several fields.

    Each scoring clause becomes a ``DisjunctionMax`` over one basic clause
    per field in ``fieldboosts``; each excluded clause becomes an ``Or`` over
    the same fields.
    """

    def __init__(self, fieldboosts, schema=None, termclass=Term, phraseclass=Phrase,
                 minmatch=0, minpercent=0.75, tiebreak=0.0,
                 phrasefields=None):
        """Store the parser configuration.

        :param fieldboosts: mapping from field name to the boost applied to
            clauses built for that field.
        :param tiebreak: passed through to ``DisjunctionMax``.

        The remaining parameters are stored under the same attribute names
        that ``SimpleParser`` uses.
        """
        # The base initializer is not called: this parser has no single
        # ``fieldname``; clauses are built per field from ``fieldboosts``.
        self.fieldboosts = fieldboosts
        self.schema = schema
        self.termclass = termclass
        self.phraseclass = phraseclass
        self.minmatch = minmatch
        self.minpercent = minpercent
        self.tiebreak = tiebreak
        self.phrasefields = phrasefields

    def make_clause(self, text):
        """Build a ``DisjunctionMax`` of ``text`` across all boosted fields."""
        # .items() (rather than .iteritems()) works on Python 2 and Python 3.
        clauses = [self.make_basic_clause(fieldname, text, boost=boost)
                   for fieldname, boost in self.fieldboosts.items()]
        return DisjunctionMax(clauses, tiebreak=self.tiebreak)

    def make_filter_clause(self, text):
        """Build an ``Or`` of ``text`` across all fields, without boosts."""
        # Iterating the mapping yields its keys on Python 2 and Python 3.
        return Or([self.make_basic_clause(fieldname, text)
                   for fieldname in self.fieldboosts])
|---|
| 151 | |
|---|
| 152 | |
|---|
| 153 | |
|---|
if __name__ == "__main__":
    # Ad-hoc demo: print the parsed query for one single-field and one
    # multi-field example. The single-argument call form of print behaves
    # the same on Python 2 and Python 3.
    print(SimpleParser("a").parse('alfa +bravo -"charlie delta" echo'))
    print(DisMaxParser({"a": 1.0, "b": 0.5}, minpercent=0.8).parse('alfa bravo charlie delta echo foxtrot golf hotel india'))
|---|
| 157 | |
|---|
| 158 | |
|---|
| 159 | |
|---|