root/projects/whoosh/trunk/src/whoosh/qparser/simple.py @ 404

Revision 404, 5.5 KB (checked in by matt, 8 months ago)

Reorganized qparser module into a package.
Added DisMaxParser.
Reorganized DisjunctionMax query.

Line 
1#===============================================================================
2# Copyright 2010 Matt Chaput
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#    http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#===============================================================================
16
17from whoosh.query import Term, DisjunctionMax, And, Or, AndMaybe, AndNot, Phrase
18
19
class SimpleParser(object):
    """Parser for a minimal, search-box style query syntax.

    The syntax recognised by ``_split`` is:

    * whitespace separates clauses;
    * a ``+`` at the start of a word marks the following clause as required;
    * a ``-`` at the start of a word marks the following clause as excluded;
    * text between double quotes (opened at the start of a word) is kept
      together as one clause.

    Every clause is built against the single field ``fieldname``.
    """

    def __init__(self, fieldname, schema=None, termclass=Term, phraseclass=Phrase,
                 minmatch=0, minpercent=0.75,
                 phrasefields=None):
        """
        :param fieldname: name of the field every clause is built against.
        :param schema: optional schema; when truthy, clause text is run
            through the format of ``schema[fieldname]`` before building
            the query (see ``get_term_text``).
        :param termclass: class used for single-token clauses.
        :param phraseclass: class used for multi-token clauses.
        :param minmatch: lower bound used when computing the ``minmatch``
            value handed to the ``Or`` of optional clauses; required
            clauses are subtracted from it.
        :param minpercent: fraction of the positive (required + optional)
            clauses used to derive the ``minmatch`` value.
        :param phrasefields: stored on the instance; not used by the code
            in this class.
        """
        self.fieldname = fieldname
        self.schema = schema
        self.termclass = termclass
        self.phraseclass = phraseclass
        self.minmatch = minmatch
        self.minpercent = minpercent
        self.phrasefields = phrasefields

    def _split(self, input):
        """Tokenize the raw query string.

        :returns: a list whose string items are clause texts; a ``True``
            item records a ``+`` prefix and a ``False`` item records a
            ``-`` prefix seen at the start of a word.
        """
        clauses = []
        length = len(input)
        pos = 0
        start = 0
        while pos < length:
            c = input[pos]
            # True when no character of the current word has been consumed
            # yet; quotes and +/- are only special in that position.
            wordstart = start == pos
            if wordstart and c == '"':
                end = input.find('"', pos + 1)
                if end > pos + 1:
                    clauses.append(input[pos + 1:end])
                    pos = end + 1
                else:
                    # Unterminated or empty quote: drop the quote character
                    # and carry on scanning after it.
                    pos += 1
                start = pos
            elif wordstart and c in "+-":
                clauses.append(c == "+")
                pos += 1
                start = pos
            elif c.isspace():
                if not wordstart:
                    clauses.append(input[start:pos])
                pos += 1
                start = pos
            else:
                pos += 1

        # Flush the word still pending at end of input.  The comparison must
        # be against len(input), not len(input) - 1, otherwise a trailing
        # one-character word (e.g. the "b" in "alfa b") is silently lost.
        if start < length:
            clauses.append(input[start:])

        return clauses

    def _sort(self, parts):
        """Distribute the output of ``_split`` into clause groups.

        :returns: a ``(reqs, opts, nots, phrase)`` tuple of lists of clause
            texts: required, optional and excluded clauses, plus every
            clause text in input order.
        """
        reqs = []
        opts = []
        nots = []
        phrase = []
        # A True/False marker redirects only the very next clause; after
        # each clause the target falls back to the optional list.
        target = opts
        for part in parts:
            if part is True:
                target = reqs
            elif part is False:
                target = nots
            else:
                target.append(part)
                phrase.append(part)
                target = opts

        return (reqs, opts, nots, phrase)

    def get_term_text(self, fieldname, text, **kwargs):
        """Return the list of token texts for ``text`` in ``fieldname``.

        Without a (truthy) schema the text is returned unchanged as a
        one-element list.  Otherwise it is analyzed in ``"query"`` mode by
        the format of the schema field; extra keyword arguments are passed
        on to ``analyze``.

        :raises Exception: if the schema field has no format.
        """
        if not self.schema:
            return [text]

        field = self.schema[fieldname]
        if not field.format:
            raise Exception("%s field has no format" % field)
        tokens = field.format.analyze(text, mode="query", **kwargs)
        return [token.text for token in tokens]

    def make_basic_clause(self, fieldname, text, boost=1.0):
        """Build a query for ``text`` in ``fieldname``.

        Produces a ``phraseclass`` instance when the text yields more than
        one token and a ``termclass`` instance otherwise.
        """
        parts = self.get_term_text(fieldname, text)
        if len(parts) > 1:
            return self.phraseclass(fieldname, parts, boost=boost)
        # NOTE: if analysis produced no tokens at all, parts[0] raises
        # IndexError.
        return self.termclass(fieldname, parts[0], boost=boost)

    def make_clause(self, text, boost=1.0):
        """Build the query for a required or optional clause."""
        return self.make_basic_clause(self.fieldname, text, boost=boost)

    def make_filter_clause(self, text):
        """Build the query for an excluded (``-``) clause."""
        return self.make_basic_clause(self.fieldname, text)

    def parse(self, input, normalize=True):
        """Parse ``input`` and return the resulting query object.

        :param input: the query string.
        :param normalize: when true, the result of calling ``normalize()``
            on the built query is returned instead of the raw query.
        """
        req_texts, opt_texts, not_texts, _phrase = self._sort(self._split(input))

        reqs = [self.make_clause(text) for text in req_texts]
        opts = [self.make_clause(text) for text in opt_texts]
        nots = [self.make_filter_clause(text) for text in not_texts]

        # Number of optional clauses that still have to match so that
        # `minpercent` of all positive clauses match, given that the
        # required clauses always do.
        pctmatch = int((len(reqs) + len(opts)) * self.minpercent) - len(reqs)
        minmatch = max(pctmatch, self.minmatch - len(reqs), 0)

        q = Or(opts, minmatch=minmatch)
        if reqs:
            q = AndMaybe(And(reqs), q)
        if nots:
            q = AndNot(q, Or(nots))

        if normalize:
            q = q.normalize()
        return q
128
129
class DisMaxParser(SimpleParser):
    """Variant of ``SimpleParser`` that searches several fields at once.

    Each positive clause becomes a ``DisjunctionMax`` over one sub-query
    per field in ``fieldboosts``; each excluded clause becomes an ``Or``
    over the same fields.
    """

    def __init__(self, fieldboosts, schema=None, termclass=Term, phraseclass=Phrase,
                 minmatch=0, minpercent=0.75, tiebreak=0.0,
                 phrasefields=None):
        """
        :param fieldboosts: mapping of field name to the boost given to
            that field's sub-query.
        :param tiebreak: value passed as ``tiebreak`` to every
            ``DisjunctionMax`` built by this parser.

        The remaining parameters are stored exactly as ``SimpleParser``
        stores them.
        """
        # Reuse the base initializer instead of repeating its assignments;
        # there is no single field here, so ``fieldname`` is set to None.
        super(DisMaxParser, self).__init__(None, schema=schema,
                                           termclass=termclass,
                                           phraseclass=phraseclass,
                                           minmatch=minmatch,
                                           minpercent=minpercent,
                                           phrasefields=phrasefields)
        self.fieldboosts = fieldboosts
        self.tiebreak = tiebreak

    def make_clause(self, text):
        """Build a ``DisjunctionMax`` of ``text`` across all boosted fields."""
        # items() instead of iteritems(): behaves the same on Python 2 and
        # also exists on Python 3 dictionaries.
        clauses = [self.make_basic_clause(fieldname, text, boost=boost)
                   for fieldname, boost in self.fieldboosts.items()]
        return DisjunctionMax(clauses, tiebreak=self.tiebreak)

    def make_filter_clause(self, text):
        """Build an ``Or`` of ``text`` across all fields, without boosts."""
        # Iterating the mapping directly yields its keys on Python 2 and 3.
        return Or([self.make_basic_clause(fieldname, text)
                   for fieldname in self.fieldboosts])
151       
152
153
# Ad-hoc demonstration: print the queries produced for two sample inputs.
# Single-argument parenthesised print() gives the same output under the
# Python 2 print statement and the Python 3 print function.
if __name__ == "__main__":
    print(SimpleParser("a").parse('alfa +bravo -"charlie delta" echo'))
    print(DisMaxParser({"a": 1.0, "b": 0.5}, minpercent=0.8).parse('alfa bravo charlie delta echo foxtrot golf hotel india'))
157   
158
159
Note: See TracBrowser for help on using the browser.