root/projects/whoosh/trunk/src/whoosh/searching.py @ 414

Revision 414, 25.4 KB (checked in by matt, 7 months ago)

Fixed docstring problem.

Line 
1#===============================================================================
2# Copyright 2007 Matt Chaput
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#    http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#===============================================================================
16
17"""
18This module contains classes and functions related to searching the index.
19"""
20
21
22from __future__ import division
23from heapq import heappush, heapreplace
24from math import log
25import sys, time
26
27from whoosh import classify, query, scoring
28from whoosh.scoring import Sorter, FieldSorter
29from whoosh.support.bitvector import BitVector
30
31if sys.platform == 'win32':
32    now = time.clock
33else:
34    now = time.time
35
36
37# Searcher class
38
39class Searcher(object):
40    """Wraps an :class:`~whoosh.reading.IndexReader` object and provides methods
41    for searching the index.
42    """
43   
44    def __init__(self, ixreader, weighting=scoring.BM25F):
45        """
46        :param ixreader: An :class:`~whoosh.reading.IndexReader` object for
47            the index to search.
48        :param weighting: A :class:`whoosh.scoring.Weighting` object to use to
49            score found documents.
50        """
51       
52        self.ixreader = ixreader
53       
54        # Copy attributes/methods from wrapped reader
55        for name in ("stored_fields", "postings", "vector", "vector_as", "schema"):
56            setattr(self, name, getattr(ixreader, name))
57       
58        if type(weighting) is type:
59            self.weighting = weighting()
60        else:
61            self.weighting = weighting
62       
63        self.is_closed = False
64        self._idf_cache = {}
65   
66    #def __del__(self):
67    #    if hasattr(self, "is_closed") and not self.is_closed:
68    #        self.close()
69   
70    def close(self):
71        self.ixreader.close()
72        self.is_closed = True
73   
74    def reader(self):
75        """Returns the underlying :class:`~whoosh.reading.IndexReader`."""
76        return self.ixreader
77   
78    def idf(self, fieldid, text):
79        """Calculates the Inverse Document Frequency of the
80        current term. Subclasses may want to override this.
81        """
82       
83        fieldnum = self.fieldname_to_num(fieldid)
84        cache = self._idf_cache
85        term = (fieldnum, text)
86        if term in cache: return cache[term]
87       
88        df = self.ixreader.doc_frequency(fieldnum, text)
89        idf = log(self.ixreader.doc_count_all() / (df + 1)) + 1.0
90        cache[term] = idf
91        return idf
92   
93    def document(self, **kw):
94        """Convenience method returns the stored fields of a document
95        matching the given keyword arguments, where the keyword keys are
96        field names and the values are terms that must appear in the field.
97       
98        This method is equivalent to::
99       
100            searcher.stored_fields(searcher.document_number(<keyword args>))
101       
102        Where Searcher.documents() returns a generator, this function returns
103        either a dictionary or None. Use it when you assume the given keyword
104        arguments either match zero or one documents (i.e. at least one of the
105        fields is a unique key).
106       
107        >>> stored_fields = searcher.document(path=u"/a/b")
108        >>> if stored_fields:
109        ...   print stored_fields['title']
110        ... else:
111        ...   print "There is no document with the path /a/b"
112        """
113       
114        for p in self.documents(**kw):
115            return p
116   
117    def documents(self, **kw):
118        """Convenience method returns the stored fields of a document
119        matching the given keyword arguments, where the keyword keys are
120        field names and the values are terms that must appear in the field.
121       
122        Returns a generator of dictionaries containing the
123        stored fields of any documents matching the keyword arguments.
124       
125        >>> for stored_fields in searcher.documents(emailto=u"matt@whoosh.ca"):
126        ...   print "Email subject:", stored_fields['subject']
127        """
128       
129        ixreader = self.ixreader
130        return (ixreader.stored_fields(docnum) for docnum in self.document_numbers(**kw))
131   
132    def document_number(self, **kw):
133        """Returns the document number of the document matching the
134        given keyword arguments, where the keyword keys are
135        field names and the values are terms that must appear in the field.
136       
137        >>> docnum = searcher.document_number(path=u"/a/b")
138       
139        Where Searcher.document_numbers() returns a generator, this function returns
140        either an int or None. Use it when you assume the given keyword arguments
141        either match zero or one documents (i.e. at least one of the fields is a
142        unique key).
143       
144        :rtype: int
145        """
146       
147        for docnum in self.document_numbers(**kw):
148            return docnum
149       
150    def document_numbers(self, **kw):
151        """Returns a generator of the document numbers for documents
152        matching the given keyword arguments, where the keyword keys are
153        field names and the values are terms that must appear in the field.
154       
155        >>> docnums = list(searcher.document_numbers(emailto=u"matt@whoosh.ca"))
156        """
157       
158        q = query.And([query.Term(k, v) for k, v in kw.iteritems()])
159        q = q.normalize()
160        if q:
161            return q.docs(self)
162   
163    def key_terms(self, docnums, fieldname, numterms = 5,
164                  model = classify.Bo1Model, normalize = True):
165        """Returns the 'numterms' most important terms from the documents listed
166        (by number) in 'docnums'. You can get document numbers for the documents
167        your interested in with the document_number() and document_numbers() methods.
168       
169        >>> docnum = searcher.document_number(path=u"/a/b")
170        >>> keywords = list(searcher.key_terms([docnum], "content"))
171       
172        "Most important" is generally defined as terms that occur
173        frequently in the top hits but relatively infrequently in the collection as
174        a whole.
175       
176        :param fieldname: Look at the terms in this field. This field must store vectors.
177        :param docnums: A sequence of document numbers specifying which documents to
178            extract key terms from.
179        :param numterms: Return this number of important terms.
180        :param model: The classify.ExpansionModel to use. See the classify module.
181        """
182       
183        ixreader = self.ixreader
184        fieldnum = self.fieldname_to_num(fieldname)
185       
186        expander = classify.Expander(self, fieldname, model = model)
187        for docnum in docnums:
188            expander.add(ixreader.vector_as(docnum, fieldnum, "weight"))
189        return expander.expanded_terms(numterms, normalize = normalize)
190   
191    def search_page(self, query, pagenum, pagelen=10, **kwargs):
192        results = self.search(query, limit=pagenum * pagelen, **kwargs)
193        return ResultsPage(results, pagenum, pagelen)
194   
195    def find(self, defaultfield, querystring, **kwargs):
196        from whoosh.qparser import QueryParser
197        qp = QueryParser(defaultfield)
198        q = qp.parse(querystring)
199        return self.search(q, **kwargs)
200   
201    def search(self, query, limit=5000, sortedby=None, reverse=False, minscore=0.0001):
202        """Runs the query represented by the ``query`` object and returns a Results object.
203       
204        :param query: a :class:`whoosh.query.Query` object.
205        :param limit: the maximum number of documents to score. If you're only interested in
206            the top N documents, you can set limit=N to limit the scoring for a faster
207            search.
208        :param sortedby: if this parameter is not None, the results are sorted instead of scored.
209            If this value is a string, the results are sorted by the field named in the string.
210            If this value is a list or tuple, it is assumed to be a sequence of strings and the
211            results are sorted by the fieldnames in the sequence. Otherwise 'sortedby' should be
212            a scoring.Sorter object.
213           
214            The fields you want to sort by must be indexed.
215           
216            For example, to sort the results by the 'path' field::
217           
218                searcher.find(q, sortedby = "path")
219               
220            To sort the results by the 'path' field and then the 'category' field::
221               
222                searcher.find(q, sortedby = ("path", "category"))
223               
224            To use a sorting object::
225           
226                searcher.find(q, sortedby = scoring.FieldSorter("path", key=mykeyfn))
227           
228            Using a string or tuple simply instantiates a :class:`whoosh.scoring.FieldSorter`
229            or :class:`whoosh.scoring.MultiFieldSorter` object for you. To get a custom sort
230            order, instantiate your own ``FieldSorter`` with a ``key`` argument, or write
231            a custom :class:`whoosh.scoring.Sorter` class.
232           
233            FieldSorter and MultiFieldSorter cache the document order, using 4 bytes times
234            the number of documents in the index, and taking time to cache. To increase
235            performance, instantiate your own sorter and re-use it (but remember you need
236            to recreate it if the index changes).
237       
238        :param reverse: if ``sortedby`` is not None, this reverses the direction of the sort.
239        :param minscore: the minimum score to include in the results.
240        :rtype: :class:`Results`
241        """
242       
243        ixreader = self.ixreader
244       
245        t = now()
246        if sortedby is not None:
247            if isinstance(sortedby, basestring):
248                sorter = scoring.FieldSorter(sortedby)
249            elif isinstance(sortedby, (list, tuple)):
250                sorter = scoring.MultiFieldSorter([FieldSorter(fn) for fn in sortedby])
251            elif isinstance(sortedby, Sorter):
252                sorter = sortedby
253            else:
254                raise ValueError("sortedby argument must be a string, list, or Sorter (%r)" % sortedby)
255           
256            scored_list = sorter.order(self, query.docs(self), reverse = reverse)
257            scores = None
258            docvector = BitVector(ixreader.doc_count_all(), source = scored_list)
259            if len(scored_list) > limit:
260                scored_list = list(scored_list)[:limit]
261        else:
262            # Sort by scores
263            topdocs = TopDocs(limit, ixreader.doc_count_all())
264            final = self.weighting.final
265            topdocs.add_all(((docnum, final(self, docnum, score))
266                             for docnum, score in query.doc_scores(self)),
267                             minscore)
268           
269            best = topdocs.best()
270            if best:
271                # topdocs.best() returns a list like
272                # [(docnum, score), (docnum, score), ... ]
273                # This unpacks that into two lists: docnums and scores
274                scored_list, scores = zip(*topdocs.best())
275            else:
276                scored_list = []
277                scores = []
278           
279            docvector = topdocs.docs
280        t = now() - t
281       
282        return Results(self, query, scored_list, docvector, runtime = t, scores = scores)
283   
284    def fieldname_to_num(self, fieldid):
285        """Returns the field number of the given field name.
286        """
287        return self.schema.to_number(fieldid)
288   
289    def fieldnum_to_name(self, fieldnum):
290        """Returns the field name corresponding to the given field number.
291        """
292        return self.schema.number_to_name(fieldnum)
293   
294    def field(self, fieldid):
295        """Returns the :class:`whoosh.fields.Field` object for the given field name.
296        """
297        return self.schema[fieldid]
298
299
300class TopDocs(object):
301    """This is like a list that only remembers the top N values that are added
302    to it. This increases efficiency when you only want the top N values, since
303    you don't have to sort most of the values (once the object reaches capacity
304    and the next item to consider has a lower score than the lowest item in the
305    collection, you can just throw it away).
306   
307    The reason we use this instead of heapq.nlargest is this object keeps
308    track of all docnums that were added, even if they're not in the "top N".
309    """
310   
311    def __init__(self, capacity, max_doc, docvector = None):
312        self.capacity = capacity
313        self.docs = docvector or BitVector(max_doc)
314        self.heap = []
315        self._total = 0
316
317    def __len__(self):
318        return len(self.sorted)
319
320    def add_all(self, sequence, minscore):
321        """Adds a sequence of (item, score) pairs.
322        """
323       
324        heap = self.heap
325        docs = self.docs
326        capacity = self.capacity
327       
328        subtotal = 0
329        for docnum, score in sequence:
330            if score < minscore: continue
331           
332            docs.set(docnum)
333            subtotal += 1
334           
335            if len(heap) >= capacity:
336                if score <= heap[0][0]:
337                    continue
338                else:
339                    heapreplace(heap, (score, docnum))
340            else:
341                heappush(heap, (score, docnum))
342       
343        self._total += subtotal
344
345    def total(self):
346        """Returns the total number of documents added so far.
347        """
348       
349        return self._total
350
351    def best(self):
352        """Returns the "top N" items. Note that this call
353        involves sorting and reversing the internal queue, so you may
354        want to cache the results rather than calling this method
355        multiple times.
356        """
357       
358        # Throw away the score and just return a list of items
359        return [(item, score) for score, item in reversed(sorted(self.heap))]
360
361
362class Results(object):
363    """
364    This object is not instantiated by the user; it is returned by a Searcher.
365    This object represents the results of a search query. You can mostly
366    use it as if it was a list of dictionaries, where each dictionary
367    is the stored fields of the document at that position in the results.
368    """
369   
370    def __init__(self, searcher, query, scored_list, docvector,
371                 scores = None, runtime = 0):
372        """
373        :param searcher: the :class:`Searcher` object that produced these
374            results.
375        :param query: the original query that created these results.
376        :param scored_list: an ordered list of document numbers
377            representing the 'hits'.
378        :param docvector: a BitVector object where the indices are
379            document numbers and an 'on' bit means that document is
380            present in the results.
381        :param scores: a list of scores corresponding to the document
382            numbers in scored_list, or None if no scores are available.
383        :param runtime: the time it took to run this search.
384        """
385       
386        self.searcher = searcher
387        self.query = query
388       
389        self.scored_list = scored_list
390        self.scores = scores
391        self.docs = docvector
392        self.runtime = runtime
393   
394    def __repr__(self):
395        return "<%s/%s Results for %r runtime=%s>" % (len(self), self.docs.count(),
396                                                      self.query,
397                                                      self.runtime)
398   
399    def __len__(self):
400        """Returns the TOTAL number of documents found by this search. Note this
401        may be greater than the number of ranked documents.
402        """
403        return self.docs.count()
404   
405    def __getitem__(self, n):
406        stored_fields = self.searcher.stored_fields
407        if isinstance(n, slice):
408            return [stored_fields(i) for i in self.scored_list.__getitem__(n)]
409        else:
410            return stored_fields(self.scored_list[n])
411   
412    def __iter__(self):
413        """Yields the stored fields of each result document in ranked order.
414        """
415        stored_fields = self.searcher.stored_fields
416        for docnum in self.scored_list:
417            yield stored_fields(docnum)
418   
419    def iterslice(self, start, stop, step=1):
420        stored_fields = self.searcher.stored_fields
421        for docnum in self.scored_list[start:stop:step]:
422            yield stored_fields(docnum)
423   
424    @property
425    def total(self):
426        return self.docs.count()
427   
428    def copy(self):
429        """Returns a copy of this results object.
430        """
431       
432        # Scores might be None, so only copy if it if it's a list
433        scores = self.scores
434        if isinstance(scores, list):
435            scores = scores[:]
436       
437        # Scored_list might be a tuple, so only copy it if it's a list
438        scored_list = self.scored_list
439        if isinstance(scored_list, list):
440            scored_list = scored_list[:]
441       
442        return self.__class__(self.searcher, self.query,
443                              scored_list=scored_list, docvector=self.docs.copy(),
444                              scores=scores, runtime=self.runtime)
445   
446    def score(self, n):
447        """Returns the score for the document at the Nth position in the
448        list of results. If the search was not scored, returns None."""
449       
450        if self.scores:
451            return self.scores[n]
452        else:
453            return None
454   
455    def scored_length(self):
456        """Returns the number of RANKED documents. Note this may be fewer
457        than the total number of documents the query matched, if you used
458        the 'limit' keyword of the Searcher.search() method to limit the
459        scoring."""
460       
461        return len(self.scored_list)
462   
463    def docnum(self, n):
464        """Returns the document number of the result at position n in the
465        list of ranked documents. Use __getitem__ (i.e. Results[n]) to
466        get the stored fields directly.
467        """
468        return self.scored_list[n]
469   
470    def key_terms(self, fieldname, docs = 10, numterms = 5,
471                  model = classify.Bo1Model, normalize = True):
472        """Returns the 'numterms' most important terms from the top 'numdocs' documents
473        in these results. "Most important" is generally defined as terms that occur
474        frequently in the top hits but relatively infrequently in the collection as
475        a whole.
476       
477        :param fieldname: Look at the terms in this field. This field must store vectors.
478        :param docs: Look at this many of the top documents of the results.
479        :param terms: Return this number of important terms.
480        :param model: The classify.ExpansionModel to use. See the classify module.
481        :returns: list of unicode strings.
482        """
483       
484        docs = min(docs, self.scored_length())
485        if docs <= 0: return
486       
487        reader = self.searcher.reader()
488        fieldnum = self.searcher.fieldname_to_num(fieldname)
489       
490        expander = classify.Expander(reader, fieldname, model = model)
491        for docnum in self.scored_list[:docs]:
492            expander.add(reader.vector_as("weight", docnum, fieldnum))
493       
494        return expander.expanded_terms(numterms, normalize = normalize)
495
496    def extend(self, results):
497        """Appends hits from 'results' (that are not already in this
498        results object) to the end of these results.
499       
500        :param results: another results object.
501        """
502       
503        docs = self.docs
504        self.scored_list.extend(docnum for docnum in results.scored_list
505                                if docnum not in docs)
506        self.docs = docs | results.docs
507       
508        # TODO: merge the query terms?
509   
510    def filter(self, results):
511        """Removes any hits that are not also in the other results object.
512        """
513       
514        docs = self.docs & results.docs
515        self.scored_list = [docnum for docnum in self.scored_list if docnum in docs]
516        self.docs = docs
517   
518    def upgrade(self, results, reverse = False):
519        """Re-sorts the results so any hits that are also in 'results' appear before
520        hits not in 'results', otherwise keeping their current relative positions.
521        This does not add the documents in the other results object to this one.
522       
523        :param results: another results object.
524        :param reverse: if True, lower the position of hits in the other
525            results object instead of raising them.
526        """
527       
528        scored_list = self.scored_list
529        otherdocs = results.docs
530        arein = [docnum for docnum in scored_list if docnum in otherdocs]
531        notin = [docnum for docnum in scored_list if docnum not in otherdocs]
532       
533        if reverse:
534            self.scored_list = notin + arein
535        else:
536            self.scored_list = arein + notin
537           
538    def upgrade_and_extend(self, results):
539        """Combines the effects of extend() and increase(): hits that are
540        also in 'results' are raised. Then any hits from 'results' that are
541        not in this results object are appended to the end of these
542        results.
543       
544        :param results: another results object.
545        """
546       
547        docs = self.docs
548        otherdocs = results.docs
549        scored_list = self.scored_list
550       
551        arein = [docnum for docnum in scored_list if docnum in otherdocs]
552        notin = [docnum for docnum in scored_list if docnum not in otherdocs]
553        other = [docnum for docnum in results.scored_list if docnum not in docs]
554       
555        self.docs = docs | otherdocs
556        self.scored_list = arein + notin + other
557       
558
559class ResultsPage(object):
560    """Represents a single page out of a longer list of results, as returned
561    by :func:`whoosh.searching.Searcher.search_page`. Supports a subset of
562    the interface of the :class:`~whoosh.searching.Results` object, namely
563    getting stored fields with __getitem__ (square brackets), iterating, and
564    the ``score()`` and ``docnum()`` methods.
565   
566    The ``offset`` attribute contains the results number this page starts
567    at (numbered from 0). For example, if the page length is 10, the ``offset``
568    attribute on the second page will be ``10``.
569   
570    The ``pagecount`` attribute contains the number of pages available.
571   
572    The ``pagenum`` attribute contains the page number. This may be less than
573    the page you requested if the results had too few pages. For example, if
574    you do::
575   
576        ResultsPage(results, 5)
577       
578    but the results object only contains 3 pages worth of hits, ``pagenum``
579    will be 3.
580   
581    The ``pagelen`` attribute contains the number of results on this page
582    (which may be less than the page length you requested if this is the
583    last page of the results).
584   
585    The ``total`` attribute contains the total number of hits in the results.
586   
587    >>> mysearcher = myindex.searcher()
588    >>> pagenum = 2
589    >>> page = mysearcher.find_page(pagenum, myquery)
590    >>> print("Page %s of %s, results %s to %s of %s" %
591    ...       (pagenum, page.pagecount, page.offset+1, page.offset+page.pagelen, page.total))
592    >>> for i, fields in enumerate(page):
593    ...   print("%s. %r" % (page.offset + i + 1, fields))
594    >>> mysearcher.close()
595    """
596   
597    def __init__(self, results, pagenum, pagelen=10):
598        """
599        :param results: a :class:`~whoosh.searching.Results` object.
600        :param pagenum: which page of the results to use, numbered from ``1``.
601        :param pagelen: the number of hits per page.
602        """
603        self.results = results
604        self.pagenum = pagenum
605        self.total = len(results)
606       
607        self.pagecount = self.total // pagelen + 1
608        if pagenum > self.pagecount:
609            pagenum = self.pagecount
610        self.pagenum = pagenum
611       
612        offset = (pagenum - 1) * pagelen
613        if (offset + pagelen) > self.total:
614            pagelen = self.total - offset
615        self.offset = offset
616        self.pagelen = pagelen
617   
618    def __getitem__(self, n):
619        offset = self.offset
620        if isinstance(n, slice):
621            start, stop, step = slice.indices(self.pagelen)
622            return self.results.__getitem__(slice(start + offset, stop + offset, step))
623        else:
624            return self.results.__getitem__(n + offset)
625   
626    def __iter__(self):
627        offset, pagelen = self.offset, self.pagelen
628        return self.results.iterslice(offset, offset+pagelen)
629           
630    def score(self, n):
631        """Returns the score of the hit at the nth position on this page.
632        """
633        return self.results.score(n + self.offset)
634   
635    def docnum(self, n):
636        """Returns the document number of the hit at the nth position on this page.
637        """
638        return self.results.scored_list[n + self.offset]
639       
640
641if __name__ == '__main__':
642    pass
643
644
645
646
647
Note: See TracBrowser for help on using the browser.