| 1 | |
|---|
| 2 | |
|---|
| 3 | |
|---|
| 4 | |
|---|
| 5 | |
|---|
| 6 | |
|---|
| 7 | |
|---|
| 8 | |
|---|
| 9 | |
|---|
| 10 | |
|---|
| 11 | |
|---|
| 12 | |
|---|
| 13 | |
|---|
| 14 | |
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | """ |
|---|
| 18 | This module contains classes and functions related to searching the index. |
|---|
| 19 | """ |
|---|
| 20 | |
|---|
| 21 | |
|---|
| 22 | from __future__ import division |
|---|
| 23 | from heapq import heappush, heapreplace |
|---|
| 24 | from math import log |
|---|
| 25 | import sys, time |
|---|
| 26 | |
|---|
| 27 | from whoosh import classify, query, scoring |
|---|
| 28 | from whoosh.scoring import Sorter, FieldSorter |
|---|
| 29 | from whoosh.support.bitvector import BitVector |
|---|
| 30 | |
|---|
| 31 | if sys.platform == 'win32': |
|---|
| 32 | now = time.clock |
|---|
| 33 | else: |
|---|
| 34 | now = time.time |
|---|
| 35 | |
|---|
| 36 | |
|---|
| 37 | |
|---|
| 38 | |
|---|
| 39 | class Searcher(object): |
|---|
| 40 | """Wraps an :class:`~whoosh.reading.IndexReader` object and provides methods |
|---|
| 41 | for searching the index. |
|---|
| 42 | """ |
|---|
| 43 | |
|---|
| 44 | def __init__(self, ixreader, weighting=scoring.BM25F): |
|---|
| 45 | """ |
|---|
| 46 | :param ixreader: An :class:`~whoosh.reading.IndexReader` object for |
|---|
| 47 | the index to search. |
|---|
| 48 | :param weighting: A :class:`whoosh.scoring.Weighting` object to use to |
|---|
| 49 | score found documents. |
|---|
| 50 | """ |
|---|
| 51 | |
|---|
| 52 | self.ixreader = ixreader |
|---|
| 53 | |
|---|
| 54 | |
|---|
| 55 | for name in ("stored_fields", "postings", "vector", "vector_as", "schema"): |
|---|
| 56 | setattr(self, name, getattr(ixreader, name)) |
|---|
| 57 | |
|---|
| 58 | if type(weighting) is type: |
|---|
| 59 | self.weighting = weighting() |
|---|
| 60 | else: |
|---|
| 61 | self.weighting = weighting |
|---|
| 62 | |
|---|
| 63 | self.is_closed = False |
|---|
| 64 | self._idf_cache = {} |
|---|
| 65 | |
|---|
| 66 | |
|---|
| 67 | |
|---|
| 68 | |
|---|
| 69 | |
|---|
| 70 | def close(self): |
|---|
| 71 | self.ixreader.close() |
|---|
| 72 | self.is_closed = True |
|---|
| 73 | |
|---|
| 74 | def reader(self): |
|---|
| 75 | """Returns the underlying :class:`~whoosh.reading.IndexReader`.""" |
|---|
| 76 | return self.ixreader |
|---|
| 77 | |
|---|
| 78 | def idf(self, fieldid, text): |
|---|
| 79 | """Calculates the Inverse Document Frequency of the |
|---|
| 80 | current term. Subclasses may want to override this. |
|---|
| 81 | """ |
|---|
| 82 | |
|---|
| 83 | fieldnum = self.fieldname_to_num(fieldid) |
|---|
| 84 | cache = self._idf_cache |
|---|
| 85 | term = (fieldnum, text) |
|---|
| 86 | if term in cache: return cache[term] |
|---|
| 87 | |
|---|
| 88 | df = self.ixreader.doc_frequency(fieldnum, text) |
|---|
| 89 | idf = log(self.ixreader.doc_count_all() / (df + 1)) + 1.0 |
|---|
| 90 | cache[term] = idf |
|---|
| 91 | return idf |
|---|
| 92 | |
|---|
| 93 | def document(self, **kw): |
|---|
| 94 | """Convenience method returns the stored fields of a document |
|---|
| 95 | matching the given keyword arguments, where the keyword keys are |
|---|
| 96 | field names and the values are terms that must appear in the field. |
|---|
| 97 | |
|---|
| 98 | This method is equivalent to:: |
|---|
| 99 | |
|---|
| 100 | searcher.stored_fields(searcher.document_number(<keyword args>)) |
|---|
| 101 | |
|---|
| 102 | Where Searcher.documents() returns a generator, this function returns |
|---|
| 103 | either a dictionary or None. Use it when you assume the given keyword |
|---|
| 104 | arguments either match zero or one documents (i.e. at least one of the |
|---|
| 105 | fields is a unique key). |
|---|
| 106 | |
|---|
| 107 | >>> stored_fields = searcher.document(path=u"/a/b") |
|---|
| 108 | >>> if stored_fields: |
|---|
| 109 | ... print stored_fields['title'] |
|---|
| 110 | ... else: |
|---|
| 111 | ... print "There is no document with the path /a/b" |
|---|
| 112 | """ |
|---|
| 113 | |
|---|
| 114 | for p in self.documents(**kw): |
|---|
| 115 | return p |
|---|
| 116 | |
|---|
| 117 | def documents(self, **kw): |
|---|
| 118 | """Convenience method returns the stored fields of a document |
|---|
| 119 | matching the given keyword arguments, where the keyword keys are |
|---|
| 120 | field names and the values are terms that must appear in the field. |
|---|
| 121 | |
|---|
| 122 | Returns a generator of dictionaries containing the |
|---|
| 123 | stored fields of any documents matching the keyword arguments. |
|---|
| 124 | |
|---|
| 125 | >>> for stored_fields in searcher.documents(emailto=u"matt@whoosh.ca"): |
|---|
| 126 | ... print "Email subject:", stored_fields['subject'] |
|---|
| 127 | """ |
|---|
| 128 | |
|---|
| 129 | ixreader = self.ixreader |
|---|
| 130 | return (ixreader.stored_fields(docnum) for docnum in self.document_numbers(**kw)) |
|---|
| 131 | |
|---|
| 132 | def document_number(self, **kw): |
|---|
| 133 | """Returns the document number of the document matching the |
|---|
| 134 | given keyword arguments, where the keyword keys are |
|---|
| 135 | field names and the values are terms that must appear in the field. |
|---|
| 136 | |
|---|
| 137 | >>> docnum = searcher.document_number(path=u"/a/b") |
|---|
| 138 | |
|---|
| 139 | Where Searcher.document_numbers() returns a generator, this function returns |
|---|
| 140 | either an int or None. Use it when you assume the given keyword arguments |
|---|
| 141 | either match zero or one documents (i.e. at least one of the fields is a |
|---|
| 142 | unique key). |
|---|
| 143 | |
|---|
| 144 | :rtype: int |
|---|
| 145 | """ |
|---|
| 146 | |
|---|
| 147 | for docnum in self.document_numbers(**kw): |
|---|
| 148 | return docnum |
|---|
| 149 | |
|---|
| 150 | def document_numbers(self, **kw): |
|---|
| 151 | """Returns a generator of the document numbers for documents |
|---|
| 152 | matching the given keyword arguments, where the keyword keys are |
|---|
| 153 | field names and the values are terms that must appear in the field. |
|---|
| 154 | |
|---|
| 155 | >>> docnums = list(searcher.document_numbers(emailto=u"matt@whoosh.ca")) |
|---|
| 156 | """ |
|---|
| 157 | |
|---|
| 158 | q = query.And([query.Term(k, v) for k, v in kw.iteritems()]) |
|---|
| 159 | q = q.normalize() |
|---|
| 160 | if q: |
|---|
| 161 | return q.docs(self) |
|---|
| 162 | |
|---|
| 163 | def key_terms(self, docnums, fieldname, numterms = 5, |
|---|
| 164 | model = classify.Bo1Model, normalize = True): |
|---|
| 165 | """Returns the 'numterms' most important terms from the documents listed |
|---|
| 166 | (by number) in 'docnums'. You can get document numbers for the documents |
|---|
| 167 | your interested in with the document_number() and document_numbers() methods. |
|---|
| 168 | |
|---|
| 169 | >>> docnum = searcher.document_number(path=u"/a/b") |
|---|
| 170 | >>> keywords = list(searcher.key_terms([docnum], "content")) |
|---|
| 171 | |
|---|
| 172 | "Most important" is generally defined as terms that occur |
|---|
| 173 | frequently in the top hits but relatively infrequently in the collection as |
|---|
| 174 | a whole. |
|---|
| 175 | |
|---|
| 176 | :param fieldname: Look at the terms in this field. This field must store vectors. |
|---|
| 177 | :param docnums: A sequence of document numbers specifying which documents to |
|---|
| 178 | extract key terms from. |
|---|
| 179 | :param numterms: Return this number of important terms. |
|---|
| 180 | :param model: The classify.ExpansionModel to use. See the classify module. |
|---|
| 181 | """ |
|---|
| 182 | |
|---|
| 183 | ixreader = self.ixreader |
|---|
| 184 | fieldnum = self.fieldname_to_num(fieldname) |
|---|
| 185 | |
|---|
| 186 | expander = classify.Expander(self, fieldname, model = model) |
|---|
| 187 | for docnum in docnums: |
|---|
| 188 | expander.add(ixreader.vector_as(docnum, fieldnum, "weight")) |
|---|
| 189 | return expander.expanded_terms(numterms, normalize = normalize) |
|---|
| 190 | |
|---|
| 191 | def search_page(self, query, pagenum, pagelen=10, **kwargs): |
|---|
| 192 | results = self.search(query, limit=pagenum * pagelen, **kwargs) |
|---|
| 193 | return ResultsPage(results, pagenum, pagelen) |
|---|
| 194 | |
|---|
| 195 | def find(self, defaultfield, querystring, **kwargs): |
|---|
| 196 | from whoosh.qparser import QueryParser |
|---|
| 197 | qp = QueryParser(defaultfield) |
|---|
| 198 | q = qp.parse(querystring) |
|---|
| 199 | return self.search(q, **kwargs) |
|---|
| 200 | |
|---|
| 201 | def search(self, query, limit=5000, sortedby=None, reverse=False, minscore=0.0001): |
|---|
| 202 | """Runs the query represented by the ``query`` object and returns a Results object. |
|---|
| 203 | |
|---|
| 204 | :param query: a :class:`whoosh.query.Query` object. |
|---|
| 205 | :param limit: the maximum number of documents to score. If you're only interested in |
|---|
| 206 | the top N documents, you can set limit=N to limit the scoring for a faster |
|---|
| 207 | search. |
|---|
| 208 | :param sortedby: if this parameter is not None, the results are sorted instead of scored. |
|---|
| 209 | If this value is a string, the results are sorted by the field named in the string. |
|---|
| 210 | If this value is a list or tuple, it is assumed to be a sequence of strings and the |
|---|
| 211 | results are sorted by the fieldnames in the sequence. Otherwise 'sortedby' should be |
|---|
| 212 | a scoring.Sorter object. |
|---|
| 213 | |
|---|
| 214 | The fields you want to sort by must be indexed. |
|---|
| 215 | |
|---|
| 216 | For example, to sort the results by the 'path' field:: |
|---|
| 217 | |
|---|
| 218 | searcher.find(q, sortedby = "path") |
|---|
| 219 | |
|---|
| 220 | To sort the results by the 'path' field and then the 'category' field:: |
|---|
| 221 | |
|---|
| 222 | searcher.find(q, sortedby = ("path", "category")) |
|---|
| 223 | |
|---|
| 224 | To use a sorting object:: |
|---|
| 225 | |
|---|
| 226 | searcher.find(q, sortedby = scoring.FieldSorter("path", key=mykeyfn)) |
|---|
| 227 | |
|---|
| 228 | Using a string or tuple simply instantiates a :class:`whoosh.scoring.FieldSorter` |
|---|
| 229 | or :class:`whoosh.scoring.MultiFieldSorter` object for you. To get a custom sort |
|---|
| 230 | order, instantiate your own ``FieldSorter`` with a ``key`` argument, or write |
|---|
| 231 | a custom :class:`whoosh.scoring.Sorter` class. |
|---|
| 232 | |
|---|
| 233 | FieldSorter and MultiFieldSorter cache the document order, using 4 bytes times |
|---|
| 234 | the number of documents in the index, and taking time to cache. To increase |
|---|
| 235 | performance, instantiate your own sorter and re-use it (but remember you need |
|---|
| 236 | to recreate it if the index changes). |
|---|
| 237 | |
|---|
| 238 | :param reverse: if ``sortedby`` is not None, this reverses the direction of the sort. |
|---|
| 239 | :param minscore: the minimum score to include in the results. |
|---|
| 240 | :rtype: :class:`Results` |
|---|
| 241 | """ |
|---|
| 242 | |
|---|
| 243 | ixreader = self.ixreader |
|---|
| 244 | |
|---|
| 245 | t = now() |
|---|
| 246 | if sortedby is not None: |
|---|
| 247 | if isinstance(sortedby, basestring): |
|---|
| 248 | sorter = scoring.FieldSorter(sortedby) |
|---|
| 249 | elif isinstance(sortedby, (list, tuple)): |
|---|
| 250 | sorter = scoring.MultiFieldSorter([FieldSorter(fn) for fn in sortedby]) |
|---|
| 251 | elif isinstance(sortedby, Sorter): |
|---|
| 252 | sorter = sortedby |
|---|
| 253 | else: |
|---|
| 254 | raise ValueError("sortedby argument must be a string, list, or Sorter (%r)" % sortedby) |
|---|
| 255 | |
|---|
| 256 | scored_list = sorter.order(self, query.docs(self), reverse = reverse) |
|---|
| 257 | scores = None |
|---|
| 258 | docvector = BitVector(ixreader.doc_count_all(), source = scored_list) |
|---|
| 259 | if len(scored_list) > limit: |
|---|
| 260 | scored_list = list(scored_list)[:limit] |
|---|
| 261 | else: |
|---|
| 262 | |
|---|
| 263 | topdocs = TopDocs(limit, ixreader.doc_count_all()) |
|---|
| 264 | final = self.weighting.final |
|---|
| 265 | topdocs.add_all(((docnum, final(self, docnum, score)) |
|---|
| 266 | for docnum, score in query.doc_scores(self)), |
|---|
| 267 | minscore) |
|---|
| 268 | |
|---|
| 269 | best = topdocs.best() |
|---|
| 270 | if best: |
|---|
| 271 | |
|---|
| 272 | |
|---|
| 273 | |
|---|
| 274 | scored_list, scores = zip(*topdocs.best()) |
|---|
| 275 | else: |
|---|
| 276 | scored_list = [] |
|---|
| 277 | scores = [] |
|---|
| 278 | |
|---|
| 279 | docvector = topdocs.docs |
|---|
| 280 | t = now() - t |
|---|
| 281 | |
|---|
| 282 | return Results(self, query, scored_list, docvector, runtime = t, scores = scores) |
|---|
| 283 | |
|---|
| 284 | def fieldname_to_num(self, fieldid): |
|---|
| 285 | """Returns the field number of the given field name. |
|---|
| 286 | """ |
|---|
| 287 | return self.schema.to_number(fieldid) |
|---|
| 288 | |
|---|
| 289 | def fieldnum_to_name(self, fieldnum): |
|---|
| 290 | """Returns the field name corresponding to the given field number. |
|---|
| 291 | """ |
|---|
| 292 | return self.schema.number_to_name(fieldnum) |
|---|
| 293 | |
|---|
| 294 | def field(self, fieldid): |
|---|
| 295 | """Returns the :class:`whoosh.fields.Field` object for the given field name. |
|---|
| 296 | """ |
|---|
| 297 | return self.schema[fieldid] |
|---|
| 298 | |
|---|
| 299 | |
|---|
| 300 | class TopDocs(object): |
|---|
| 301 | """This is like a list that only remembers the top N values that are added |
|---|
| 302 | to it. This increases efficiency when you only want the top N values, since |
|---|
| 303 | you don't have to sort most of the values (once the object reaches capacity |
|---|
| 304 | and the next item to consider has a lower score than the lowest item in the |
|---|
| 305 | collection, you can just throw it away). |
|---|
| 306 | |
|---|
| 307 | The reason we use this instead of heapq.nlargest is this object keeps |
|---|
| 308 | track of all docnums that were added, even if they're not in the "top N". |
|---|
| 309 | """ |
|---|
| 310 | |
|---|
| 311 | def __init__(self, capacity, max_doc, docvector = None): |
|---|
| 312 | self.capacity = capacity |
|---|
| 313 | self.docs = docvector or BitVector(max_doc) |
|---|
| 314 | self.heap = [] |
|---|
| 315 | self._total = 0 |
|---|
| 316 | |
|---|
| 317 | def __len__(self): |
|---|
| 318 | return len(self.sorted) |
|---|
| 319 | |
|---|
| 320 | def add_all(self, sequence, minscore): |
|---|
| 321 | """Adds a sequence of (item, score) pairs. |
|---|
| 322 | """ |
|---|
| 323 | |
|---|
| 324 | heap = self.heap |
|---|
| 325 | docs = self.docs |
|---|
| 326 | capacity = self.capacity |
|---|
| 327 | |
|---|
| 328 | subtotal = 0 |
|---|
| 329 | for docnum, score in sequence: |
|---|
| 330 | if score < minscore: continue |
|---|
| 331 | |
|---|
| 332 | docs.set(docnum) |
|---|
| 333 | subtotal += 1 |
|---|
| 334 | |
|---|
| 335 | if len(heap) >= capacity: |
|---|
| 336 | if score <= heap[0][0]: |
|---|
| 337 | continue |
|---|
| 338 | else: |
|---|
| 339 | heapreplace(heap, (score, docnum)) |
|---|
| 340 | else: |
|---|
| 341 | heappush(heap, (score, docnum)) |
|---|
| 342 | |
|---|
| 343 | self._total += subtotal |
|---|
| 344 | |
|---|
| 345 | def total(self): |
|---|
| 346 | """Returns the total number of documents added so far. |
|---|
| 347 | """ |
|---|
| 348 | |
|---|
| 349 | return self._total |
|---|
| 350 | |
|---|
| 351 | def best(self): |
|---|
| 352 | """Returns the "top N" items. Note that this call |
|---|
| 353 | involves sorting and reversing the internal queue, so you may |
|---|
| 354 | want to cache the results rather than calling this method |
|---|
| 355 | multiple times. |
|---|
| 356 | """ |
|---|
| 357 | |
|---|
| 358 | |
|---|
| 359 | return [(item, score) for score, item in reversed(sorted(self.heap))] |
|---|
| 360 | |
|---|
| 361 | |
|---|
| 362 | class Results(object): |
|---|
| 363 | """ |
|---|
| 364 | This object is not instantiated by the user; it is returned by a Searcher. |
|---|
| 365 | This object represents the results of a search query. You can mostly |
|---|
| 366 | use it as if it was a list of dictionaries, where each dictionary |
|---|
| 367 | is the stored fields of the document at that position in the results. |
|---|
| 368 | """ |
|---|
| 369 | |
|---|
| 370 | def __init__(self, searcher, query, scored_list, docvector, |
|---|
| 371 | scores = None, runtime = 0): |
|---|
| 372 | """ |
|---|
| 373 | :param searcher: the :class:`Searcher` object that produced these |
|---|
| 374 | results. |
|---|
| 375 | :param query: the original query that created these results. |
|---|
| 376 | :param scored_list: an ordered list of document numbers |
|---|
| 377 | representing the 'hits'. |
|---|
| 378 | :param docvector: a BitVector object where the indices are |
|---|
| 379 | document numbers and an 'on' bit means that document is |
|---|
| 380 | present in the results. |
|---|
| 381 | :param scores: a list of scores corresponding to the document |
|---|
| 382 | numbers in scored_list, or None if no scores are available. |
|---|
| 383 | :param runtime: the time it took to run this search. |
|---|
| 384 | """ |
|---|
| 385 | |
|---|
| 386 | self.searcher = searcher |
|---|
| 387 | self.query = query |
|---|
| 388 | |
|---|
| 389 | self.scored_list = scored_list |
|---|
| 390 | self.scores = scores |
|---|
| 391 | self.docs = docvector |
|---|
| 392 | self.runtime = runtime |
|---|
| 393 | |
|---|
| 394 | def __repr__(self): |
|---|
| 395 | return "<%s/%s Results for %r runtime=%s>" % (len(self), self.docs.count(), |
|---|
| 396 | self.query, |
|---|
| 397 | self.runtime) |
|---|
| 398 | |
|---|
| 399 | def __len__(self): |
|---|
| 400 | """Returns the TOTAL number of documents found by this search. Note this |
|---|
| 401 | may be greater than the number of ranked documents. |
|---|
| 402 | """ |
|---|
| 403 | return self.docs.count() |
|---|
| 404 | |
|---|
| 405 | def __getitem__(self, n): |
|---|
| 406 | stored_fields = self.searcher.stored_fields |
|---|
| 407 | if isinstance(n, slice): |
|---|
| 408 | return [stored_fields(i) for i in self.scored_list.__getitem__(n)] |
|---|
| 409 | else: |
|---|
| 410 | return stored_fields(self.scored_list[n]) |
|---|
| 411 | |
|---|
| 412 | def __iter__(self): |
|---|
| 413 | """Yields the stored fields of each result document in ranked order. |
|---|
| 414 | """ |
|---|
| 415 | stored_fields = self.searcher.stored_fields |
|---|
| 416 | for docnum in self.scored_list: |
|---|
| 417 | yield stored_fields(docnum) |
|---|
| 418 | |
|---|
| 419 | def iterslice(self, start, stop, step=1): |
|---|
| 420 | stored_fields = self.searcher.stored_fields |
|---|
| 421 | for docnum in self.scored_list[start:stop:step]: |
|---|
| 422 | yield stored_fields(docnum) |
|---|
| 423 | |
|---|
| 424 | @property |
|---|
| 425 | def total(self): |
|---|
| 426 | return self.docs.count() |
|---|
| 427 | |
|---|
| 428 | def copy(self): |
|---|
| 429 | """Returns a copy of this results object. |
|---|
| 430 | """ |
|---|
| 431 | |
|---|
| 432 | |
|---|
| 433 | scores = self.scores |
|---|
| 434 | if isinstance(scores, list): |
|---|
| 435 | scores = scores[:] |
|---|
| 436 | |
|---|
| 437 | |
|---|
| 438 | scored_list = self.scored_list |
|---|
| 439 | if isinstance(scored_list, list): |
|---|
| 440 | scored_list = scored_list[:] |
|---|
| 441 | |
|---|
| 442 | return self.__class__(self.searcher, self.query, |
|---|
| 443 | scored_list=scored_list, docvector=self.docs.copy(), |
|---|
| 444 | scores=scores, runtime=self.runtime) |
|---|
| 445 | |
|---|
| 446 | def score(self, n): |
|---|
| 447 | """Returns the score for the document at the Nth position in the |
|---|
| 448 | list of results. If the search was not scored, returns None.""" |
|---|
| 449 | |
|---|
| 450 | if self.scores: |
|---|
| 451 | return self.scores[n] |
|---|
| 452 | else: |
|---|
| 453 | return None |
|---|
| 454 | |
|---|
| 455 | def scored_length(self): |
|---|
| 456 | """Returns the number of RANKED documents. Note this may be fewer |
|---|
| 457 | than the total number of documents the query matched, if you used |
|---|
| 458 | the 'limit' keyword of the Searcher.search() method to limit the |
|---|
| 459 | scoring.""" |
|---|
| 460 | |
|---|
| 461 | return len(self.scored_list) |
|---|
| 462 | |
|---|
| 463 | def docnum(self, n): |
|---|
| 464 | """Returns the document number of the result at position n in the |
|---|
| 465 | list of ranked documents. Use __getitem__ (i.e. Results[n]) to |
|---|
| 466 | get the stored fields directly. |
|---|
| 467 | """ |
|---|
| 468 | return self.scored_list[n] |
|---|
| 469 | |
|---|
| 470 | def key_terms(self, fieldname, docs = 10, numterms = 5, |
|---|
| 471 | model = classify.Bo1Model, normalize = True): |
|---|
| 472 | """Returns the 'numterms' most important terms from the top 'numdocs' documents |
|---|
| 473 | in these results. "Most important" is generally defined as terms that occur |
|---|
| 474 | frequently in the top hits but relatively infrequently in the collection as |
|---|
| 475 | a whole. |
|---|
| 476 | |
|---|
| 477 | :param fieldname: Look at the terms in this field. This field must store vectors. |
|---|
| 478 | :param docs: Look at this many of the top documents of the results. |
|---|
| 479 | :param terms: Return this number of important terms. |
|---|
| 480 | :param model: The classify.ExpansionModel to use. See the classify module. |
|---|
| 481 | :returns: list of unicode strings. |
|---|
| 482 | """ |
|---|
| 483 | |
|---|
| 484 | docs = min(docs, self.scored_length()) |
|---|
| 485 | if docs <= 0: return |
|---|
| 486 | |
|---|
| 487 | reader = self.searcher.reader() |
|---|
| 488 | fieldnum = self.searcher.fieldname_to_num(fieldname) |
|---|
| 489 | |
|---|
| 490 | expander = classify.Expander(reader, fieldname, model = model) |
|---|
| 491 | for docnum in self.scored_list[:docs]: |
|---|
| 492 | expander.add(reader.vector_as("weight", docnum, fieldnum)) |
|---|
| 493 | |
|---|
| 494 | return expander.expanded_terms(numterms, normalize = normalize) |
|---|
| 495 | |
|---|
| 496 | def extend(self, results): |
|---|
| 497 | """Appends hits from 'results' (that are not already in this |
|---|
| 498 | results object) to the end of these results. |
|---|
| 499 | |
|---|
| 500 | :param results: another results object. |
|---|
| 501 | """ |
|---|
| 502 | |
|---|
| 503 | docs = self.docs |
|---|
| 504 | self.scored_list.extend(docnum for docnum in results.scored_list |
|---|
| 505 | if docnum not in docs) |
|---|
| 506 | self.docs = docs | results.docs |
|---|
| 507 | |
|---|
| 508 | |
|---|
| 509 | |
|---|
| 510 | def filter(self, results): |
|---|
| 511 | """Removes any hits that are not also in the other results object. |
|---|
| 512 | """ |
|---|
| 513 | |
|---|
| 514 | docs = self.docs & results.docs |
|---|
| 515 | self.scored_list = [docnum for docnum in self.scored_list if docnum in docs] |
|---|
| 516 | self.docs = docs |
|---|
| 517 | |
|---|
| 518 | def upgrade(self, results, reverse = False): |
|---|
| 519 | """Re-sorts the results so any hits that are also in 'results' appear before |
|---|
| 520 | hits not in 'results', otherwise keeping their current relative positions. |
|---|
| 521 | This does not add the documents in the other results object to this one. |
|---|
| 522 | |
|---|
| 523 | :param results: another results object. |
|---|
| 524 | :param reverse: if True, lower the position of hits in the other |
|---|
| 525 | results object instead of raising them. |
|---|
| 526 | """ |
|---|
| 527 | |
|---|
| 528 | scored_list = self.scored_list |
|---|
| 529 | otherdocs = results.docs |
|---|
| 530 | arein = [docnum for docnum in scored_list if docnum in otherdocs] |
|---|
| 531 | notin = [docnum for docnum in scored_list if docnum not in otherdocs] |
|---|
| 532 | |
|---|
| 533 | if reverse: |
|---|
| 534 | self.scored_list = notin + arein |
|---|
| 535 | else: |
|---|
| 536 | self.scored_list = arein + notin |
|---|
| 537 | |
|---|
| 538 | def upgrade_and_extend(self, results): |
|---|
| 539 | """Combines the effects of extend() and increase(): hits that are |
|---|
| 540 | also in 'results' are raised. Then any hits from 'results' that are |
|---|
| 541 | not in this results object are appended to the end of these |
|---|
| 542 | results. |
|---|
| 543 | |
|---|
| 544 | :param results: another results object. |
|---|
| 545 | """ |
|---|
| 546 | |
|---|
| 547 | docs = self.docs |
|---|
| 548 | otherdocs = results.docs |
|---|
| 549 | scored_list = self.scored_list |
|---|
| 550 | |
|---|
| 551 | arein = [docnum for docnum in scored_list if docnum in otherdocs] |
|---|
| 552 | notin = [docnum for docnum in scored_list if docnum not in otherdocs] |
|---|
| 553 | other = [docnum for docnum in results.scored_list if docnum not in docs] |
|---|
| 554 | |
|---|
| 555 | self.docs = docs | otherdocs |
|---|
| 556 | self.scored_list = arein + notin + other |
|---|
| 557 | |
|---|
| 558 | |
|---|
| 559 | class ResultsPage(object): |
|---|
| 560 | """Represents a single page out of a longer list of results, as returned |
|---|
| 561 | by :func:`whoosh.searching.Searcher.search_page`. Supports a subset of |
|---|
| 562 | the interface of the :class:`~whoosh.searching.Results` object, namely |
|---|
| 563 | getting stored fields with __getitem__ (square brackets), iterating, and |
|---|
| 564 | the ``score()`` and ``docnum()`` methods. |
|---|
| 565 | |
|---|
| 566 | The ``offset`` attribute contains the results number this page starts |
|---|
| 567 | at (numbered from 0). For example, if the page length is 10, the ``offset`` |
|---|
| 568 | attribute on the second page will be ``10``. |
|---|
| 569 | |
|---|
| 570 | The ``pagecount`` attribute contains the number of pages available. |
|---|
| 571 | |
|---|
| 572 | The ``pagenum`` attribute contains the page number. This may be less than |
|---|
| 573 | the page you requested if the results had too few pages. For example, if |
|---|
| 574 | you do:: |
|---|
| 575 | |
|---|
| 576 | ResultsPage(results, 5) |
|---|
| 577 | |
|---|
| 578 | but the results object only contains 3 pages worth of hits, ``pagenum`` |
|---|
| 579 | will be 3. |
|---|
| 580 | |
|---|
| 581 | The ``pagelen`` attribute contains the number of results on this page |
|---|
| 582 | (which may be less than the page length you requested if this is the |
|---|
| 583 | last page of the results). |
|---|
| 584 | |
|---|
| 585 | The ``total`` attribute contains the total number of hits in the results. |
|---|
| 586 | |
|---|
| 587 | >>> mysearcher = myindex.searcher() |
|---|
| 588 | >>> pagenum = 2 |
|---|
| 589 | >>> page = mysearcher.find_page(pagenum, myquery) |
|---|
| 590 | >>> print("Page %s of %s, results %s to %s of %s" % |
|---|
| 591 | ... (pagenum, page.pagecount, page.offset+1, page.offset+page.pagelen, page.total)) |
|---|
| 592 | >>> for i, fields in enumerate(page): |
|---|
| 593 | ... print("%s. %r" % (page.offset + i + 1, fields)) |
|---|
| 594 | >>> mysearcher.close() |
|---|
| 595 | """ |
|---|
| 596 | |
|---|
| 597 | def __init__(self, results, pagenum, pagelen=10): |
|---|
| 598 | """ |
|---|
| 599 | :param results: a :class:`~whoosh.searching.Results` object. |
|---|
| 600 | :param pagenum: which page of the results to use, numbered from ``1``. |
|---|
| 601 | :param pagelen: the number of hits per page. |
|---|
| 602 | """ |
|---|
| 603 | self.results = results |
|---|
| 604 | self.pagenum = pagenum |
|---|
| 605 | self.total = len(results) |
|---|
| 606 | |
|---|
| 607 | self.pagecount = self.total // pagelen + 1 |
|---|
| 608 | if pagenum > self.pagecount: |
|---|
| 609 | pagenum = self.pagecount |
|---|
| 610 | self.pagenum = pagenum |
|---|
| 611 | |
|---|
| 612 | offset = (pagenum - 1) * pagelen |
|---|
| 613 | if (offset + pagelen) > self.total: |
|---|
| 614 | pagelen = self.total - offset |
|---|
| 615 | self.offset = offset |
|---|
| 616 | self.pagelen = pagelen |
|---|
| 617 | |
|---|
| 618 | def __getitem__(self, n): |
|---|
| 619 | offset = self.offset |
|---|
| 620 | if isinstance(n, slice): |
|---|
| 621 | start, stop, step = slice.indices(self.pagelen) |
|---|
| 622 | return self.results.__getitem__(slice(start + offset, stop + offset, step)) |
|---|
| 623 | else: |
|---|
| 624 | return self.results.__getitem__(n + offset) |
|---|
| 625 | |
|---|
| 626 | def __iter__(self): |
|---|
| 627 | offset, pagelen = self.offset, self.pagelen |
|---|
| 628 | return self.results.iterslice(offset, offset+pagelen) |
|---|
| 629 | |
|---|
| 630 | def score(self, n): |
|---|
| 631 | """Returns the score of the hit at the nth position on this page. |
|---|
| 632 | """ |
|---|
| 633 | return self.results.score(n + self.offset) |
|---|
| 634 | |
|---|
| 635 | def docnum(self, n): |
|---|
| 636 | """Returns the document number of the hit at the nth position on this page. |
|---|
| 637 | """ |
|---|
| 638 | return self.results.scored_list[n + self.offset] |
|---|
| 639 | |
|---|
| 640 | |
|---|
| 641 | if __name__ == '__main__': |
|---|
| 642 | pass |
|---|
| 643 | |
|---|
| 644 | |
|---|
| 645 | |
|---|
| 646 | |
|---|
| 647 | |
|---|