-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathdocset_spans.h
504 lines (411 loc) · 24.3 KB
/
docset_spans.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
// ## Important impl. requirements for all Spans
// See docset_iterators.h header comments. Depending on the type of the Span, you may want to advance
// provided iterators to the first document (i.e from 0 to whatever by invoking their next() method).
// See comments here for why that makes sense.
#pragma once
#include "docset_iterators.h"
namespace Trinity {
// DocsSetSpan::process() requires a MatchesProxy *. Its process() method
// will be invoked for every matched document.
//
// The relevant_document_provider::document() and relevant_document_provider::score()
// can be used by MatchesProxy subclasses to accomplish whatever's necessary.
class MatchesProxy {
public:
virtual void process(relevant_document_provider *) {
}
~MatchesProxy() {
}
};
// Instead of accessing documents sets directly using Iterators, using
// a DocsSetSpan makes more sense because subclasses can implement fancy schemes
// for higher performance.
//
// There only two methods subclasses override.
// process():
// process the span/range [min, max), i.e from min inclusive, to max exclusive
// and will return an estimate of the next matching document, after max(unless max == DocIDsEND)
// cost():
// return the evaluation cost; usually by considering the managed
// iterators or sub-spans.
//
// Check FilteredDocsSetSpan and GenericDocsSetSpan for how that works.
class DocsSetSpan {
protected:
// This is based on/inspired by Lucene's BooleanScorer design.
// Effectively, instead of naively merge-scanning all leaders, there is a prio.queue we use to track
// all iterators, where the iterator with the lowest documentID is the top.
//
// We then start from the top, and identify all other remaining leaders in the queue
// where their current document ID is within a window.
// The window is based on the top/first leader documentID
// and it's just a range from that document ID rounded down to SIZE(windowBase) and upto (windowBase + SIZE).
//
// Once we collect all from the PQ (there's a fast-path if we just collected one), we 'll advance
// them all (i.e call Iterator::next() on each of the decoders) until we have either drained the decoder, or the next
// document of the decoder. is outside our window. We collect all those IDs into a bitmap (See later), and if
// the decoder has not been drained, we 'll just push it back to the PQ.
//
// The core idea is that we have a bitmap that can hold SIZE bits, and we just set
// a bit in it based on a (document ID % SIZE), which works because all document IDs we are
// processing are within a SIZE range.
// We then just iterate the bitmap and identify the document IDs by considering the bits set and the window base
// That's pretty much all there is to it.
//
// This works great. The key ideas is the use of a PQ instead of a merge-scan among the leaders and the use
// of a bitmap and draining of iterators based on a document IDs window.
//
// Lucene does not consider the document and the matching terms together when computing a score; it computes a score
// for each matched Query (i.e BooleanQuery, TermQuery etc), and it just sums those scores together.
// This allows for some optimizations, but the end resuilt is that the score model is not great, and we really
// need to provide support for a very rich and powerful score model, one where the document and all matching terms
// and information about them is provided to a score callback and it can make use of that to produce a great relevant score.
//
//
// lucene uses 2k(2048), i.e shift is set to 11
// We set it to 13, which yields better performance(from 60ms down to 47ms)
// Setting to 14 yields worse perf. than 13
//
// This is a great algorithm for unions vs merge-scan, and what's more, we can also keep track
// of how many documents matched the same value (see Lucene impl.)
static constexpr std::size_t SHIFT{13};
static constexpr std::size_t SIZE{1 << SHIFT};
static constexpr std::size_t MASK{SIZE - 1};
static constexpr std::size_t SET_SIZE{SIZE / sizeof(uint64_t)};
public:
// process the span/range [min, max)
// i.e from min inclusive to max exclusive
//
// returns an estimate of the next matching document, after max(unless max == DocIDsEND)
virtual isrc_docid_t process(MatchesProxy *, const isrc_docid_t min, const isrc_docid_t max) = 0;
virtual ~DocsSetSpan() {
}
virtual uint64_t cost() = 0;
};
class GenericDocsSetSpan final
: public DocsSetSpan {
Trinity::DocsSetIterators::Iterator *const it;
public:
GenericDocsSetSpan(Trinity::DocsSetIterators::Iterator *const i)
: it{i} {
}
isrc_docid_t process(MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max) override final;
uint64_t cost() override final {
return it->cost();
}
};
// For a query such as a
// [ (1 | 10) -APPLE], where
// the cost of the NOT expression (i.e for APPLE) is lower than the cost of the required expr (i.e (1|10)), then
// it makes sense to use this DocsSetSpan. It will identify the next filtered document ID, and ask the DocsSetSpan set for req
// to process the range (prev, nextEcluded]
//
// This is faster than using a DocsSetIterators::Filter which will match required and then filter whatever's matched
class FilteredDocsSetSpan final
: public DocsSetSpan {
private:
DocsSetSpan *const req;
Trinity::DocsSetIterators::Iterator *const exclIt;
public:
FilteredDocsSetSpan(DocsSetSpan *const r, Trinity::DocsSetIterators::Iterator *const i)
: req{r}, exclIt{i} {
}
~FilteredDocsSetSpan() {
// We will assume ownership here
// not elegant, but practical
delete req;
}
isrc_docid_t process(MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max) override final;
uint64_t cost() override final {
return req->cost();
}
};
class DocsSetSpanForDisjunctions final
: public DocsSetSpan {
private:
struct Compare {
[[gnu::always_inline]] inline bool operator()(const DocsSetIterators::Iterator *a, const DocsSetIterators::Iterator *b) const noexcept {
return a->current() < b->current();
}
};
private:
uint64_t *const matching;
Switch::priority_queue<DocsSetIterators::Iterator *, Compare> pq;
DocsSetIterators::Iterator **const collected;
public:
DocsSetSpanForDisjunctions(std::vector<Trinity::DocsSetIterators::Iterator *> &its)
: matching((uint64_t *)calloc(SET_SIZE, sizeof(uint64_t))), pq(its.size() + 16), collected((DocsSetIterators::Iterator **)malloc(sizeof(DocsSetIterators::Iterator *) * (its.size() + 1))) {
for (auto it : its) {
// See comments in DocsSetSpanForDisjunctionsWithThreshold::process() collection loop
require(it->current() == 0);
it->next();
pq.push(it);
}
require(pq.size());
}
~DocsSetSpanForDisjunctions() noexcept {
std::free(matching);
std::free(collected);
}
isrc_docid_t process(MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max) override final;
uint64_t cost() override final {
uint64_t res{0};
for (auto it : pq)
res += it->cost();
return res;
}
};
class DocsSetSpanForDisjunctionsWithThreshold final
: public DocsSetSpan {
private:
struct Compare {
[[gnu::always_inline]] inline bool operator()(const DocsSetIterators::Iterator *a, const DocsSetIterators::Iterator *b) const noexcept {
return a->current() < b->current();
}
};
private:
const bool needScores;
const uint16_t matchThreshold;
uint64_t *const matching;
std::pair<double, uint32_t> *const tracker;
Switch::priority_queue<DocsSetIterators::Iterator *, Compare> pq;
DocsSetIterators::Iterator **const collected;
public:
DocsSetSpanForDisjunctionsWithThreshold(const uint16_t min, std::vector<Trinity::DocsSetIterators::Iterator *> &its, const bool ns)
: needScores{ns}, matchThreshold{min}, matching((uint64_t *)calloc(SET_SIZE, sizeof(uint64_t))), pq(its.size() + 16), collected((DocsSetIterators::Iterator **)malloc(sizeof(DocsSetIterators::Iterator *) * (its.size() + 1))), tracker((std::pair<double, uint32_t> *)calloc(SIZE, sizeof(std::pair<double, uint32_t>))) {
EXPECT(min && min <= its.size());
EXPECT(its.size() > 1);
for (auto it : its) {
// See comments in DocsSetSpanForDisjunctionsWithThreshold::process() collection loop
require(it->current() == 0);
it->next();
pq.push(it);
}
}
~DocsSetSpanForDisjunctionsWithThreshold() noexcept {
std::free(matching);
std::free(collected);
std::free(tracker);
}
isrc_docid_t process(MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max) override final;
uint64_t cost() override final {
uint64_t res{0};
for (auto it : pq)
res += it->cost();
return res;
}
};
class DocsSetSpanForPartialMatch final
: public DocsSetSpan {
private:
struct Compare final {
[[gnu::always_inline]] inline bool operator()(const DocsSetIterators::Iterator *a, const DocsSetIterators::Iterator *b) const noexcept {
return a->current() < b->current();
}
};
private:
const uint16_t matchThreshold;
uint64_t *const matching;
std::pair<double, uint32_t> *const tracker;
Switch::priority_queue<DocsSetIterators::Iterator *, Compare> pq;
DocsSetIterators::Iterator **const collected;
public:
DocsSetSpanForPartialMatch(std::vector<Trinity::DocsSetIterators::Iterator *> &its, const uint16_t min)
: matchThreshold{min}, matching((uint64_t *)calloc(SET_SIZE, sizeof(uint64_t))), pq(its.size() + 16), tracker((std::pair<double, uint32_t> *)calloc(SIZE, sizeof(std::pair<double, uint32_t>))), collected((DocsSetIterators::Iterator **)malloc(sizeof(DocsSetIterators::Iterator *) * (its.size() + 1))) {
for (auto it : its) {
// See comments in DocsSetSpanForDisjunctionsWithThreshold::process() collection loop
require(it->current() == 0);
it->next();
pq.push(it);
}
EXPECT(pq.size());
}
~DocsSetSpanForPartialMatch() noexcept {
std::free(matching);
std::free(collected);
std::free(tracker);
}
isrc_docid_t process(MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max) override final;
uint64_t cost() override final {
auto res{0};
for (auto it : pq)
res += it->cost();
return res;
}
};
// This is similar to DocsSetSpanForDisjunctions, except
// it operates on DocsSetSpan's instead, and the impl. is somewhat different to account for that, but the core algorithm is the same.
class DocsSetSpanForDisjunctionsWithSpans final
: public DocsSetSpan {
private:
struct span_ctx final {
DocsSetSpan *span;
isrc_docid_t next{0};
inline void advance(const isrc_docid_t min) {
// we can safely pass nullptr here, because we are restricting to [min, min) so
// in practice it will just advance the span to >= span but won't attempt to invoke the provided MatchesProxy::process()
process(nullptr, min, min);
}
inline void process(MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max) {
next = span->process(mp, min, max);
}
struct Compare {
[[gnu::always_inline]] inline bool operator()(const span_ctx &a, const span_ctx &b) const noexcept {
return a.next < b.next;
}
};
};
private:
uint64_t *const matching;
Switch::priority_queue<span_ctx, span_ctx::Compare> pq;
span_ctx *const collected;
struct Tracker final
: public MatchesProxy {
uint32_t m{0};
uint64_t *const matching;
queryexec_ctx *const rctx;
Tracker(uint64_t *const m, queryexec_ctx *const ctx)
: matching{m}, rctx{ctx} {
}
// See Tracker::process() def. comments
inline void reset() {
m = 0;
}
void process(relevant_document_provider *) override final;
} tracker;
private:
span_ctx advance(const isrc_docid_t);
public:
DocsSetSpanForDisjunctionsWithSpans(std::vector<DocsSetSpan *> &its);
~DocsSetSpanForDisjunctionsWithSpans() noexcept {
std::free(matching);
std::free(collected);
}
isrc_docid_t process(MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max) override final;
uint64_t cost() override final {
uint64_t res{0};
for (auto it : pq)
res += it.span->cost();
return res;
}
};
// This is similar to DocsSetSpanForDisjunctionsWithSpans, except
// that it relies on the (leads, head, tail) scheme used in
// Lucene's DisjunctionSome for efficiency (though this only makes sense if e.g
// you know that evaluating iterators can be expensive to be worth it)
class DocsSetSpanForDisjunctionsWithSpansAndCost final
: public DocsSetSpan {
private:
struct span_ctx final {
DocsSetSpan *span;
uint64_t cost;
isrc_docid_t next{0};
inline void advance(const isrc_docid_t min) {
process(nullptr, min, min);
}
inline void process(MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max) {
next = span->process(mp, min, max);
}
struct CompareByNext {
[[gnu::always_inline]] inline bool operator()(const span_ctx *const a, const span_ctx *const b) const noexcept {
return a->next < b->next;
}
};
struct CompareByCost {
[[gnu::always_inline]] inline bool operator()(const span_ctx *const a, const span_ctx *const b) const noexcept {
return a->cost < b->cost;
}
};
};
private:
std::pair<double, uint32_t> *const matchesTracker;
uint64_t *const matching;
span_ctx **const leads;
Switch::priority_queue<span_ctx *, span_ctx::CompareByNext> head;
Switch::priority_queue<span_ctx *, span_ctx::CompareByCost> tail;
const uint16_t matchThreshold;
span_ctx *const storage;
uint64_t cost_;
struct Tracker final
: public MatchesProxy {
uint32_t m{0};
std::pair<double, uint32_t> *const matchesTracker;
uint64_t *const matching;
queryexec_ctx *const rctx;
Tracker(uint64_t *const m, std::pair<double, uint32_t> *const t, queryexec_ctx *const ctx)
: matchesTracker{t}, matching{m}, rctx{ctx} {
}
// See Tracker::process() def. comments
inline void reset() {
m = 0;
}
void process(relevant_document_provider *) override final;
} tracker;
private:
span_ctx *advance(const isrc_docid_t);
span_ctx *score_window(span_ctx *const sctx, MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max);
void score_window_single(span_ctx *const sctx, MatchesProxy *const mp, const isrc_docid_t windowMin, const isrc_docid_t windowMax, const isrc_docid_t max);
void score_window_many(MatchesProxy *const mp, const isrc_docid_t windowBase, const isrc_docid_t windowMin, const isrc_docid_t windowMax, uint16_t leadsCnt);
public:
DocsSetSpanForDisjunctionsWithSpansAndCost(const uint16_t min, std::vector<DocsSetSpan *> &its);
~DocsSetSpanForDisjunctionsWithSpansAndCost() noexcept {
std::free(matching);
std::free(leads);
std::free(storage);
std::free(matchesTracker);
}
isrc_docid_t process(MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max) override final;
uint64_t cost() override final {
return cost_;
}
};
class DocsSetSpanForDisjunctionsWithThresholdAndCost final
: public DocsSetSpan {
private:
struct it_ctx final {
DocsSetIterators::Iterator *it;
uint64_t cost;
isrc_docid_t next{0};
inline void advance(const isrc_docid_t min) {
next = it->advance(min);
}
struct CompareByNext {
[[gnu::always_inline]] inline bool operator()(const it_ctx *const a, const it_ctx *const b) const noexcept {
return a->next < b->next;
}
};
struct CompareByCost {
[[gnu::always_inline]] inline bool operator()(const it_ctx *const a, const it_ctx *const b) const noexcept {
return a->cost < b->cost;
}
};
};
private:
std::pair<double, uint32_t> *const matchesTracker;
uint64_t *const matching;
it_ctx **const leads;
Switch::priority_queue<it_ctx *, it_ctx::CompareByNext> head;
Switch::priority_queue<it_ctx *, it_ctx::CompareByCost> tail;
const bool needScores;
const uint16_t matchThreshold;
it_ctx *const storage;
uint64_t cost_;
private:
it_ctx *advance(const isrc_docid_t);
it_ctx *score_window(it_ctx *const sctx, MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max);
void score_window_single(it_ctx *const sctx, MatchesProxy *const mp, const isrc_docid_t windowMin, const isrc_docid_t windowMax, const isrc_docid_t max);
void score_window_many(MatchesProxy *const mp, const isrc_docid_t windowBase, const isrc_docid_t windowMin, const isrc_docid_t windowMax, uint16_t leadsCnt);
public:
DocsSetSpanForDisjunctionsWithThresholdAndCost(const uint16_t min, std::vector<DocsSetIterators::Iterator *> &its, const bool needScores);
~DocsSetSpanForDisjunctionsWithThresholdAndCost() noexcept {
std::free(matching);
std::free(leads);
std::free(storage);
std::free(matchesTracker);
}
isrc_docid_t process(MatchesProxy *const mp, const isrc_docid_t min, const isrc_docid_t max) override final;
uint64_t cost() override final {
return cost_;
}
};
} // namespace Trinity