-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathsimilarity.h
257 lines (206 loc) · 14.1 KB
/
similarity.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#pragma once
#include "common.h"
#include "index_source.h"
// Whatever's here is specific to the "accumulated score scheme" execution mode, where
// we just aggregate a similarity score for each iterator on the "current" document, similar to what Lucene's doing.
namespace Trinity {
namespace Similarity {
struct IndexSourcesCollectionTermsScorer;
// Base type for per-term/per-phrase scorer state.
// Subclass this when a scorer needs to associate more than a plain double's worth of
// precomputed state with each term or phrase; this is somewhat expensive, but not very,
// and it allows encapsulation/representation of arbitrarily complex types.
// The virtual destructor makes deletion through a ScorerWeight pointer safe.
struct ScorerWeight {
        virtual ~ScorerWeight() {
        }
};
// Responsible for providing scores for a single IndexSource.
// You may want to compute some IndexSource specific state in the constructor.
// This is created by IndexSourcesCollectionTermsScorer::new_source_scorer() for each IndexSource involved
// in a query execution.
struct IndexSourceTermsScorer {
        IndexSource *const                       src;              // the index source this scorer scores documents of
        IndexSourcesCollectionTermsScorer *const collectionScorer; // the collection-wide scorer that created this

        // FIX: the initializer list now matches declaration order (src, then collectionScorer).
        // Members are always initialized in declaration order regardless of the list's order,
        // so the previous ordering triggered -Wreorder and was misleading.
        IndexSourceTermsScorer(IndexSourcesCollectionTermsScorer *const r, IndexSource *const s)
            : src{s}, collectionScorer{r} {
        }

        virtual ~IndexSourceTermsScorer() {
        }

        // For each term and phrase, this will be invoked, passed a single term, or 1+ terms (for phrases).
        // Returns scorer-specific precomputed state (may be nullptr), later handed back to score().
        // The caller owns the returned pointer.
        virtual ScorerWeight *new_scorer_weight(const str8_t *const terms, const uint16_t cnt) {
                return nullptr;
        }

        // Scores a single document; freq is the number of matches in the current document of
        // either a single term or a phrase
        virtual float score(const isrc_docid_t id, const uint16_t freq, const ScorerWeight *) = 0;
};
// Collection-wide scorer, one per execution session.
// It owns whatever cross-source state the scoring model needs, and hands out a
// per-IndexSource scorer (via new_source_scorer()) for every source in the session.
struct IndexSourcesCollectionTermsScorer {
        virtual ~IndexSourcesCollectionTermsScorer() = default;

        // Override this if you want to, for example, aggregate all field_statistics of all
        // index sources involved in the execution session collection and store that for later access
        virtual void reset(const IndexSourcesCollection *) {
        }

        // Factory for the per-source scorer of a single IndexSource involved in the execution.
        virtual IndexSourceTermsScorer *new_source_scorer(IndexSource *) = 0;
};
// A trivial scorer that simply scores based on the matches
struct IndexSourcesCollectionTrivialScorer
    : public IndexSourcesCollectionTermsScorer {
        // Per-source scorer: a document's score is just its match count.
        struct Scorer final
            : public IndexSourceTermsScorer {
                Scorer(IndexSourcesCollectionTermsScorer *const owner, IndexSource *const source)
                    : IndexSourceTermsScorer(owner, source) {
                }

                // Score == freq; nothing else is taken into account.
                float score(const isrc_docid_t, const uint16_t freq, const ScorerWeight *) override final {
                        return static_cast<float>(freq);
                }
        };

        IndexSourceTermsScorer *new_source_scorer(IndexSource *const source) override final {
                return new Scorer(this, source);
        }
};
// A TF-IDF scorer
struct IndexSourcesCollectionTFIDFScorer
    : public IndexSourcesCollectionTermsScorer {
        // FIX: both members are now initialized so that use before reset() is at least
        // well-defined (previously they held indeterminate values until reset() ran).
        IndexSource::field_statistics dfsAccum{};          // sums of per-source default-field statistics; populated in reset()
        const IndexSourcesCollection *collection{nullptr}; // set in reset()

        struct Scorer final
            : public IndexSourceTermsScorer {
                const IndexSource::field_statistics fs; // this source's default-field statistics

                // docFreq: count of documents that contain the term
                // docsCnt: total documents in a collection
                // The +1 terms keep the log argument finite/positive for docFreq == 0.
                static inline double idf(const uint32_t docFreq, const uint64_t docsCnt) {
                        return std::log((docsCnt + 1) / (double)(docFreq + 1)) + 1.0;
                }

                // Computes a score factor, based on a term or phrase's frequency in a document.
                // Terms and phrases repeated in a document indicate the topic of the document, so impls. of
                // this method usually return large values when freq is large, and smaller values when freq is small
                static inline float tf(const float freq) {
                        // FIX: std::sqrt selects the float overload; the unqualified C sqrt()
                        // round-trips through double for no benefit
                        return std::sqrt(freq);
                }

                Scorer(IndexSourcesCollectionTermsScorer *r, IndexSource *src)
                    : IndexSourceTermsScorer(r, src), fs(src->default_field_stats()) {
                }

                // Per-term/phrase state: the summed idf weight.
                struct ScorerWeight
                    : public Similarity::ScorerWeight {
                        const double v;

                        ScorerWeight(const double value)
                            : v{value} {
                        }
                };

                Similarity::ScorerWeight *new_scorer_weight(const str8_t *const terms, const uint16_t cnt) override final {
                        const auto cs         = static_cast<IndexSourcesCollectionTFIDFScorer *>(collectionScorer);
                        const auto collection = cs->collection;
                        // aggregate sums across all index sources, pre-computed in reset()
                        const auto &stats = cs->dfsAccum;
                        const auto  documentsCnt{stats.docsCnt};
                        double      weight{0};

                        // We need to aggregate document frequency for each term, across all sources
                        // XXX: we should probably cache this/compute this once by delegating it to
                        // collectionScorer which should do it for us
                        for (uint32_t i{0}; i != cnt; ++i) {
                                const auto term = terms[i];
                                uint64_t   df{0};

                                for (const auto src : collection->sources)
                                        df += src->resolve_term_ctx(term).documents;
                                weight += idf(df, documentsCnt);
                        }
                        return new ScorerWeight(weight);
                }

                // id:   document being scored (unused here; reserved for per-document normalization)
                // freq: how many matches of the term/phrase in that document
                // sw:   the ScorerWeight built by new_scorer_weight() (the summed idf)
                inline float score(const isrc_docid_t id, const uint16_t freq, const Similarity::ScorerWeight *sw) override final {
                        const auto v = tf(freq) * static_cast<const ScorerWeight *>(sw)->v; // tf-idf
                        // TODO: if we had normalizations, we 'd instead return v * decodeNormValue(id) or something
                        return v;
                }
        };

        // currently, no support for multiple fields
        // so a single reset() will do
        void reset(const IndexSourcesCollection *const c) override final {
                collection = c;
                memset(&dfsAccum, 0, sizeof(dfsAccum));
                for (const auto it : c->sources) {
                        const auto s = it->default_field_stats();

                        dfsAccum.sumTermHits += s.sumTermHits;
                        dfsAccum.totalTerms += s.totalTerms;
                        dfsAccum.sumTermsDocs += s.sumTermsDocs;
                        dfsAccum.docsCnt += s.docsCnt;
                }
        }

        IndexSourceTermsScorer *new_source_scorer(IndexSource *s) override final {
                return new Scorer(this, s);
        }
};
// An Okapi BM25 scorer
struct IndexSourcesCollectionBM25Scorer
    : public IndexSourcesCollectionTermsScorer {
        // FIX: both members are now initialized so that use before reset() is at least
        // well-defined (previously they held indeterminate values until reset() ran).
        IndexSource::field_statistics dfsAccum{};          // sums of per-source default-field statistics; populated in reset()
        const IndexSourcesCollection *collection{nullptr}; // set in reset()
        // controls non-linear term frequency normalization (saturation)
        static constexpr float k1{1.2};
        // controls degree document length normalizes tf values
        static constexpr float b{0.75};

        struct Scorer final
            : public IndexSourceTermsScorer {
                static float normalizationTable[256]; // will be initialized elsewhere
                static bool  initializer;

                // docFreq: count of documents that contain the term
                // docsCnt: total documents in a collection
                // The +0.5 terms keep the ratio finite for docFreq == 0
                static inline double idf(const uint32_t docFreq, const uint64_t docsCnt) {
                        return std::log(1 + (docsCnt - docFreq + 0.5f) / (docFreq + 0.5f));
                }

                static inline float tf(const float freq) {
                        // FIX: std::sqrt selects the float overload; the unqualified C sqrt()
                        // round-trips through double for no benefit
                        return std::sqrt(freq);
                }

                Scorer(IndexSourcesCollectionTermsScorer *r, IndexSource *src)
                    : IndexSourceTermsScorer(r, src) {
                }

                // Per-term/phrase state for BM25.
                struct ScorerWeight final
                    : public Similarity::ScorerWeight {
                        const double   idf;           // summed idf across the term(s) of the term/phrase
                        const uint32_t avgDocTermFrq; // average per-document term frequency across the collection
                        float          cache[256];    // precomputed length-normalization factors

                        ScorerWeight(const double i, const uint32_t a)
                            : idf{i}, avgDocTermFrq{a} {
                        }
                };

                ScorerWeight *new_scorer_weight(const str8_t *const terms, const uint16_t cnt) override final {
                        // BUGFIX: this previously cast collectionScorer to IndexSourcesCollectionTFIDFScorer,
                        // but the scorer that creates these Scorer instances is an
                        // IndexSourcesCollectionBM25Scorer; accessing dfsAccum/collection through the
                        // wrong type is undefined behavior (it only "worked" by layout coincidence).
                        const auto cs           = static_cast<IndexSourcesCollectionBM25Scorer *>(collectionScorer);
                        const auto collection   = cs->collection;
                        const auto &stats       = cs->dfsAccum;
                        const auto documentsCnt{stats.docsCnt};
                        double     idf_{0};

                        // Aggregate document frequency for each term across all sources, then sum the idfs
                        for (uint32_t i{0}; i != cnt; ++i) {
                                const auto term = terms[i];
                                uint64_t   df{0};

                                for (const auto src : collection->sources)
                                        df += src->resolve_term_ctx(term).documents;
                                idf_ += idf(df, documentsCnt);
                        }

                        // FIX: guard against an empty collection (docsCnt == 0 previously divided by
                        // zero — UB) and against a zero average (which previously made every cache[]
                        // entry a division by zero); fall back to 1 in both degenerate cases.
                        uint64_t avgDocTermFrq{1};
                        if (stats.docsCnt) {
                                if (const auto v = stats.sumTermsDocs / stats.docsCnt)
                                        avgDocTermFrq = v;
                        }

                        auto w = std::make_unique<ScorerWeight>(idf_, avgDocTermFrq);

                        for (uint32_t i{0}; i != 256; ++i)
                                w->cache[i] = k1 * ((1 - b) + b * double(normalizationTable[i] / avgDocTermFrq));
                        return w.release();
                }

                // id:     document being scored (unused until per-document norms exist)
                // freq:   how many matches of the term/phrase in that document
                // weight: the ScorerWeight built by new_scorer_weight()
                inline float score(const isrc_docid_t id, const uint16_t freq, const Similarity::ScorerWeight *weight) override final {
                        // no per-document norms yet; otherwise we'd use cache[norms.get(doc)]
                        const auto norm{k1};
                        const auto w    = static_cast<const ScorerWeight *>(weight);
                        const auto idf  = w->idf;

                        return idf * float(freq) / double(freq + norm);
                }
        };

        void reset(const IndexSourcesCollection *const c) override final {
                collection = c;
                memset(&dfsAccum, 0, sizeof(dfsAccum));
                for (const auto it : c->sources) {
                        const auto s = it->default_field_stats();

                        dfsAccum.sumTermHits += s.sumTermHits;
                        dfsAccum.totalTerms += s.totalTerms;
                        dfsAccum.sumTermsDocs += s.sumTermsDocs;
                        dfsAccum.docsCnt += s.docsCnt;
                }
        }

        IndexSourceTermsScorer *new_source_scorer(IndexSource *s) override final {
                return new Scorer(this, s);
        }
};
} // namespace Similarity
} // namespace Trinity