Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend test for bm25 feature using flattening for array in streaming … #4344

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 95 additions & 51 deletions tests/search/bm25_feature/bm25_feature.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,55 @@ def annotation
end
end

# Builds URL-encoded YQL queries against a single field and keeps the idf
# value of every known term, so callers can compute the expected scores.
# Terms can optionally be annotated with document-frequency or significance
# information.
class QueryBuilder
  attr_reader :total_doc_count, :field, :avg_field_length,
              :document_frequencies, :ranking,
              :idfs, :annotations

  # testcase::             object providing assert, is_streaming, idf and puts
  # total_doc_count::      document count handed to testcase.idf and DocumentFrequency
  # field::                name of the field the generated query terms search in
  # avg_field_length::     stored as-is and exposed through the reader
  # document_frequencies:: term => document frequency pairs
  # ranking::              value sent as the 'ranking' request parameter
  # add_significance::     annotate terms with Significance (ignored when add_docfreq is set)
  # add_docfreq::          annotate terms with DocumentFrequency
  def initialize(testcase, total_doc_count, field, avg_field_length, document_frequencies, ranking, add_significance: false, add_docfreq: false)
    @testcase = testcase
    @total_doc_count = total_doc_count
    @field = field
    @avg_field_length = avg_field_length
    @document_frequencies = document_frequencies
    @ranking = ranking
    # Queries without any annotation are only accepted when the test case is not streaming.
    testcase.assert(add_docfreq || add_significance || !testcase.is_streaming)
    @idfs = @document_frequencies.each_with_object({}) do |(term, freq), by_term|
      by_term[term] = testcase.idf(freq, total_doc_count)
    end
    @annotations = build_annotations(add_significance, add_docfreq)
  end

  # Returns the URL-encoded form ('yql' and 'ranking' parameters) of a query
  # requiring every term in +terms+ to match in the field. Both the raw YQL
  # and the encoded form are printed through the test case.
  def make_query(terms)
    clauses = terms.map do |term|
      "#{field} contains (#{annotation_prefix(term)}\"#{term}\")"
    end
    yql = "select * from sources * where #{clauses.join(' and ')}"
    encoded = URI.encode_www_form([['yql', yql], ['ranking', ranking]])
    @testcase.puts "yql is #{yql}"
    @testcase.puts "encoded form is #{encoded}"
    encoded
  end

  private

  # Returns the term => annotation object map, or nil when no annotation was requested.
  def build_annotations(add_significance, add_docfreq)
    if add_docfreq
      document_frequencies.each_with_object({}) do |(term, freq), by_term|
        by_term[term] = DocumentFrequency.new(freq, total_doc_count)
      end
    elsif add_significance
      idfs.each_with_object({}) do |(term, term_idf), by_term|
        by_term[term] = Significance.new(term_idf)
      end
    end
  end

  # Returns the annotation text to put in front of +term+, or '' when the term has none.
  def annotation_prefix(term)
    return '' if annotations.nil? || !annotations.include?(term)

    annotations[term].annotation
  end
end

# Hands the owner name "geirst" to set_owner (defined outside this view).
def setup
  set_owner('geirst')
end
Expand All @@ -47,12 +96,15 @@ def test_bm25_feature

assert_bm25_scores
assert_bm25_scores(3, 100, 'avgfl100')
assert_bm25_array_scores(3, 8)

vespa.search["search"].first.trigger_flush
assert_bm25_scores
assert_bm25_array_scores(3, 8)

restart_proton("test", 3)
assert_bm25_scores
assert_bm25_array_scores(3, 8)
end

def test_enable_bm25_feature
Expand All @@ -63,7 +115,7 @@ def test_enable_bm25_feature
start
# Average field length for content = 4 ((7 + 3 + 2) / 3).
# Average field length for contenta = 8 ((14 + 6 + 4) / 3).
feed_and_wait_for_docs("test", 3, :file => @test_dir + "docs.json")
feed_and_wait_for_docs("test", 3, :file => selfdir + "docs.json")
assert_degraded_bm25_scores(3)
assert_degraded_bm25_array_scores(3)

Expand Down Expand Up @@ -131,85 +183,77 @@ def test_bm25_idf
assert_matching_doc_count_is_saturated_sum_for_fields(doc_counts: doc_counts)
end

def make_query(terms, ranking, annotations)
subqueries = []
for term in terms
annotation = ''
if !annotations.nil? && annotations.include?(term)
annotation = annotations[term].annotation
end
subqueries.push("content contains (#{annotation}\"#{term}\")")
end
joined_subqueries = subqueries.join(" and ")
form = [['yql', "select * from sources * where #{joined_subqueries}"],
['ranking', ranking]]
encoded_form = URI.encode_www_form(form)
puts "yql is #{form[0][1]}"
puts "encoded form is #{encoded_form}"
return encoded_form
# Document frequency per query term for the 'content' field; the values match
# the number of fed documents whose 'content' text contains the term.
def content_document_frequencies
  {
    'a' => 3,
    'b' => 2,
    'd' => 2
  }
end

# Alternative document frequencies for the 'content' terms that differ from
# content_document_frequencies. Presumably used to check that frequencies
# supplied through query annotations drive the score - confirm against callers.
def tweaked_content_document_frequencies
  {
    'a' => 2,
    'b' => 1,
    'd' => 3
  }
end

# Runs the bm25 score checks through assert_bm25_scores_helper once per way of
# supplying term statistics: without annotations (skipped when is_streaming),
# with significance annotations, and with document-frequency annotations; the
# annotated variants are also run with the tweak flag set.
# Defaults: 3 documents, average field length 4, rank profile 'default'.
# NOTE(review): this span comes from a diff view, so calls passing
# `tweak_frequencies:` and calls passing `tweak_document_frequencies:` appear
# side by side; presumably only one spelling belongs to any single revision of
# the file - confirm against the helper's keyword name before relying on it.
def assert_bm25_scores(total_doc_count = 3, avg_field_length = 4, ranking = 'default')
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking) unless is_streaming
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_significance: true)
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_significance: true, tweak_frequencies: true)
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_significance: true, tweak_document_frequencies: true)
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_docfreq: true)
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_docfreq: true, tweak_frequencies: true)
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_docfreq: true, tweak_document_frequencies: true)
end

def assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_significance: false, add_docfreq: false, tweak_frequencies: false)
def assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_significance: false, add_docfreq: false, tweak_document_frequencies: false)
assert(add_docfreq || add_significance || !is_streaming)
frequencies = { 'a' => 3,
'b' => 2,
'd' => 2 }
if tweak_frequencies
document_frequencies = content_document_frequencies
if tweak_document_frequencies
assert(add_docfreq || add_significance)
frequencies = { 'a' => 2,
'b' => 1,
'd' => 3 }
end
idfs = { 'a' => idf(frequencies['a'], total_doc_count),
'b' => idf(frequencies['b'], total_doc_count),
'd' => idf(frequencies['d'], total_doc_count) };
annotations = nil
if add_docfreq
annotations = { 'a' => DocumentFrequency.new(frequencies['a'], total_doc_count),
'b' => DocumentFrequency.new(frequencies['b'], total_doc_count),
'd' => DocumentFrequency.new(frequencies['d'], total_doc_count) }
elsif add_significance
annotations = { 'a' => Significance.new(idfs['a']),
'b' => Significance.new(idfs['b']),
'd' => Significance.new(idfs['d']) }
document_frequencies = tweaked_content_document_frequencies
end
assert_scores_for_query(make_query(['a'], ranking, annotations),
query_builder = QueryBuilder.new(self, total_doc_count, 'content', avg_field_length, document_frequencies, ranking, add_significance: add_significance, add_docfreq: add_docfreq)
idfs = query_builder.idfs
assert_scores_for_query(query_builder.make_query(['a']),
[score(2, 3, idfs['a'], avg_field_length),
score(3, 7, idfs['a'], avg_field_length),
score(1, 2, idfs['a'], avg_field_length)],
'content')

assert_scores_for_query(make_query(['b'], ranking, annotations),
assert_scores_for_query(query_builder.make_query(['b']),
[score(1, 3, idfs['b'], avg_field_length),
score(1, 7, idfs['b'], avg_field_length)],
'content')

assert_scores_for_query(make_query(['a','d'], ranking, annotations),
assert_scores_for_query(query_builder.make_query(['a','d']),
[score(1, 2, idfs['a'], avg_field_length) +
score(1, 2, idfs['d'], avg_field_length),
score(3, 7, idfs['a'], avg_field_length) +
score(1, 7, idfs['d'], avg_field_length)],
'content')
end

def assert_bm25_array_scores(total_doc_count, avg_field_length)
assert_scores_for_query("contenta:a&type=all", [score(2, 6, idf(3, total_doc_count), avg_field_length),
score(3, 14, idf(3, total_doc_count), avg_field_length),
score(1, 4, idf(3, total_doc_count), avg_field_length)], 'contenta')
# Document frequency per query term for the array field 'contenta'; the values
# match the number of fed documents with the term in at least one array element.
def contenta_document_frequencies
  {
    'a' => 3,
    'b' => 2,
    'd' => 2
  }
end

assert_scores_for_query("contenta:b&type=all", [score(1, 6, idf(2, total_doc_count), avg_field_length),
score(1, 14, idf(2, total_doc_count), avg_field_length)], 'contenta')
# Checks the bm25 scores for the array field: first without annotations
# (skipped when is_streaming), then with document-frequency annotations.
def assert_bm25_array_scores(total_doc_count, avg_field_length)
  stats = [total_doc_count, avg_field_length]
  unless is_streaming
    assert_bm25_array_scores_helper(*stats)
  end
  assert_bm25_array_scores_helper(*stats, add_docfreq: true)
end

assert_scores_for_query("contenta:a+contenta:d&type=all", [score(1, 4, idf(3, total_doc_count), avg_field_length) + score(1, 4, idf(2, total_doc_count), avg_field_length),
score(3, 14, idf(3, total_doc_count), avg_field_length) + score(1, 14, idf(2, total_doc_count), avg_field_length)], 'contenta')
# Verifies the bm25 scores of the 'contenta' field under the 'default' rank
# profile for the queries 'a', 'b' and 'a and d'. Set add_docfreq to annotate
# the query terms with document frequencies.
def assert_bm25_array_scores_helper(total_doc_count, avg_field_length, add_docfreq: false)
  builder = QueryBuilder.new(self, total_doc_count, 'contenta', avg_field_length,
                             contenta_document_frequencies, 'default', add_docfreq: add_docfreq)
  term_idfs = builder.idfs
  # The first two arguments are presumably the term's occurrence count and the
  # document's total field length (all array elements) - confirm against score.
  bm25 = lambda do |occurrences, field_length, term|
    score(occurrences, field_length, term_idfs[term], avg_field_length)
  end

  query = builder.make_query(['a'])
  expected = [bm25.call(2, 6, 'a'),
              bm25.call(3, 14, 'a'),
              bm25.call(1, 4, 'a')]
  assert_scores_for_query(query, expected, 'contenta')

  query = builder.make_query(['b'])
  expected = [bm25.call(1, 6, 'b'),
              bm25.call(1, 14, 'b')]
  assert_scores_for_query(query, expected, 'contenta')

  query = builder.make_query(['a', 'd'])
  expected = [bm25.call(1, 4, 'a') + bm25.call(1, 4, 'd'),
              bm25.call(3, 14, 'a') + bm25.call(1, 14, 'd')]
  assert_scores_for_query(query, expected, 'contenta')
end

def assert_degraded_bm25_scores(total_doc_count)
Expand Down
6 changes: 3 additions & 3 deletions tests/search/bm25_feature/docs.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[
{ "put": "id:test:test::0", "fields": { "content": "a a a b c d e" } },
{ "put": "id:test:test::1", "fields": { "content": "a a b" } },
{ "put": "id:test:test::2", "fields": { "content": "a d" } }
{ "put": "id:test:test::0", "fields": { "content": "a a a b c d e", "contenta": ["a a a1 b c d e", "a a2 a2 b2 c2 d2 e2"] } },
{ "put": "id:test:test::1", "fields": { "content": "a a b", "contenta": ["a1 a1 b1", "a a b"] } },
{ "put": "id:test:test::2", "fields": { "content": "a d", "contenta": ["a d1", "a2 d"] } }
]
2 changes: 1 addition & 1 deletion tests/search/bm25_feature/regen/0/test.sd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Copyright Vespa.ai. All rights reserved.
search test {
schema test {
document test {
field content type string {
indexing: index | summary
Expand Down
2 changes: 1 addition & 1 deletion tests/search/bm25_feature/regen/1/test.sd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Copyright Vespa.ai. All rights reserved.
search test {
schema test {
document test {
field content type string {
indexing: index | summary
Expand Down
5 changes: 0 additions & 5 deletions tests/search/bm25_feature/regen/docs.json

This file was deleted.

9 changes: 8 additions & 1 deletion tests/search/bm25_feature/streaming/test.sd
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,31 @@ schema test {
indexing: index | summary
index: enable-bm25
}
field contenta type array<string> {
indexing: index | summary
index: enable-bm25
}
}
rank-profile avgfl100 {
first-phase {
expression {
bm25(content)
bm25(content) + bm25(contenta)
}
}
# Default average field length for streaming search is 100.0
summary-features {
bm25(content)
bm25(contenta)
}
match-features {
bm25(content)
bm25(contenta)
}
}
rank-profile default inherits avgfl100 {
rank-properties {
bm25(content).averageFieldLength: 4.0
bm25(contenta).averageFieldLength: 8.0
}
}
}
8 changes: 7 additions & 1 deletion tests/search/bm25_feature/test.sd
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,24 @@ schema test {
indexing: index | summary
index: enable-bm25
}
field contenta type array<string> {
indexing: index | summary
index: enable-bm25
}
}
rank-profile default {
first-phase {
expression {
bm25(content)
bm25(content) + bm25(contenta)
}
}
summary-features {
bm25(content)
bm25(contenta)
}
match-features {
bm25(content)
bm25(contenta)
}
}
rank-profile avgfl100 inherits default {
Expand Down