Skip to content

Commit

Permalink
Merge pull request #4344 from vespa-engine/toregge/extend-test-for-bm…
Browse files Browse the repository at this point in the history
…25-feature-using-flattening-for-array

Extend test for bm25 feature using flattening for array in streaming …
  • Loading branch information
geirst authored Feb 13, 2025
2 parents 8116226 + f77bc37 commit fe8bbce
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 63 deletions.
146 changes: 95 additions & 51 deletions tests/search/bm25_feature/bm25_feature.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,55 @@ def annotation
end
end

# Builds URL-encoded YQL queries against a single field and precomputes the
# idf value of every term that has a known document frequency.
#
# Query terms can optionally carry an annotation:
# * add_docfreq: true      - a DocumentFrequency annotation per known term.
# * add_significance: true - a Significance annotation (built from the term's
#                            idf) per known term.
# When both flags are set, add_docfreq wins.
class QueryBuilder
  attr_reader :total_doc_count, :field, :avg_field_length
  attr_reader :document_frequencies, :ranking
  attr_reader :idfs, :annotations

  # testcase::             object providing assert, is_streaming,
  #                        idf(freq, total_doc_count) and puts.
  # total_doc_count::      document count passed to testcase.idf and to
  #                        DocumentFrequency.
  # field::                name of the field every term is matched against.
  # avg_field_length::     stored and exposed through its reader; not used by
  #                        the builder itself.
  # document_frequencies:: hash mapping term => document frequency.
  # ranking::              value sent as the 'ranking' query parameter.
  #
  # Asserts (through testcase.assert) that at least one annotation kind is
  # requested whenever testcase.is_streaming is true.
  def initialize(testcase, total_doc_count, field, avg_field_length, document_frequencies, ranking, add_significance: false, add_docfreq: false)
    @testcase = testcase
    @total_doc_count = total_doc_count
    @field = field
    @avg_field_length = avg_field_length
    @document_frequencies = document_frequencies
    @ranking = ranking
    testcase.assert(add_docfreq || add_significance || !testcase.is_streaming)
    @idfs = @document_frequencies.each_with_object({}) do |(term, freq), result|
      result[term] = testcase.idf(freq, total_doc_count)
    end
    # Must run after @idfs is populated: Significance annotations are built
    # from the idf values.
    @annotations = build_annotations(add_significance, add_docfreq)
  end

  # Returns the URL-encoded query form ('yql' and 'ranking' parameters) that
  # requires every term in +terms+ to match the field. Terms without a
  # registered annotation are emitted unannotated. The YQL and the encoded
  # form are also logged through testcase.puts.
  def make_query(terms)
    subqueries = terms.map do |term|
      annotation = ''
      annotation = annotations[term].annotation if annotations && annotations.key?(term)
      "#{field} contains (#{annotation}\"#{term}\")"
    end
    yql = "select * from sources * where #{subqueries.join(' and ')}"
    encoded_form = URI.encode_www_form([['yql', yql], ['ranking', ranking]])
    @testcase.puts "yql is #{yql}"
    @testcase.puts "encoded form is #{encoded_form}"
    encoded_form
  end

  private

  # Returns a term => annotation hash for the requested annotation kind, or
  # nil when no annotation kind is requested.
  def build_annotations(add_significance, add_docfreq)
    if add_docfreq
      document_frequencies.each_with_object({}) do |(term, freq), result|
        result[term] = DocumentFrequency.new(freq, total_doc_count)
      end
    elsif add_significance
      idfs.each_with_object({}) do |(term, idf), result|
        result[term] = Significance.new(idf)
      end
    end
  end
end

# Registers "geirst" as the owner of this test via set_owner.
# NOTE(review): presumably invoked by the test framework before each test - confirm.
def setup
set_owner("geirst")
end
Expand All @@ -47,12 +96,15 @@ def test_bm25_feature

assert_bm25_scores
assert_bm25_scores(3, 100, 'avgfl100')
assert_bm25_array_scores(3, 8)

vespa.search["search"].first.trigger_flush
assert_bm25_scores
assert_bm25_array_scores(3, 8)

restart_proton("test", 3)
assert_bm25_scores
assert_bm25_array_scores(3, 8)
end

def test_enable_bm25_feature
Expand All @@ -63,7 +115,7 @@ def test_enable_bm25_feature
start
# Average field length for content = 4 ((7 + 3 + 2) / 3).
# Average field length for contenta = 8 ((14 + 6 + 4) / 3).
feed_and_wait_for_docs("test", 3, :file => @test_dir + "docs.json")
feed_and_wait_for_docs("test", 3, :file => selfdir + "docs.json")
assert_degraded_bm25_scores(3)
assert_degraded_bm25_array_scores(3)

Expand Down Expand Up @@ -131,85 +183,77 @@ def test_bm25_idf
assert_matching_doc_count_is_saturated_sum_for_fields(doc_counts: doc_counts)
end

def make_query(terms, ranking, annotations)
subqueries = []
for term in terms
annotation = ''
if !annotations.nil? && annotations.include?(term)
annotation = annotations[term].annotation
end
subqueries.push("content contains (#{annotation}\"#{term}\")")
end
joined_subqueries = subqueries.join(" and ")
form = [['yql', "select * from sources * where #{joined_subqueries}"],
['ranking', ranking]]
encoded_form = URI.encode_www_form(form)
puts "yql is #{form[0][1]}"
puts "encoded form is #{encoded_form}"
return encoded_form
# Term => document frequency table for the terms 'a', 'b' and 'd', used as
# the default frequencies when building queries against the 'content' field.
def content_document_frequencies
  {
    'a' => 3,
    'b' => 2,
    'd' => 2
  }
end

# Alternative term => document frequency table for the same terms as
# content_document_frequencies, with different values. It is selected by
# assert_bm25_scores_helper when tweak_document_frequencies is set.
def tweaked_content_document_frequencies
  {
    'a' => 2,
    'b' => 1,
    'd' => 3
  }
end

def assert_bm25_scores(total_doc_count = 3, avg_field_length = 4, ranking = 'default')
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking) unless is_streaming
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_significance: true)
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_significance: true, tweak_frequencies: true)
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_significance: true, tweak_document_frequencies: true)
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_docfreq: true)
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_docfreq: true, tweak_frequencies: true)
assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_docfreq: true, tweak_document_frequencies: true)
end

def assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_significance: false, add_docfreq: false, tweak_frequencies: false)
def assert_bm25_scores_helper(total_doc_count, avg_field_length, ranking, add_significance: false, add_docfreq: false, tweak_document_frequencies: false)
assert(add_docfreq || add_significance || !is_streaming)
frequencies = { 'a' => 3,
'b' => 2,
'd' => 2 }
if tweak_frequencies
document_frequencies = content_document_frequencies
if tweak_document_frequencies
assert(add_docfreq || add_significance)
frequencies = { 'a' => 2,
'b' => 1,
'd' => 3 }
end
idfs = { 'a' => idf(frequencies['a'], total_doc_count),
'b' => idf(frequencies['b'], total_doc_count),
'd' => idf(frequencies['d'], total_doc_count) };
annotations = nil
if add_docfreq
annotations = { 'a' => DocumentFrequency.new(frequencies['a'], total_doc_count),
'b' => DocumentFrequency.new(frequencies['b'], total_doc_count),
'd' => DocumentFrequency.new(frequencies['d'], total_doc_count) }
elsif add_significance
annotations = { 'a' => Significance.new(idfs['a']),
'b' => Significance.new(idfs['b']),
'd' => Significance.new(idfs['d']) }
document_frequencies = tweaked_content_document_frequencies
end
assert_scores_for_query(make_query(['a'], ranking, annotations),
query_builder = QueryBuilder.new(self, total_doc_count, 'content', avg_field_length, document_frequencies, ranking, add_significance: add_significance, add_docfreq: add_docfreq)
idfs = query_builder.idfs
assert_scores_for_query(query_builder.make_query(['a']),
[score(2, 3, idfs['a'], avg_field_length),
score(3, 7, idfs['a'], avg_field_length),
score(1, 2, idfs['a'], avg_field_length)],
'content')

assert_scores_for_query(make_query(['b'], ranking, annotations),
assert_scores_for_query(query_builder.make_query(['b']),
[score(1, 3, idfs['b'], avg_field_length),
score(1, 7, idfs['b'], avg_field_length)],
'content')

assert_scores_for_query(make_query(['a','d'], ranking, annotations),
assert_scores_for_query(query_builder.make_query(['a','d']),
[score(1, 2, idfs['a'], avg_field_length) +
score(1, 2, idfs['d'], avg_field_length),
score(3, 7, idfs['a'], avg_field_length) +
score(1, 7, idfs['d'], avg_field_length)],
'content')
end

def assert_bm25_array_scores(total_doc_count, avg_field_length)
assert_scores_for_query("contenta:a&type=all", [score(2, 6, idf(3, total_doc_count), avg_field_length),
score(3, 14, idf(3, total_doc_count), avg_field_length),
score(1, 4, idf(3, total_doc_count), avg_field_length)], 'contenta')
# Term => document frequency table for the terms 'a', 'b' and 'd', used when
# building queries against the 'contenta' array field.
def contenta_document_frequencies
  {
    'a' => 3,
    'b' => 2,
    'd' => 2
  }
end

assert_scores_for_query("contenta:b&type=all", [score(1, 6, idf(2, total_doc_count), avg_field_length),
score(1, 14, idf(2, total_doc_count), avg_field_length)], 'contenta')
# Runs the bm25 score checks for the 'contenta' array field.
#
# The variant without document-frequency annotations is skipped when
# is_streaming is true; the annotated variant always runs, and runs last.
def assert_bm25_array_scores(total_doc_count, avg_field_length)
  unless is_streaming
    assert_bm25_array_scores_helper(total_doc_count, avg_field_length, add_docfreq: false)
  end
  assert_bm25_array_scores_helper(total_doc_count, avg_field_length, add_docfreq: true)
end

assert_scores_for_query("contenta:a+contenta:d&type=all", [score(1, 4, idf(3, total_doc_count), avg_field_length) + score(1, 4, idf(2, total_doc_count), avg_field_length),
score(3, 14, idf(3, total_doc_count), avg_field_length) + score(1, 14, idf(2, total_doc_count), avg_field_length)], 'contenta')
# Checks bm25 scores of the 'contenta' field for three queries ('a', 'b', and
# 'a' and 'd' combined), using the 'default' rank profile and the idf values
# computed by QueryBuilder from contenta_document_frequencies.
#
# add_docfreq:: forwarded to QueryBuilder to annotate the query terms with
#               document frequencies.
def assert_bm25_array_scores_helper(total_doc_count, avg_field_length, add_docfreq: false)
  field_name = 'contenta'
  builder = QueryBuilder.new(self, total_doc_count, field_name, avg_field_length,
                             contenta_document_frequencies, 'default', add_docfreq: add_docfreq)
  idf_by_term = builder.idfs
  # NOTE(review): the first two score arguments look like per-document term
  # occurrences and field length - confirm against the score helper.
  term_score = lambda do |first_arg, second_arg, term|
    score(first_arg, second_arg, idf_by_term[term], avg_field_length)
  end

  # Each query is built before its expected scores are computed.
  query_a = builder.make_query(['a'])
  expected_a = [term_score.call(2, 6, 'a'),
                term_score.call(3, 14, 'a'),
                term_score.call(1, 4, 'a')]
  assert_scores_for_query(query_a, expected_a, field_name)

  query_b = builder.make_query(['b'])
  expected_b = [term_score.call(1, 6, 'b'),
                term_score.call(1, 14, 'b')]
  assert_scores_for_query(query_b, expected_b, field_name)

  query_a_d = builder.make_query(['a', 'd'])
  expected_a_d = [term_score.call(1, 4, 'a') + term_score.call(1, 4, 'd'),
                  term_score.call(3, 14, 'a') + term_score.call(1, 14, 'd')]
  assert_scores_for_query(query_a_d, expected_a_d, field_name)
end

def assert_degraded_bm25_scores(total_doc_count)
Expand Down
6 changes: 3 additions & 3 deletions tests/search/bm25_feature/docs.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[
{ "put": "id:test:test::0", "fields": { "content": "a a a b c d e" } },
{ "put": "id:test:test::1", "fields": { "content": "a a b" } },
{ "put": "id:test:test::2", "fields": { "content": "a d" } }
{ "put": "id:test:test::0", "fields": { "content": "a a a b c d e", "contenta": ["a a a1 b c d e", "a a2 a2 b2 c2 d2 e2"] } },
{ "put": "id:test:test::1", "fields": { "content": "a a b", "contenta": ["a1 a1 b1", "a a b"] } },
{ "put": "id:test:test::2", "fields": { "content": "a d", "contenta": ["a d1", "a2 d"] } }
]
2 changes: 1 addition & 1 deletion tests/search/bm25_feature/regen/0/test.sd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Copyright Vespa.ai. All rights reserved.
search test {
schema test {
document test {
field content type string {
indexing: index | summary
Expand Down
2 changes: 1 addition & 1 deletion tests/search/bm25_feature/regen/1/test.sd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Copyright Vespa.ai. All rights reserved.
search test {
schema test {
document test {
field content type string {
indexing: index | summary
Expand Down
5 changes: 0 additions & 5 deletions tests/search/bm25_feature/regen/docs.json

This file was deleted.

9 changes: 8 additions & 1 deletion tests/search/bm25_feature/streaming/test.sd
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,31 @@ schema test {
indexing: index | summary
index: enable-bm25
}
field contenta type array<string> {
indexing: index | summary
index: enable-bm25
}
}
rank-profile avgfl100 {
first-phase {
expression {
bm25(content)
bm25(content) + bm25(contenta)
}
}
# Default average field length for streaming search is 100.0
summary-features {
bm25(content)
bm25(contenta)
}
match-features {
bm25(content)
bm25(contenta)
}
}
rank-profile default inherits avgfl100 {
rank-properties {
bm25(content).averageFieldLength: 4.0
bm25(contenta).averageFieldLength: 8.0
}
}
}
8 changes: 7 additions & 1 deletion tests/search/bm25_feature/test.sd
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,24 @@ schema test {
indexing: index | summary
index: enable-bm25
}
field contenta type array<string> {
indexing: index | summary
index: enable-bm25
}
}
rank-profile default {
first-phase {
expression {
bm25(content)
bm25(content) + bm25(contenta)
}
}
summary-features {
bm25(content)
bm25(contenta)
}
match-features {
bm25(content)
bm25(contenta)
}
}
rank-profile avgfl100 inherits default {
Expand Down

0 comments on commit fe8bbce

Please sign in to comment.