Skip to content

Commit

Permalink
Merge pull request #4291 from vespa-engine/toregge/test-handling-of-missing-interleaved-features-in-bm25-feature
Browse files Browse the repository at this point in the history

Test handling of missing interleaved features in bm25 feature.
  • Loading branch information
geirst authored Nov 29, 2024
2 parents 61ca9f3 + 593ef89 commit de1ed05
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 20 deletions.
32 changes: 20 additions & 12 deletions tests/search/bm25_feature/bm25_feature.rb
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ def test_enable_bm25_feature
# Average field length for content = 4 ((7 + 3 + 2) / 3).
# Average field length for contenta = 8 ((14 + 6 + 4) / 3).
feed_and_wait_for_docs("test", 3, :file => @test_dir + "docs.json")
assert_no_bm25_scores
assert_no_bm25_array_scores
assert_degraded_bm25_scores(3)
assert_degraded_bm25_array_scores(3)

redeploy(SearchApp.new.sd("#{@test_dir}1/test.sd"))
60.times do |i|
Expand Down Expand Up @@ -208,24 +208,32 @@ def assert_bm25_array_scores(total_doc_count, avg_field_length)
assert_scores_for_query("contenta:b&type=all", [score(1, 6, idf(2, total_doc_count), avg_field_length),
score(1, 14, idf(2, total_doc_count), avg_field_length)], 'contenta')

assert_scores_for_query("content:a+content:d&type=all", [score(1, 4, idf(3, total_doc_count), avg_field_length) + score(1, 4, idf(2, total_doc_count), avg_field_length),
score(3, 14, idf(3, total_doc_count), avg_field_length) + score(1, 14, idf(2, total_doc_count), avg_field_length)], 'content')
assert_scores_for_query("contenta:a+contenta:d&type=all", [score(1, 4, idf(3, total_doc_count), avg_field_length) + score(1, 4, idf(2, total_doc_count), avg_field_length),
score(3, 14, idf(3, total_doc_count), avg_field_length) + score(1, 14, idf(2, total_doc_count), avg_field_length)], 'contenta')
end

def assert_no_bm25_scores
assert_scores_for_query("content:a&type=all", [0.0, 0.0, 0.0], 'content')
def assert_degraded_bm25_scores(total_doc_count)
assert_scores_for_query("content:a&type=all", [idf(3, total_doc_count),
idf(3, total_doc_count),
idf(3, total_doc_count)], 'content')

assert_scores_for_query("content:b&type=all", [0.0, 0.0], 'content')
assert_scores_for_query("content:b&type=all", [idf(2, total_doc_count),
idf(2, total_doc_count)], 'content')

assert_scores_for_query("content:a+content:d&type=all", [0.0, 0.0], 'content')
assert_scores_for_query("content:a+content:d&type=all", [idf(3, total_doc_count) + idf(2, total_doc_count),
idf(3, total_doc_count) + idf(2, total_doc_count)], 'content')
end

def assert_no_bm25_array_scores
assert_scores_for_query("contenta:a&type=all", [0.0, 0.0, 0.0], 'contenta')
def assert_degraded_bm25_array_scores(total_doc_count)
assert_scores_for_query("contenta:a&type=all", [idf(3, total_doc_count),
idf(3, total_doc_count),
idf(3, total_doc_count)], 'contenta')

assert_scores_for_query("contenta:b&type=all", [0.0, 0.0], 'contenta')
assert_scores_for_query("contenta:b&type=all", [idf(2, total_doc_count),
idf(2, total_doc_count)], 'contenta')

assert_scores_for_query("content:a+content:d&type=all", [0.0, 0.0], 'content')
assert_scores_for_query("contenta:a+contenta:d&type=all", [idf(3, total_doc_count) + idf(2, total_doc_count),
idf(3, total_doc_count) + idf(2, total_doc_count)], 'contenta')
end

def idf(matching_doc_count, total_doc_count = 3)
Expand Down
8 changes: 8 additions & 0 deletions tests/search/bm25_feature/regen/0/test.sd
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,13 @@ search test {
bm25(content) + bm25(contenta)
}
}
summary-features {
bm25(content)
bm25(contenta)
}
match-features {
bm25(content)
bm25(contenta)
}
}
}
8 changes: 4 additions & 4 deletions tests/search/featurestability/result4.txt
Original file line number Diff line number Diff line change
Expand Up @@ -193,18 +193,18 @@ attributeMatch(position_exp_zcurve).queryCompleteness#0.0
attributeMatch(position_exp_zcurve).significance#0.0
attributeMatch(position_exp_zcurve).totalWeight#0.0
attributeMatch(position_exp_zcurve).weight#0.0
bm25(string_arr)#0.0
bm25(string_arr)#0.28768207245178085
bm25(string_exp)#0.28768207245178085
bm25(string_wset)#0.0
bm25(url_arr)#0.0
bm25(string_wset)#0.28768207245178085
bm25(url_arr)#0.28768207245178085
bm25(url_arr.fragment)#0.0
bm25(url_arr.host)#0.0
bm25(url_arr.hostname)#0.0
bm25(url_arr.path)#0.0
bm25(url_arr.port)#0.0
bm25(url_arr.query)#0.0
bm25(url_arr.scheme)#0.0
bm25(url_exp)#0.0
bm25(url_exp)#0.28768207245178085
bm25(url_exp.fragment)#0.0
bm25(url_exp.host)#0.0
bm25(url_exp.hostname)#0.0
Expand Down
8 changes: 4 additions & 4 deletions tests/search/featurestability/result5.txt
Original file line number Diff line number Diff line change
Expand Up @@ -193,18 +193,18 @@ attributeMatch(position_exp_zcurve).queryCompleteness#0.0
attributeMatch(position_exp_zcurve).significance#0.0
attributeMatch(position_exp_zcurve).totalWeight#0.0
attributeMatch(position_exp_zcurve).weight#0.0
bm25(string_arr)#0.0
bm25(string_arr)#0.5753641449035617
bm25(string_exp)#0.28768207245178085
bm25(string_wset)#0.0
bm25(url_arr)#0.0
bm25(string_wset)#0.5753641449035617
bm25(url_arr)#0.5753641449035617
bm25(url_arr.fragment)#0.0
bm25(url_arr.host)#0.0
bm25(url_arr.hostname)#0.0
bm25(url_arr.path)#0.0
bm25(url_arr.port)#0.0
bm25(url_arr.query)#0.0
bm25(url_arr.scheme)#0.0
bm25(url_exp)#0.0
bm25(url_exp)#0.5753641449035617
bm25(url_exp.fragment)#0.0
bm25(url_exp.host)#0.0
bm25(url_exp.hostname)#0.0
Expand Down

0 comments on commit de1ed05

Please sign in to comment.