-
Notifications
You must be signed in to change notification settings - Fork 70
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Address inconsistent scoring in hybrid query results #998
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
*/ | ||
package org.opensearch.neuralsearch.query; | ||
|
||
import com.google.common.annotations.VisibleForTesting; | ||
import lombok.Getter; | ||
import lombok.extern.log4j.Log4j2; | ||
import org.apache.lucene.search.DisiPriorityQueue; | ||
|
@@ -30,7 +31,7 @@ | |
* corresponds to order of sub-queries in an input Hybrid query. | ||
*/ | ||
@Log4j2 | ||
public final class HybridQueryScorer extends Scorer { | ||
public class HybridQueryScorer extends Scorer { | ||
|
||
// score for each of sub-query in this hybrid query | ||
@Getter | ||
|
@@ -100,7 +101,8 @@ public float score() throws IOException { | |
return score(getSubMatches()); | ||
} | ||
|
||
private float score(DisiWrapper topList) throws IOException { | ||
@VisibleForTesting | ||
float score(DisiWrapper topList) throws IOException { | ||
float totalScore = 0.0f; | ||
for (DisiWrapper disiWrapper = topList; disiWrapper != null; disiWrapper = disiWrapper.next) { | ||
// check if this doc has match in the subQuery. If not, add score as 0.0 and continue | ||
|
@@ -187,7 +189,12 @@ public int docID() { | |
*/ | ||
public float[] hybridScores() throws IOException { | ||
float[] scores = new float[numSubqueries]; | ||
DisiWrapper topList = subScorersPQ.topList(); | ||
// retrieves sub-matches using DisjunctionDisiScorer's two-phase iteration process. | ||
// while the two-phase iterator can efficiently skip blocks of document IDs during matching, | ||
// the DisiWrapper (obtained from subScorersPQ.topList()) ensures sequential document ID iteration. | ||
// this is necessary for maintaining correct scoring order. | ||
DisiWrapper topList = getSubMatches(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is the main change for this PR. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you explain here what the difference is between `subScorersPQ.topList()` and `getSubMatches()`? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The critical difference is the case when we do have a two-phase iterator; this can happen when the underlying doc iterator uses approximation. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the explanation. Just wanted to avoid the case where an engineer changes this back to `subScorersPQ.topList()` in the future and introduces the same bug. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Makes sense, let me put that as a comment. |
||
|
||
for (HybridDisiWrapper disiWrapper = (HybridDisiWrapper) topList; disiWrapper != null; disiWrapper = | ||
(HybridDisiWrapper) disiWrapper.next) { | ||
// check if this doc has match in the subQuery. If not, add score as 0.0 and continue | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -108,12 +108,21 @@ public void collect(int doc) throws IOException { | |
} | ||
// Increment total hit count which represents unique doc found on the shard | ||
totalHits++; | ||
hitsThresholdChecker.incrementHitCount(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a gap the team noted while working on the fix; it is not directly related to the reported issue. |
||
for (int i = 0; i < subScoresByQuery.length; i++) { | ||
float score = subScoresByQuery[i]; | ||
// if score is 0.0 there is no hits for that sub-query | ||
if (score == 0) { | ||
continue; | ||
} | ||
if (hitsThresholdChecker.isThresholdReached() && totalHitsRelation == TotalHits.Relation.EQUAL_TO) { | ||
log.info( | ||
"hit count threshold reached: total hits={}, threshold={}, action=updating_results", | ||
totalHits, | ||
hitsThresholdChecker.getTotalHitsThreshold() | ||
); | ||
totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO; | ||
} | ||
collectedHitsPerSubQuery[i]++; | ||
PriorityQueue<ScoreDoc> pq = compoundScores[i]; | ||
ScoreDoc currentDoc = new ScoreDoc(doc + docBase, score); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to remove `final` to make this class "mockable".