opensearch-project · martin-gaievski · Nov 26, 2024 · Nov 25, 2024 · martin-gaievski · Nov 25, 2024
@@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ### Features
 ### Enhancements
 ### Bug Fixes
+-  Address inconsistent scoring in hybrid query results ([#998](https://github.com/opensearch-project/neural-search/pull/998))
 ### Infrastructure
 ### Documentation
 ### Maintenance

@@ -4,6 +4,7 @@
  */
 package org.opensearch.neuralsearch.query;
 
+import com.google.common.annotations.VisibleForTesting;
 import lombok.Getter;
 import lombok.extern.log4j.Log4j2;
 import org.apache.lucene.search.DisiPriorityQueue;
@@ -30,7 +31,7 @@
  * corresponds to order of sub-queries in an input Hybrid query.
  */
 @Log4j2
-public final class HybridQueryScorer extends Scorer {
+public class HybridQueryScorer extends Scorer {
 
     // score for each of sub-query in this hybrid query
     @Getter
@@ -100,7 +101,8 @@ public float score() throws IOException {
         return score(getSubMatches());
     }
 
-    private float score(DisiWrapper topList) throws IOException {
+    @VisibleForTesting
+    float score(DisiWrapper topList) throws IOException {
         float totalScore = 0.0f;
         for (DisiWrapper disiWrapper = topList; disiWrapper != null; disiWrapper = disiWrapper.next) {
             // check if this doc has match in the subQuery. If not, add score as 0.0 and continue
@@ -187,7 +189,12 @@ public int docID() {
      */
     public float[] hybridScores() throws IOException {
         float[] scores = new float[numSubqueries];
-        DisiWrapper topList = subScorersPQ.topList();
+        // retrieves sub-matches using DisjunctionDisiScorer's two-phase iteration process.
+        // while the two-phase iterator can efficiently skip blocks of document IDs during matching,
+        // the DisiWrapper (obtained from subScorersPQ.topList()) ensures sequential document ID iteration.
+        // this is necessary for maintaining correct scoring order.
+        DisiWrapper topList = getSubMatches();
+
         for (HybridDisiWrapper disiWrapper = (HybridDisiWrapper) topList; disiWrapper != null; disiWrapper =
             (HybridDisiWrapper) disiWrapper.next) {
             // check if this doc has match in the subQuery. If not, add score as 0.0 and continue

@@ -108,12 +108,21 @@ public void collect(int doc) throws IOException {
                 }
                 // Increment total hit count which represents unique doc found on the shard
                 totalHits++;
+                hitsThresholdChecker.incrementHitCount();
                 for (int i = 0; i < subScoresByQuery.length; i++) {
                     float score = subScoresByQuery[i];
                     // if score is 0.0 there is no hits for that sub-query
                     if (score == 0) {
                         continue;
                     }
+                    if (hitsThresholdChecker.isThresholdReached() && totalHitsRelation == TotalHits.Relation.EQUAL_TO) {
+                        log.info(
+                            "hit count threshold reached: total hits={}, threshold={}, action=updating_results",
+                            totalHits,
+                            hitsThresholdChecker.getTotalHitsThreshold()
+                        );
+                        totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
+                    }
                     collectedHitsPerSubQuery[i]++;
                     PriorityQueue<ScoreDoc> pq = compoundScores[i];
                     ScoreDoc currentDoc = new ScoreDoc(doc + docBase, score);

@@ -50,6 +50,7 @@
 import org.opensearch.core.xcontent.ToXContent;
 import org.opensearch.core.xcontent.XContentBuilder;
 import org.opensearch.core.xcontent.XContentParser;
+import org.opensearch.index.mapper.MappedFieldType;
 import org.opensearch.index.mapper.TextFieldMapper;
 import org.opensearch.index.query.MatchAllQueryBuilder;
 import org.opensearch.index.query.QueryBuilder;
@@ -756,6 +757,69 @@ public void testBoost_whenDefaultBoostSet_thenBuildSuccessfully() {
         assertNotNull(hybridQueryBuilder);
     }
 
+    @SneakyThrows
+    public void testBuild_whenValidParameters_thenCreateQuery() {
+        String queryText = "test query";
+        String modelId = "test_model";
+        String fieldName = "rank_features";
+
+        // Create mock context
+        QueryShardContext context = mock(QueryShardContext.class);
+        MappedFieldType fieldType = mock(MappedFieldType.class);
+        when(context.fieldMapper(fieldName)).thenReturn(fieldType);
+        when(fieldType.typeName()).thenReturn("rank_features");
+
+        // Create HybridQueryBuilder instance (no spy since it's final)
+        NeuralSparseQueryBuilder neuralSparseQueryBuilder = new NeuralSparseQueryBuilder();
+        neuralSparseQueryBuilder.fieldName(fieldName)
+            .queryText(queryText)
+            .modelId(modelId)
+            .queryTokensSupplier(() -> Map.of("token1", 1.0f, "token2", 0.5f));
+        HybridQueryBuilder builder = new HybridQueryBuilder().add(neuralSparseQueryBuilder);
+
+        // Build query
+        Query query = builder.toQuery(context);
+
+        // Verify
+        assertNotNull("Query should not be null", query);
+        assertTrue("Should be HybridQuery", query instanceof HybridQuery);
+    }
+
+    @SneakyThrows
+    public void testDoEquals_whenSameParameters_thenEqual() {
+        // Create neural queries
+        NeuralQueryBuilder neuralQueryBuilder1 = new NeuralQueryBuilder().queryText("test").modelId("test_model");
+
+        NeuralQueryBuilder neuralQueryBuilder2 = new NeuralQueryBuilder().queryText("test").modelId("test_model");
+
+        // Create neural sparse queries with queryTokensSupplier
+        NeuralSparseQueryBuilder neuralSparseQueryBuilder1 = new NeuralSparseQueryBuilder().fieldName("test_field")
+            .queryText("test")
+            .modelId("test_model")
+            .queryTokensSupplier(() -> Map.of("token1", 1.0f));
+
+        NeuralSparseQueryBuilder neuralSparseQueryBuilder2 = new NeuralSparseQueryBuilder().fieldName("test_field")
+            .queryText("test")
+            .modelId("test_model")
+            .queryTokensSupplier(() -> Map.of("token1", 1.0f));
+
+        // Create builders
+        HybridQueryBuilder builder1 = new HybridQueryBuilder().add(neuralQueryBuilder1).add(neuralSparseQueryBuilder1);
+
+        HybridQueryBuilder builder2 = new HybridQueryBuilder().add(neuralQueryBuilder2).add(neuralSparseQueryBuilder2);
+
+        // Verify
+        assertTrue("Builders should be equal", builder1.equals(builder2));
+        assertEquals("Hash codes should match", builder1.hashCode(), builder2.hashCode());
+    }
+
+    public void testValidate_whenInvalidParameters_thenThrowException() {
+        // Test null query builder
+        HybridQueryBuilder builderWithNull = new HybridQueryBuilder();
+        IllegalArgumentException nullException = assertThrows(IllegalArgumentException.class, () -> builderWithNull.add(null));
+        assertEquals("inner hybrid query clause cannot be null", nullException.getMessage());
+    }
+
     public void testVisit() {
         HybridQueryBuilder hybridQueryBuilder = new HybridQueryBuilder().add(new NeuralQueryBuilder()).add(new NeuralSparseQueryBuilder());
         List<QueryBuilder> visitedQueries = new ArrayList<>();