onyx-dot-app · yuhongsun96 · Oct 30, 2023 · Oct 29, 2023 · Oct 30, 2023 · Oct 30, 2023
diff --git a/backend/danswer/configs/model_configs.py b/backend/danswer/configs/model_configs.py
@@ -21,7 +21,9 @@
 DOC_EMBEDDING_DIM = 384
 # Model should be chosen with 512 context size, ideally don't change this
 DOC_EMBEDDING_CONTEXT_SIZE = 512
-NORMALIZE_EMBEDDINGS = (os.environ.get("SKIP_RERANKING") or "False").lower() == "true"
+NORMALIZE_EMBEDDINGS = (
+    os.environ.get("NORMALIZE_EMBEDDINGS") or "False"
+).lower() == "true"
 # These are only used if reranking is turned off, to normalize the direct retrieval scores for display
 SIM_SCORE_RANGE_LOW = float(os.environ.get("SIM_SCORE_RANGE_LOW") or 0.0)
 SIM_SCORE_RANGE_HIGH = float(os.environ.get("SIM_SCORE_RANGE_HIGH") or 1.0)
@@ -47,10 +49,8 @@
 CROSS_ENCODER_RANGE_MIN = -12
 CROSS_EMBED_CONTEXT_SIZE = 512
 
-
-# Better to keep it loose, surfacing more results better than missing results
-# Currently unused by Vespa
-SEARCH_DISTANCE_CUTOFF = 0.1  # Cosine similarity (currently), range of -1 to 1 with -1 being completely opposite
+# Unused currently, can't be used with the current default encoder model due to its output range
+SEARCH_DISTANCE_CUTOFF = 0
 
 # Intent model max context size
 QUERY_MAX_CONTEXT_SIZE = 256

diff --git a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
@@ -106,7 +106,7 @@ schema danswer_chunk {
         }
 
         function inline document_age() {
-            # Time in years (3 Months if no age found)
+            # Document age in years; if no age is found, default to 7890000s (91.3 days ~= 3 months ~= 1 fiscal quarter)
             expression: max(if(isNan(attribute(doc_updated_at)) == 1, 7890000, now() - attribute(doc_updated_at)) / 31536000, 0)
         }
 
@@ -122,6 +122,8 @@ schema danswer_chunk {
         first-phase {
             expression: bm25(content) * document_boost * recency_bias
         }
+
+        match-features: recency_bias document_boost bm25(content)
     }
 
     rank-profile semantic_search inherits default, default_rank {
@@ -135,7 +137,7 @@ schema danswer_chunk {
             expression: closeness(field, embeddings)
         }
 
-        match-features: recency_bias closest(embeddings)
+        match-features: recency_bias document_boost closest(embeddings)
     }
 
     rank-profile hybrid_search inherits default, default_rank {
@@ -148,11 +150,12 @@ schema danswer_chunk {
         }
 
         global-phase {
-            expression: (normalize_linear(closeness(field, embeddings)) + normalize_linear(bm25(content))) * document_boost * recency_bias
+            expression: (normalize_linear(closeness(field, embeddings)) + normalize_linear(bm25(content))) / 2 * document_boost * recency_bias
             rerank-count: 1000
         }
 
-        match-features: recency_bias closest(embeddings)
+        # Cannot pass normalize_linear features in match-features
+        match-features: recency_bias document_boost closest(embeddings)
     }
 
     # used when searching from the admin UI for a specific doc to hide / boost

diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py
@@ -310,13 +310,15 @@ def _build_or_filters(key: str, vals: list[str] | None) -> str:
 
     def _build_time_filter(
         cutoff: datetime | None,
-        untimed_doc_cutoff: timedelta = timedelta(days=62),  # Slightly over 2 Months
+        # Slightly over 3 Months, approximately 1 fiscal quarter
+        untimed_doc_cutoff: timedelta = timedelta(days=92),
     ) -> str:
         if not cutoff:
             return ""
 
         # For Documents that don't have an updated at, filter them out for queries asking for
-        # very recent documents (2 months) default
+        # very recent documents (within 3 months by default). Documents without an updated-at
+        # time are treated as 3 months old when computing the time decay value
         include_untimed = datetime.now(timezone.utc) - untimed_doc_cutoff > cutoff
         cutoff_secs = int(cutoff.timestamp())
 
@@ -340,10 +342,6 @@ def _build_time_filter(
     return filter_str
 
 
-def _build_vespa_limit(num_to_retrieve: int, offset: int = 0) -> str:
-    return f" limit {num_to_retrieve} offset {offset}"
-
-
 def _process_dynamic_summary(
     dynamic_summary: str, max_summary_length: int = 400
 ) -> list[str]:
@@ -605,7 +603,6 @@ def keyword_retrieval(
             # not working as desired
             + '({grammar: "weakAnd"}userInput(@query) '
             + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
-            + _build_vespa_limit(num_to_retrieve)
         )
 
         final_query = query_processing(query) if edit_keyword_query else query
@@ -615,7 +612,7 @@ def keyword_retrieval(
             "query": final_query,
             "input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
             "hits": num_to_retrieve,
-            "num_to_rerank": 10 * num_to_retrieve,
+            "offset": 0,
             "ranking.profile": "keyword_search",
         }
 
@@ -640,7 +637,6 @@ def semantic_retrieval(
             # needed for highlighting while the N-gram highlighting is broken /
             # not working as desired
             + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
-            + _build_vespa_limit(num_to_retrieve)
         )
 
         query_embedding = embed_query(query)
@@ -649,11 +645,13 @@ def semantic_retrieval(
             " ".join(remove_stop_words(query)) if edit_keyword_query else query
         )
 
-        params = {
+        params: dict[str, str | int] = {
             "yql": yql,
-            "query": query_keywords,
+            "query": query_keywords,  # Needed for highlighting
             "input.query(query_embedding)": str(query_embedding),
             "input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
+            "hits": num_to_retrieve,
+            "offset": 0,
             "ranking.profile": "semantic_search",
         }
 
@@ -668,8 +666,35 @@ def hybrid_retrieval(
         distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
         edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
     ) -> list[InferenceChunk]:
-        # TODO introduce the real hybrid search
-        return self.semantic_retrieval(query, filters, favor_recent, num_to_retrieve)
+        decay_multiplier = FAVOR_RECENT_DECAY_MULTIPLIER if favor_recent else 1
+        vespa_where_clauses = _build_vespa_filters(filters)
+        # Needs to be at least as much as the value set in Vespa schema config
+        target_hits = max(10 * num_to_retrieve, 1000)
+        yql = (
+            VespaIndex.yql_base
+            + vespa_where_clauses
+            + f"(({{targetHits: {target_hits}}}nearestNeighbor(embeddings, query_embedding)) "
+            + 'or ({grammar: "weakAnd"}userInput(@query)) '
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
+        )
+
+        query_embedding = embed_query(query)
+
+        query_keywords = (
+            " ".join(remove_stop_words(query)) if edit_keyword_query else query
+        )
+
+        params: dict[str, str | int] = {
+            "yql": yql,
+            "query": query_keywords,
+            "input.query(query_embedding)": str(query_embedding),
+            "input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
+            "hits": num_to_retrieve,
+            "offset": 0,
+            "ranking.profile": "hybrid_search",
+        }
+
+        return _query_vespa(params)
 
     def admin_retrieval(
         self,
@@ -686,14 +711,13 @@ def admin_retrieval(
             # needed for highlighting while the N-gram highlighting is broken /
             # not working as desired
             + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
-            + _build_vespa_limit(num_to_retrieve)
         )
 
         params: dict[str, str | int] = {
             "yql": yql,
             "query": query,
             "hits": num_to_retrieve,
-            "num_to_rerank": 10 * num_to_retrieve,
+            "offset": 0,
             "ranking.profile": "admin_search",
         }