diff --git a/backend/danswer/configs/model_configs.py b/backend/danswer/configs/model_configs.py
index 89b500da985..59a1961ce0e 100644
--- a/backend/danswer/configs/model_configs.py
+++ b/backend/danswer/configs/model_configs.py
@@ -21,7 +21,9 @@
 DOC_EMBEDDING_DIM = 384
 # Model should be chosen with 512 context size, ideally don't change this
 DOC_EMBEDDING_CONTEXT_SIZE = 512
-NORMALIZE_EMBEDDINGS = (os.environ.get("SKIP_RERANKING") or "False").lower() == "true"
+NORMALIZE_EMBEDDINGS = (
+    os.environ.get("NORMALIZE_EMBEDDINGS") or "False"
+).lower() == "true"
 # These are only used if reranking is turned off, to normalize the direct retrieval scores for display
 SIM_SCORE_RANGE_LOW = float(os.environ.get("SIM_SCORE_RANGE_LOW") or 0.0)
 SIM_SCORE_RANGE_HIGH = float(os.environ.get("SIM_SCORE_RANGE_HIGH") or 1.0)
@@ -47,10 +49,8 @@
 CROSS_ENCODER_RANGE_MIN = -12
 CROSS_EMBED_CONTEXT_SIZE = 512
 
-
-# Better to keep it loose, surfacing more results better than missing results
-# Currently unused by Vespa
-SEARCH_DISTANCE_CUTOFF = 0.1  # Cosine similarity (currently), range of -1 to 1 with -1 being completely opposite
+# Unused currently, can't be used with the current default encoder model due to its output range
+SEARCH_DISTANCE_CUTOFF = 0
 
 # Intent model max context size
 QUERY_MAX_CONTEXT_SIZE = 256
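Note on the model_configs.py hunks above: NORMALIZE_EMBEDDINGS was previously read from the unrelated SKIP_RERANKING environment variable, so the flag could never be toggled on its own. The corrected pattern treats an unset or empty variable as false and only the literal string "true" (case-insensitive) as true. A minimal sketch of that pattern; env_bool is a hypothetical helper name, not code from this diff:

    import os

    def env_bool(name: str, default: bool = False) -> bool:
        # Hypothetical helper mirroring the pattern in model_configs.py:
        # unset/empty variables fall back to the default, and only the
        # literal string "true" (case-insensitive) enables the flag.
        return (os.environ.get(name) or str(default)).lower() == "true"

    NORMALIZE_EMBEDDINGS = env_bool("NORMALIZE_EMBEDDINGS")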
diff --git a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
index aed8184dc95..76986a33ebc 100644
--- a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
@@ -106,7 +106,7 @@ schema danswer_chunk {
        }
 
        function inline document_age() {
-           # Time in years (3 Months if no age found)
+           # Time in years (91.3 days ~= 3 Months ~= 1 fiscal quarter if no age found)
            expression: max(if(isNan(attribute(doc_updated_at)) == 1, 7890000, now() - attribute(doc_updated_at)) / 31536000, 0)
        }
 
@@ -122,6 +122,8 @@
        first-phase {
            expression: bm25(content) * document_boost * recency_bias
        }
+
+       match-features: recency_bias document_boost bm25(content)
    }
 
    rank-profile semantic_search inherits default, default_rank {
@@ -135,7 +137,7 @@
            expression: closeness(field, embeddings)
        }
 
-       match-features: recency_bias closest(embeddings)
+       match-features: recency_bias document_boost closest(embeddings)
    }
 
    rank-profile hybrid_search inherits default, default_rank {
@@ -148,11 +150,12 @@
        }
 
        global-phase {
-           expression: (normalize_linear(closeness(field, embeddings)) + normalize_linear(bm25(content))) * document_boost * recency_bias
+           expression: (normalize_linear(closeness(field, embeddings)) + normalize_linear(bm25(content))) / 2 * document_boost * recency_bias
            rerank-count: 1000
        }
 
-       match-features: recency_bias closest(embeddings)
+       # Cannot pass normalize_linear features in match-features
+       match-features: recency_bias document_boost closest(embeddings)
    }
 
    # used when searching from the admin UI for a specific doc to hide / boost
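Note on the schema hunks above: the hybrid_search global-phase now normalizes both signals before mixing, so the unbounded bm25(content) score cannot swamp the bounded embedding closeness, and the added division by 2 keeps the blended score in [0, 1] before the document_boost and recency_bias multipliers apply. Roughly, in Python, assuming Vespa's normalize_linear is a min-max scaling across the rerank window (the names below are illustrative, not the Vespa API):

    def normalize_linear(scores: list[float]) -> list[float]:
        # Min-max scale scores within the rerank window (1000 hits above) into [0, 1].
        lo, hi = min(scores), max(scores)
        if hi == lo:
            return [0.0] * len(scores)
        return [(s - lo) / (hi - lo) for s in scores]

    def hybrid_score(
        norm_closeness: float,  # normalize_linear(closeness(field, embeddings))
        norm_bm25: float,       # normalize_linear(bm25(content))
        document_boost: float,
        recency_bias: float,
    ) -> float:
        # Mirrors the global-phase expression in the hunk above.
        return (norm_closeness + norm_bm25) / 2 * document_boost * recency_bias

The new match-features lines expose document_boost alongside recency_bias for debugging; the normalized scores themselves cannot be surfaced there, presumably because normalize_linear is computed in the global phase, after matching.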
diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py
index 6f89d86fc0b..80d5737352d 100644
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -310,13 +310,15 @@ def _build_or_filters(key: str, vals: list[str] | None) -> str:
 
 
 def _build_time_filter(
     cutoff: datetime | None,
-    untimed_doc_cutoff: timedelta = timedelta(days=62),  # Slightly over 2 Months
+    # Slightly over 3 Months, approximately 1 fiscal quarter
+    untimed_doc_cutoff: timedelta = timedelta(days=92),
 ) -> str:
     if not cutoff:
         return ""
 
     # For Documents that don't have an updated at, filter them out for queries asking for
-    # very recent documents (2 months) default
+    # very recent documents (3 months) default. Documents that don't have an updated at
+    # time are assigned 3 months for time decay value
     include_untimed = datetime.now(timezone.utc) - untimed_doc_cutoff > cutoff
     cutoff_secs = int(cutoff.timestamp())
@@ -340,10 +342,6 @@ def _build_time_filter(
     return filter_str
 
 
-def _build_vespa_limit(num_to_retrieve: int, offset: int = 0) -> str:
-    return f" limit {num_to_retrieve} offset {offset}"
-
-
 def _process_dynamic_summary(
     dynamic_summary: str, max_summary_length: int = 400
 ) -> list[str]:
@@ -605,7 +603,6 @@ def keyword_retrieval(
             # not working as desired
             + '({grammar: "weakAnd"}userInput(@query) '
             + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
-            + _build_vespa_limit(num_to_retrieve)
         )
 
         final_query = query_processing(query) if edit_keyword_query else query
@@ -615,7 +612,7 @@
             "query": final_query,
             "input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
             "hits": num_to_retrieve,
-            "num_to_rerank": 10 * num_to_retrieve,
+            "offset": 0,
             "ranking.profile": "keyword_search",
         }
 
@@ -640,7 +637,6 @@ def semantic_retrieval(
             # needed for highlighting while the N-gram highlighting is broken /
             # not working as desired
             + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
-            + _build_vespa_limit(num_to_retrieve)
         )
 
         query_embedding = embed_query(query)
@@ -649,11 +645,13 @@
             " ".join(remove_stop_words(query)) if edit_keyword_query else query
         )
 
-        params = {
+        params: dict[str, str | int] = {
             "yql": yql,
-            "query": query_keywords,
+            "query": query_keywords,  # Needed for highlighting
             "input.query(query_embedding)": str(query_embedding),
             "input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
+            "hits": num_to_retrieve,
+            "offset": 0,
             "ranking.profile": "semantic_search",
         }
 
@@ -668,8 +666,35 @@ def hybrid_retrieval(
         distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
         edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
     ) -> list[InferenceChunk]:
-        # TODO introduce the real hybrid search
-        return self.semantic_retrieval(query, filters, favor_recent, num_to_retrieve)
+        decay_multiplier = FAVOR_RECENT_DECAY_MULTIPLIER if favor_recent else 1
+        vespa_where_clauses = _build_vespa_filters(filters)
+        # Needs to be at least as much as the value set in Vespa schema config
+        target_hits = max(10 * num_to_retrieve, 1000)
+        yql = (
+            VespaIndex.yql_base
+            + vespa_where_clauses
+            + f"(({{targetHits: {target_hits}}}nearestNeighbor(embeddings, query_embedding)) "
+            + 'or ({grammar: "weakAnd"}userInput(@query)) '
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
+        )
+
+        query_embedding = embed_query(query)
+
+        query_keywords = (
+            " ".join(remove_stop_words(query)) if edit_keyword_query else query
+        )
+
+        params: dict[str, str | int] = {
+            "yql": yql,
+            "query": query_keywords,
+            "input.query(query_embedding)": str(query_embedding),
+            "input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
+            "hits": num_to_retrieve,
+            "offset": 0,
+            "ranking.profile": "hybrid_search",
+        }
+
+        return _query_vespa(params)
 
     def admin_retrieval(
         self,
@@ -686,14 +711,13 @@
             # needed for highlighting while the N-gram highlighting is broken /
             # not working as desired
             + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
-            + _build_vespa_limit(num_to_retrieve)
        )
 
         params: dict[str, str | int] = {
             "yql": yql,
             "query": query,
             "hits": num_to_retrieve,
-            "num_to_rerank": 10 * num_to_retrieve,
+            "offset": 0,
             "ranking.profile": "admin_search",
         }
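Note on the index.py hunks above: pagination moves out of the YQL string and into Vespa's standard hits/offset request parameters (the removed _build_vespa_limit helper used to splice " limit N offset M" into the query), and the num_to_rerank parameter is replaced by an explicit offset of 0, with rerank depth instead set in the rank profiles via rerank-count. A sketch of paging through results with these parameters; the endpoint URL and helper name are assumptions for illustration, not code from this diff:

    import requests

    # Assumed local endpoint; Danswer builds its own URL from its Vespa config.
    SEARCH_ENDPOINT = "http://localhost:8081/search/"

    def fetch_page(yql: str, query: str, page: int, page_size: int) -> list[dict]:
        params: dict[str, str | int] = {
            "yql": yql,
            "query": query,
            "hits": page_size,           # results per page
            "offset": page * page_size,  # skip earlier pages
            "ranking.profile": "hybrid_search",
        }
        response = requests.get(SEARCH_ENDPOINT, params=params)
        response.raise_for_status()
        # Vespa nests hits under root.children; the key is absent when there are no hits.
        return response.json()["root"].get("children", [])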