Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hybrid Search #653

Merged
merged 3 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions backend/danswer/configs/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
DOC_EMBEDDING_DIM = 384
# Model should be chosen with 512 context size, ideally don't change this
DOC_EMBEDDING_CONTEXT_SIZE = 512
NORMALIZE_EMBEDDINGS = (os.environ.get("SKIP_RERANKING") or "False").lower() == "true"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops...

NORMALIZE_EMBEDDINGS = (
os.environ.get("NORMALIZE_EMBEDDINGS") or "False"
).lower() == "true"
# These are only used if reranking is turned off, to normalize the direct retrieval scores for display
SIM_SCORE_RANGE_LOW = float(os.environ.get("SIM_SCORE_RANGE_LOW") or 0.0)
SIM_SCORE_RANGE_HIGH = float(os.environ.get("SIM_SCORE_RANGE_HIGH") or 1.0)
Expand All @@ -47,10 +49,8 @@
CROSS_ENCODER_RANGE_MIN = -12
CROSS_EMBED_CONTEXT_SIZE = 512


# Better to keep it loose: surfacing more results is better than missing results
# Currently unused by Vespa
SEARCH_DISTANCE_CUTOFF = 0.1 # Cosine similarity (currently), range of -1 to 1 with -1 being completely opposite
# Currently unused; it can't be used with the current default encoder model due to its output range
SEARCH_DISTANCE_CUTOFF = 0

# Intent model max context size
QUERY_MAX_CONTEXT_SIZE = 256
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ schema danswer_chunk {
}

function inline document_age() {
# Time in years (3 Months if no age found)
# Time in years (91.3 days ~= 3 Months ~= 1 fiscal quarter if no age found)
expression: max(if(isNan(attribute(doc_updated_at)) == 1, 7890000, now() - attribute(doc_updated_at)) / 31536000, 0)
}

Expand All @@ -122,6 +122,8 @@ schema danswer_chunk {
first-phase {
expression: bm25(content) * document_boost * recency_bias
}

match-features: recency_bias document_boost bm25(content)
}

rank-profile semantic_search inherits default, default_rank {
Expand All @@ -135,7 +137,7 @@ schema danswer_chunk {
expression: closeness(field, embeddings)
}

match-features: recency_bias closest(embeddings)
match-features: recency_bias document_boost closest(embeddings)
}

rank-profile hybrid_search inherits default, default_rank {
Expand All @@ -148,11 +150,12 @@ schema danswer_chunk {
}

global-phase {
expression: (normalize_linear(closeness(field, embeddings)) + normalize_linear(bm25(content))) * document_boost * recency_bias
expression: (normalize_linear(closeness(field, embeddings)) + normalize_linear(bm25(content))) / 2 * document_boost * recency_bias
rerank-count: 1000
}

match-features: recency_bias closest(embeddings)
# Cannot pass normalize_linear features in match-features
match-features: recency_bias document_boost closest(embeddings)
}

# used when searching from the admin UI for a specific doc to hide / boost
Expand Down
54 changes: 39 additions & 15 deletions backend/danswer/document_index/vespa/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,13 +310,15 @@ def _build_or_filters(key: str, vals: list[str] | None) -> str:

def _build_time_filter(
cutoff: datetime | None,
untimed_doc_cutoff: timedelta = timedelta(days=62), # Slightly over 2 Months
# Slightly over 3 Months, approximately 1 fiscal quarter
untimed_doc_cutoff: timedelta = timedelta(days=92),
) -> str:
if not cutoff:
return ""

# For Documents that don't have an updated at, filter them out for queries asking for
# very recent documents (2 months) default
# very recent documents (3 months by default). Documents that don't have an updated-at
# time are treated as 3 months old when computing the time decay value
include_untimed = datetime.now(timezone.utc) - untimed_doc_cutoff > cutoff
cutoff_secs = int(cutoff.timestamp())

Expand All @@ -340,10 +342,6 @@ def _build_time_filter(
return filter_str


def _build_vespa_limit(num_to_retrieve: int, offset: int = 0) -> str:
return f" limit {num_to_retrieve} offset {offset}"


def _process_dynamic_summary(
dynamic_summary: str, max_summary_length: int = 400
) -> list[str]:
Expand Down Expand Up @@ -605,7 +603,6 @@ def keyword_retrieval(
# not working as desired
+ '({grammar: "weakAnd"}userInput(@query) '
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
+ _build_vespa_limit(num_to_retrieve)
)

final_query = query_processing(query) if edit_keyword_query else query
Expand All @@ -615,7 +612,7 @@ def keyword_retrieval(
"query": final_query,
"input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
"hits": num_to_retrieve,
"num_to_rerank": 10 * num_to_retrieve,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was never doing anything :/

"offset": 0,
"ranking.profile": "keyword_search",
}

Expand All @@ -640,7 +637,6 @@ def semantic_retrieval(
# needed for highlighting while the N-gram highlighting is broken /
# not working as desired
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
+ _build_vespa_limit(num_to_retrieve)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as testing shows, putting the limit in the YQL versus in the request parameters behaves the same; it feels easier/cleaner to just have it in the params.

)

query_embedding = embed_query(query)
Expand All @@ -649,11 +645,13 @@ def semantic_retrieval(
" ".join(remove_stop_words(query)) if edit_keyword_query else query
)

params = {
params: dict[str, str | int] = {
"yql": yql,
"query": query_keywords,
"query": query_keywords, # Needed for highlighting
"input.query(query_embedding)": str(query_embedding),
"input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
"hits": num_to_retrieve,
"offset": 0,
"ranking.profile": "semantic_search",
}

Expand All @@ -668,8 +666,35 @@ def hybrid_retrieval(
distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
) -> list[InferenceChunk]:
# TODO introduce the real hybrid search
return self.semantic_retrieval(query, filters, favor_recent, num_to_retrieve)
decay_multiplier = FAVOR_RECENT_DECAY_MULTIPLIER if favor_recent else 1
vespa_where_clauses = _build_vespa_filters(filters)
# Needs to be at least as large as the rerank-count (1000) set in the Vespa schema config
target_hits = max(10 * num_to_retrieve, 1000)
yql = (
VespaIndex.yql_base
+ vespa_where_clauses
+ f"(({{targetHits: {target_hits}}}nearestNeighbor(embeddings, query_embedding)) "
+ 'or ({grammar: "weakAnd"}userInput(@query)) '
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
)

query_embedding = embed_query(query)

query_keywords = (
" ".join(remove_stop_words(query)) if edit_keyword_query else query
)

params: dict[str, str | int] = {
"yql": yql,
"query": query_keywords,
"input.query(query_embedding)": str(query_embedding),
"input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
"hits": num_to_retrieve,
"offset": 0,
"ranking.profile": "hybrid_search",
}

return _query_vespa(params)

def admin_retrieval(
self,
Expand All @@ -686,14 +711,13 @@ def admin_retrieval(
# needed for highlighting while the N-gram highlighting is broken /
# not working as desired
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
+ _build_vespa_limit(num_to_retrieve)
)

params: dict[str, str | int] = {
"yql": yql,
"query": query,
"hits": num_to_retrieve,
"num_to_rerank": 10 * num_to_retrieve,
"offset": 0,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set all of these to 0 just so we remember later how to do it if we decide to introduce pagination.

"ranking.profile": "admin_search",
}

Expand Down