Skip to content

Commit

Permalink
Merge pull request #1551 from vespa-engine/andreer/stopword-removal
Browse files Browse the repository at this point in the history
remove stopwords
  • Loading branch information
ldalves authored Oct 30, 2024
2 parents 8065b0e + fadaf4e commit c51b318
Show file tree
Hide file tree
Showing 4 changed files with 459 additions and 3 deletions.
17 changes: 17 additions & 0 deletions visual-retrieval-colpali/backend/stopwords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import spacy
import os

# Name of the spaCy pipeline package used for stopword detection.
_MODEL_NAME = "en_core_web_sm"

# Import-time bootstrap: if the pipeline is not installed as a package,
# download it via spaCy's CLI helper, then load it once so every caller
# shares the same module-level `nlp` object.
if not spacy.util.is_package(_MODEL_NAME):
    spacy.cli.download(_MODEL_NAME)

nlp = spacy.load(_MODEL_NAME)

# It would be possible to remove the bolding of stopwords without removing them from the query,
# but that would require a Java plugin, which we did not want to complicate this sample app with.
def filter(text):
    """Return ``text`` with stopword tokens removed.

    The text is run through the module-level ``nlp`` pipeline and every
    token flagged ``is_stop`` is dropped. The surviving token texts are
    joined with single spaces, so the original spacing between tokens is
    not preserved.

    If every token is a stopword (or there are no tokens at all), the
    original ``text`` is returned unchanged so the caller still has a query.

    NOTE: inside this module the name shadows the builtin ``filter``; it is
    kept because callers refer to it by this name.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept.append(tok.text)

    # Nothing left after filtering means there would be no query at all,
    # so fall back to the unmodified input.
    return " ".join(kept) if kept else text
6 changes: 5 additions & 1 deletion visual-retrieval-colpali/backend/vespa_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from vespa.application import Vespa
from vespa.io import VespaQueryResponse
from .colpali import should_filter_token

import backend.stopwords

class VespaQueryClient:
MAX_QUERY_TERMS = 64
Expand Down Expand Up @@ -275,6 +275,10 @@ async def get_result_from_query(
Returns:
Dict[str, Any]: The query results.
"""

# Remove stopwords from the query to avoid visual emphasis on irrelevant words (e.g., "the", "and", "of")
query = backend.stopwords.filter(query)

rank_method = ranking.split("_")[0]
sim_map: bool = len(ranking.split("_")) > 1 and ranking.split("_")[1] == "sim"
if rank_method == "nn+colpali":
Expand Down
4 changes: 3 additions & 1 deletion visual-retrieval-colpali/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ dependencies = [
"setuptools",
"python-dotenv",
"shad4fast>=1.2.1",
"google-generativeai>=0.7.2"
"google-generativeai>=0.7.2",
"spacy",
"pip"
]

# dev-dependencies
Expand Down
Loading

0 comments on commit c51b318

Please sign in to comment.