Skip to content

Commit

Permalink
Merge pull request #970 from ecds/feature/951-exact-search
Browse files Browse the repository at this point in the history
Search with exact matches using double quotes (#951)
  • Loading branch information
jayvarner authored Nov 21, 2023
2 parents 7795a57 + 10ffc68 commit 4c81e97
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 45 deletions.
15 changes: 0 additions & 15 deletions apps/cms/templatetags/readux_templatetags.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,3 @@
def order_by(queryset, args):
    """Template filter to order a queryset by a comma-separated list of fields.

    Example use::

        {{ myqueryset|order_by:"field_a, -field_b" }}

    :param queryset: object exposing an ``order_by(*fields)`` method
    :param args: comma-separated field names; whitespace around each name
        is ignored
    :returns: the result of ``queryset.order_by(*fields)``, or ``queryset``
        itself when ``args`` contains no field names
    """
    # Drop blank segments (trailing comma, doubled comma, empty argument) so
    # an empty field name is never handed to ``order_by``.
    fields = [name for name in (part.strip() for part in args.split(",")) if name]
    if not fields:
        # nothing to sort on: leave the existing ordering untouched
        return queryset
    return queryset.order_by(*fields)


@register.filter
def dict_item(dictionary, key):
    """Template filter to allow accessing dictionary value by variable key.

    Example use::

        {{ mydict|dict_item:keyvar }}

    :param dictionary: object supporting item access (``obj[key]``)
    :param key: key (or index) to look up
    :returns: ``dictionary[key]``, or ``None`` when the lookup cannot be done
    """
    # adapted from Princeton-CDH/geniza project https://github.com/Princeton-CDH/geniza/
    try:
        return dictionary[key]
    except (LookupError, TypeError, AttributeError):
        # Fail silently: LookupError covers a missing key or out-of-range
        # index; TypeError covers a value that does not support item access
        # (e.g. None) or an unhashable key; AttributeError is kept from the
        # previous handler.
        return None
58 changes: 58 additions & 0 deletions apps/readux/templatetags/readux_extras.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from collections import OrderedDict
from django.template import Library

register = Library()


@register.filter
def has_inner_hits(volume):
    """Template filter to determine if there are any inner hits across the volume.

    :param volume: search result expected to expose ``meta.inner_hits``, whose
        ``to_dict()`` keys name the individual inner-hit sets
    :returns: ``True`` if any inner-hit set reports a truthy
        ``hits.total.value``; ``False`` otherwise, including when any of the
        expected attributes is missing
    """
    try:
        inner_hits = volume.meta.inner_hits
        # Evaluated inside the ``try`` on purpose: a missing attribute anywhere
        # along the chain means "no inner hits" rather than a template error.
        return any(
            inner_hits[name].hits.total.value for name in inner_hits.to_dict()
        )
    except AttributeError:
        return False


@register.filter
def group_by_canvas(inner_hits):
    """Template filter to group inner hits by canvas #, then flatten into list.

    :param inner_hits: object whose ``to_dict()`` maps each inner-hits name
        to an iterable of canvas hits; each hit is read for ``position``,
        ``pid`` and (optionally) ``meta.highlight["canvas_set.result"]``
    :returns: list of dicts with keys ``pid``, ``position``, ``highlights``
        (highlight fragments collected from every inner-hits set that matched
        the canvas) and ``search_terms`` (names of those inner-hits sets),
        in first-seen order
    """
    # number of canvases always kept before the "new search term" rule applies
    max_pages = 3

    # dict keyed on canvas position
    by_position = {}
    for term, hits in inner_hits.to_dict().items():
        for canvas in hits:
            entry = by_position.get(canvas.position)
            if entry is None:
                # only need to get some info once per canvas (position, pid)
                entry = by_position[canvas.position] = {
                    "pid": canvas.pid,
                    "position": canvas.position,
                    "highlights": [],
                    "search_terms": [term],
                }
            else:
                # keep track of search term for exact match queries
                entry["search_terms"].append(term)

            # collect highlights per canvas; use getattr so a hit without
            # ``meta`` or without ``meta.highlight`` is skipped instead of
            # raising AttributeError inside the template
            meta = getattr(canvas, "meta", None)
            highlight = getattr(meta, "highlight", None) if meta else None
            if highlight:
                entry["highlights"].extend(highlight["canvas_set.result"])

    # flatten values into list for display
    grouped = []
    for entry in by_position.values():
        # result should generally be of length 3 or less, but if there are
        # multiple exact queries in this search, ensure at least one
        # highlighted page per exact query (i.e. if none of the search terms
        # matched on this page are matched on any of the pages we've selected
        # for display, ensure this page gets displayed too)
        terms = set(entry["search_terms"])
        if len(grouped) < max_pages or all(
            terms.isdisjoint(shown["search_terms"]) for shown in grouped
        ):
            grouped.append(entry)
    return grouped
88 changes: 65 additions & 23 deletions apps/readux/views.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Django Views for the Readux app"""
import re
from os import path
from urllib.parse import urlencode
from django.http import HttpResponse
Expand Down Expand Up @@ -339,6 +340,9 @@ class VolumeSearchView(ListView, FormMixin):
"sort": "label_alphabetical"
}

# regex to match terms in doublequotes
re_exact_match = re.compile(r'\B(".+?")\B')

def get_form_kwargs(self):
# adapted from Princeton-CDH/geniza project https://github.com/Princeton-CDH/geniza/
kwargs = super().get_form_kwargs()
Expand Down Expand Up @@ -409,37 +413,75 @@ def get_queryset(self):
search_query = form_data.get("q") or ""
scope = form_data.get("scope") or "all"
if search_query:
queries = []
# find exact match queries (words or phrases in double quotes)
exact_queries = self.re_exact_match.findall(search_query)
# remove exact queries from the original search query to search separately
search_query = re.sub(self.re_exact_match , "", search_query).strip()

es_queries = []
es_queries_exact = []
if scope in ["all", "metadata"]:
# query for root level fields
multimatch_query = Q(
"multi_match", query=search_query, fields=self.query_search_fields
)
queries.append(multimatch_query)
if search_query:
multimatch_query = Q(
"multi_match", query=search_query, fields=self.query_search_fields
)
es_queries.append(multimatch_query)
for exq in exact_queries:
# separate exact searches so we can put them in "must" boolean query
multimatch_exact = Q(
"multi_match",
query=exq.replace('"', "").strip(), # strip double quotes
fields=self.query_search_fields,
type="phrase", # type = "phrase" for exact phrase matches
)
es_queries_exact.append({"bool": {"should": [multimatch_exact]}})

if scope in ["all", "text"]:
# query for nested fields (i.e. canvas position and text)
nested_query = Q(
"nested",
path="canvas_set",
query=Q(
"multi_match",
query=search_query,
fields=["canvas_set.result"],
),
inner_hits={
"name": "canvases",
"size": 3, # max number of pages shown in full-text results
"highlight": {"fields": {"canvas_set.result": {}}},
},
nested_kwargs = {
"path": "canvas_set",
# sum scores if in full text only search, so vols with most hits show up first.
# if also searching metadata, use avg (default) instead, to not over-inflate.
score_mode="sum" if scope == "text" else "avg",
)
queries.append(nested_query)
"score_mode": "sum" if scope == "text" else "avg",
}
inner_hits_dict = {
"size": 3, # max number of pages shown in full-text results
"highlight": {"fields": {"canvas_set.result": {}}},
}
if search_query:
nested_query = Q(
"nested",
query=Q(
"multi_match",
query=search_query,
fields=["canvas_set.result"],
),
inner_hits={ **inner_hits_dict, "name": "canvases" },
**nested_kwargs,
)
es_queries.append(nested_query)
for i, exq in enumerate(exact_queries):
# separate exact searches so we can put them in "must" boolean query
nested_exact = Q(
"nested",
query=Q(
"multi_match",
query=exq.replace('"', "").strip(),
fields=["canvas_set.result"],
type="phrase",
),
# each inner_hits set needs to have a different name in elasticsearch
inner_hits={ **inner_hits_dict, "name": f"canvases_{i}" },
**nested_kwargs,
)
if scope == "all":
es_queries_exact[i]["bool"]["should"].append(nested_exact)
else:
es_queries_exact.append({"bool": {"should": [nested_exact]}})

# combine them with bool: { should }
q = Q("bool", should=queries)
# combine them with bool: { should, must }
q = Q("bool", should=es_queries, must=es_queries_exact)
volumes = volumes.query(q)

# highlight
Expand Down
3 changes: 2 additions & 1 deletion apps/templates/search_results.html
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ <h1 class="uk-heading-medium uk-text-center">Search</h1>
</fieldset>
<span class="uk-text-small">
Search for individual whole keywords. Multiple words will be searched as
'or' (e.g. Rome London = Rome or London).
'or' (e.g. Rome London = Rome or London). Surround a word or phrase in
double quotes (e.g. "Roman painter") to require exact matches in results.
</span>
<fieldset class="uk-margin uk-width-1-1">
<div class="uk-form-label">{{ form.sort.label }}</div>
Expand Down
12 changes: 6 additions & 6 deletions apps/templates/snippets/volume_result.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% load readux_templatetags %}
{% load readux_extras %}

<li class="uk-width-1-1@m uk-margin-small">
<h4>
Expand Down Expand Up @@ -73,15 +73,15 @@ <h4>
{% endif %}
</dd>
{% endif %}
{% if 'inner_hits' in volume.meta and volume.meta.inner_hits.canvases.hits.total.value %}
<dt>Full Text</dt>
{% for canvas in volume.meta.inner_hits.canvases %}
{% if volume|has_inner_hits %}
<dt>Full Text</dt>
{% for canvas in volume.meta.inner_hits|group_by_canvas %}
<dd class="result-page">
<a href="{% url 'page' volume=volume.pid page=canvas.pid %}">
<span class="page-number">p. {{ canvas.position|add:1 }}</span>
{% if canvas.meta.highlight %}
{% if canvas.highlights|length %}
<ul class="highlights">
{% for fragment in canvas.meta.highlight|dict_item:"canvas_set.result" %}
{% for fragment in canvas.highlights %}
<li>{{ fragment|safe }}</li>
{% endfor %}
</ul>
Expand Down

0 comments on commit 4c81e97

Please sign in to comment.