Skip to content

Commit

Permalink
Display second-best match if best match is not in title, sentence, or…
Browse files Browse the repository at this point in the history
… entity
  • Loading branch information
memgonzales committed Sep 5, 2023
1 parent 410cbaf commit 35cc1d9
Showing 1 changed file with 42 additions and 17 deletions.
59 changes: 42 additions & 17 deletions callbacks/text_mining/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,25 @@ def find_index_space_after(index, text):
return index


def display_aligned_substring_in_bold(text, similarity):
    """Return ``text`` with the aligned substring wrapped in ``<b>`` tags.

    The bold span runs from ``find_index_space_before(similarity.dest_start,
    text)`` to ``find_index_space_after(similarity.dest_end, text)``.

    Args:
        text: String that contains the aligned substring.
        similarity: Alignment result exposing ``dest_start`` and ``dest_end``
            offsets into ``text``.

    Returns:
        The text before the span, the span enclosed in ``<b>...</b>``, and the
        text after it. A single space is inserted outside the tags on a side
        where the span borders a space.
    """
    # Resolve each boundary once instead of re-running the helpers per slice.
    start = find_index_space_before(similarity.dest_start, text)
    end = find_index_space_after(similarity.dest_end, text)

    before_match = text[:start]
    match = text[start:end]
    after_match = text[end:]

    parts = [before_match]
    # endswith/startswith are safe on empty strings, so a match that is empty
    # or that reaches the end of the text no longer raises IndexError.
    # The former `before_match[:-1] == ' '` dropped the last character instead
    # of inspecting it; the intent is "the prefix ends with a space".
    if before_match.endswith(' ') or match.startswith(' '):
        parts.append(' ')
    parts.append(f'<b>{match}</b>')
    if after_match.startswith(' '):
        parts.append(' ')
    parts.append(after_match)

    return ''.join(parts)


def text_mining_query_search(query_string):
# Make case-insensitive and remove starting and trailing spaces
query_string = query_string.lower().strip()
Expand All @@ -90,20 +109,7 @@ def text_mining_query_search(query_string):

try:
# Display the matching substring in bold
before_match = line[:find_index_space_before(
similarity.dest_start, line)]
match = line[find_index_space_before(
similarity.dest_start, line):find_index_space_after(similarity.dest_end, line)]
after_match = line[find_index_space_after(
similarity.dest_end, line):]

line = before_match
if before_match[:-1] == ' ' or match[0] == ' ':
line += ' '
line += f'<b>{match}</b>'
if after_match[0] == ' ':
line += ' '
line += after_match
line = display_aligned_substring_in_bold(line, similarity)

if similarity.score > 0:
try:
Expand Down Expand Up @@ -141,13 +147,32 @@ def text_mining_query_search(query_string):
Entity = addl_sanitize_for_bold(Entity)
Type = addl_sanitize_for_bold(Type)

if PMID == '34199720':
print(Sentence)

if Type == 'Gene':
if Sentence == 'None':
Sentence = Title

# If the aligned substring falls inside Type, none of the fields shown to the user contains a bold substring
# In that case, align the query again against Title, Sentence, and Entity only (excluding Type) and bold the best match
if '<b>' not in Title and '<b>' not in Sentence and '<b>' not in Entity:
title_sim = rapidfuzz.fuzz.partial_ratio_alignment(
query_string, Title.lower(), score_cutoff=SIMILARITY_CUTOFF)
sentence_sim = rapidfuzz.fuzz.partial_ratio_alignment(
query_string, Sentence.lower(), score_cutoff=SIMILARITY_CUTOFF)
entity_sim = rapidfuzz.fuzz.partial_ratio_alignment(
query_string, Entity.lower(), score_cutoff=SIMILARITY_CUTOFF)

max_sim = max(title_sim.score,
sentence_sim.score, entity_sim.score)
if max_sim == title_sim.score:
Title = display_aligned_substring_in_bold(
Title, title_sim)
elif max_sim == sentence_sim.score:
Sentence = display_aligned_substring_in_bold(
Sentence, sentence_sim)
else:
Entity = display_aligned_substring_in_bold(
Entity, entity_sim)

pubmed_matches.add(PMID)
df.loc[len(df.index)] = [Entity, PMID,
Title, Sentence, similarity.score]
Expand Down

0 comments on commit 35cc1d9

Please sign in to comment.