Skip to content

Commit

Permalink
Merge pull request #109 from bioinfodlsu/hot_fix
Browse files Browse the repository at this point in the history
Display only unique PubMed articles in text-mining results
  • Loading branch information
pbong authored Sep 5, 2023
2 parents ce0dd2f + 2082a5a commit 0f45de8
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 22 deletions.
10 changes: 2 additions & 8 deletions callbacks/text_mining/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,19 +120,13 @@ def display_text_mining_results(text_mining_is_submitted, homepage_submitted, te
columns = [{'id': x, 'name': x, 'presentation': 'markdown'}
for x in text_mining_results_df.columns]

num_entries = get_num_entries(text_mining_results_df, "PMID")
num_unique_entries = get_num_unique_entries(
text_mining_results_df, "PMID")

if num_entries == 1:
stats = f'Found {num_entries} match '
else:
stats = f'Found {num_entries} matches '

if num_unique_entries == 1:
stats += f'across {num_unique_entries} publication'
stats = f'Found matches across {num_unique_entries} publication'
else:
stats += f'across {num_unique_entries} publications'
stats = f'Found matches across {num_unique_entries} publications'

return text_mining_results_df.to_dict('records'), columns, stats

Expand Down
65 changes: 51 additions & 14 deletions callbacks/text_mining/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def addl_sanitize_gene(text):

def addl_sanitize_for_bold(text):
    """Repair closing tags that ended up malformed around a bolded span.

    Two substitutions are applied, in this order:

    1. A ``</b`` at the end of the string (or just before a trailing
       newline) that is missing its ``>`` is completed to ``</b>``.
    2. The sequence ``<</b>``, whitespace, ``/``, whitespace, ``i>`` is
       rewritten as ``</b></i>``.

    Returns the repaired string; text matching neither pattern is returned
    unchanged.
    """
    repairs = (
        (r'</b$', '</b>'),
        (r'<</b>\s+/\s+i>', '</b></i>'),
    )

    # Order matters only in that it mirrors the sequence of substitutions
    # listed above; each pattern is applied to the output of the previous.
    for pattern, replacement in repairs:
        text = re.sub(pattern, replacement, text)

    return text

Expand All @@ -67,6 +68,25 @@ def find_index_space_after(index, text):
return index


def display_aligned_substring_in_bold(text, similarity):
    """Return ``text`` with its aligned span wrapped in ``<b>...</b>``.

    The span boundaries come from ``similarity.dest_start`` and
    ``similarity.dest_end``, adjusted by ``find_index_space_before`` and
    ``find_index_space_after`` respectively.

    Args:
        text: The string that contains the aligned span.
        similarity: Object exposing ``dest_start`` and ``dest_end`` offsets
            into ``text`` (callers in this file pass the result of
            ``rapidfuzz.fuzz.partial_ratio_alignment``).

    Returns:
        The text before the span, the span wrapped in bold tags, and the
        text after the span, with a single space inserted on either side of
        the tags when the adjacent piece starts with a space. An empty span
        or an empty remainder (match reaching the end of ``text``) is handled
        without raising.
    """
    # Compute each boundary once and reuse it for all three slices.
    # Assumes the two helpers are pure index lookups -- TODO confirm.
    start = find_index_space_before(similarity.dest_start, text)
    end = find_index_space_after(similarity.dest_end, text)

    before_match = text[:start]
    match = text[start:end]
    after_match = text[end:]

    bolded = before_match

    # NOTE(review): `before_match[:-1]` is "everything except the last
    # character", not "the last character"; `[-1:]` may have been intended.
    # Left unchanged here because switching it alters the emitted spacing.
    # `startswith` (rather than `[0]`) keeps an empty `match` from raising.
    if before_match[:-1] == ' ' or match.startswith(' '):
        bolded += ' '

    bolded += f'<b>{match}</b>'

    # `startswith` is safe when the match runs to the end of `text` and
    # nothing follows it.
    if after_match.startswith(' '):
        bolded += ' '

    bolded += after_match

    return bolded


def text_mining_query_search(query_string):
# Make case-insensitive and remove starting and trailing spaces
query_string = query_string.lower().strip()
Expand All @@ -80,6 +100,7 @@ def text_mining_query_search(query_string):
return pd.read_csv(text_mining_path)

df = pd.DataFrame(columns=COLNAMES)
pubmed_matches = set()

with open(Constants.TEXT_MINING_ANNOTATED_ABSTRACTS, 'r', encoding='utf8') as f:
for line in f:
Expand All @@ -88,25 +109,13 @@ def text_mining_query_search(query_string):

try:
# Display the matching substring in bold
before_match = line[:find_index_space_before(
similarity.dest_start, line)]
match = line[find_index_space_before(
similarity.dest_start, line):find_index_space_after(similarity.dest_end, line)]
after_match = line[find_index_space_after(
similarity.dest_end, line):]

line = before_match
if before_match[:-1] == ' ' or match[0] == ' ':
line += ' '
line += f'<b>{match}</b>'
if after_match[0] == ' ':
line += ' '
line += after_match
line = display_aligned_substring_in_bold(line, similarity)

if similarity.score > 0:
try:
PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map(
sanitize_text, line.split('\t'))

except Exception as e:
while True:
# Sometimes there is a newline in the abstract, which causes a literal line break
Expand All @@ -121,10 +130,14 @@ def text_mining_query_search(query_string):
try:
PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map(
sanitize_text, line.split('\t'))

break
except:
pass

if PMID in pubmed_matches:
continue

Entity = addl_sanitize_gene(Entity)
Title = Title[:-1]

Expand All @@ -137,6 +150,30 @@ def text_mining_query_search(query_string):
if Type == 'Gene':
if Sentence == 'None':
Sentence = Title

# If the Type is the one containing the aligned substring, then no bold substring is displayed to the user
# We resort to parsing the string again, but, this time, excluding the Type
if '<b>' not in Title and '<b>' not in Sentence and '<b>' not in Entity:
title_sim = rapidfuzz.fuzz.partial_ratio_alignment(
query_string, Title.lower(), score_cutoff=SIMILARITY_CUTOFF)
sentence_sim = rapidfuzz.fuzz.partial_ratio_alignment(
query_string, Sentence.lower(), score_cutoff=SIMILARITY_CUTOFF)
entity_sim = rapidfuzz.fuzz.partial_ratio_alignment(
query_string, Entity.lower(), score_cutoff=SIMILARITY_CUTOFF)

max_sim = max(title_sim.score,
sentence_sim.score, entity_sim.score)
if max_sim == title_sim.score:
Title = display_aligned_substring_in_bold(
Title, title_sim)
elif max_sim == sentence_sim.score:
Sentence = display_aligned_substring_in_bold(
Sentence, sentence_sim)
else:
Entity = display_aligned_substring_in_bold(
Entity, entity_sim)

pubmed_matches.add(PMID)
df.loc[len(df.index)] = [Entity, PMID,
Title, Sentence, similarity.score]

Expand Down

0 comments on commit 0f45de8

Please sign in to comment.