Merge pull request #109 from bioinfodlsu/hot_fix

Display only unique PubMed articles in text-mining results
bioinfodlsu · Sep 5, 2023 · 0f45de8 · 0f45de8
2 parents ce0dd2f + 2082a5a
commit 0f45de8
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 22 deletions.
diff --git a/callbacks/text_mining/callbacks.py b/callbacks/text_mining/callbacks.py
@@ -120,19 +120,13 @@ def display_text_mining_results(text_mining_is_submitted, homepage_submitted, te
                 columns = [{'id': x, 'name': x, 'presentation': 'markdown'}
                            for x in text_mining_results_df.columns]
 
-                num_entries = get_num_entries(text_mining_results_df, "PMID")
                 num_unique_entries = get_num_unique_entries(
                     text_mining_results_df, "PMID")
 
-                if num_entries == 1:
-                    stats = f'Found {num_entries} match '
-                else:
-                    stats = f'Found {num_entries} matches '
-
                 if num_unique_entries == 1:
-                    stats += f'across {num_unique_entries} publication'
+                    stats = f'Found matches across {num_unique_entries} publication'
                 else:
-                    stats += f'across {num_unique_entries} publications'
+                    stats = f'Found matches across {num_unique_entries} publications'
 
                 return text_mining_results_df.to_dict('records'), columns, stats
 

diff --git a/callbacks/text_mining/util.py b/callbacks/text_mining/util.py
@@ -49,6 +49,7 @@ def addl_sanitize_gene(text):
 
 def addl_sanitize_for_bold(text):
     text = re.sub(r'</b$', '</b>', text)
+    text = re.sub(r'<</b>\s+/\s+i>', '</b></i>', text)  # rewrite a mangled "<</b> / i>" as "</b></i>"
 
     return text
 
@@ -67,6 +68,25 @@ def find_index_space_after(index, text):
     return index
 
 
+def display_aligned_substring_in_bold(text, similarity):
+    """Wrap the span of text aligned by similarity (dest_start/dest_end) in <b>."""
+    # Span edges come from the file's find_index_space_* helpers, computed once.
+    start = find_index_space_before(similarity.dest_start, text)
+    end = find_index_space_after(similarity.dest_end, text)
+    before_match, match, after_match = text[:start], text[start:end], text[end:]
+
+    text = before_match
+    # Keep a space outside the tag when one borders the match. endswith/startswith
+    # tolerate empty pieces (match at either end of text), where [0] would raise.
+    if before_match.endswith(' ') or match.startswith(' '):
+        text += ' '
+    text += f'<b>{match}</b>'
+    if after_match.startswith(' '):
+        text += ' '
+    text += after_match
+    return text
+
+
 def text_mining_query_search(query_string):
     # Make case-insensitive and remove starting and trailing spaces
     query_string = query_string.lower().strip()
@@ -80,6 +100,7 @@ def text_mining_query_search(query_string):
         return pd.read_csv(text_mining_path)
 
     df = pd.DataFrame(columns=COLNAMES)
+    pubmed_matches = set()
 
     with open(Constants.TEXT_MINING_ANNOTATED_ABSTRACTS, 'r', encoding='utf8') as f:
         for line in f:
@@ -88,25 +109,13 @@ def text_mining_query_search(query_string):
 
             try:
                 # Display the matching substring in bold
-                before_match = line[:find_index_space_before(
-                    similarity.dest_start, line)]
-                match = line[find_index_space_before(
-                    similarity.dest_start, line):find_index_space_after(similarity.dest_end, line)]
-                after_match = line[find_index_space_after(
-                    similarity.dest_end, line):]
-
-                line = before_match
-                if before_match[:-1] == ' ' or match[0] == ' ':
-                    line += ' '
-                line += f'<b>{match}</b>'
-                if after_match[0] == ' ':
-                    line += ' '
-                line += after_match
+                line = display_aligned_substring_in_bold(line, similarity)
 
                 if similarity.score > 0:
                     try:
                         PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map(
                             sanitize_text, line.split('\t'))
+
                     except Exception as e:
                         while True:
                             # Sometimes there is a newline in the abstract, which causes a literal line break
@@ -121,10 +130,14 @@ def text_mining_query_search(query_string):
                             try:
                                 PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map(
                                     sanitize_text, line.split('\t'))
+
                                 break
                             except:
                                 pass
 
+                    if PMID in pubmed_matches:
+                        continue
+
                     Entity = addl_sanitize_gene(Entity)
                     Title = Title[:-1]
 
@@ -137,6 +150,30 @@ def text_mining_query_search(query_string):
                     if Type == 'Gene':
                         if Sentence == 'None':
                             Sentence = Title
+
+                        # If the aligned substring falls in the Type field, none of the fields shown to the user contains bold text.
+                        # In that case, align the query against Title, Sentence, and Entity separately and bold the best-scoring one.
+                        if '<b>' not in Title and '<b>' not in Sentence and '<b>' not in Entity:
+                            title_sim = rapidfuzz.fuzz.partial_ratio_alignment(
+                                query_string, Title.lower(), score_cutoff=SIMILARITY_CUTOFF)
+                            sentence_sim = rapidfuzz.fuzz.partial_ratio_alignment(
+                                query_string, Sentence.lower(), score_cutoff=SIMILARITY_CUTOFF)
+                            entity_sim = rapidfuzz.fuzz.partial_ratio_alignment(
+                                query_string, Entity.lower(), score_cutoff=SIMILARITY_CUTOFF)
+
+                            max_sim = max(title_sim.score,
+                                          sentence_sim.score, entity_sim.score)
+                            if max_sim == title_sim.score:
+                                Title = display_aligned_substring_in_bold(
+                                    Title, title_sim)
+                            elif max_sim == sentence_sim.score:
+                                Sentence = display_aligned_substring_in_bold(
+                                    Sentence, sentence_sim)
+                            else:
+                                Entity = display_aligned_substring_in_bold(
+                                    Entity, entity_sim)
+
+                        pubmed_matches.add(PMID)
                         df.loc[len(df.index)] = [Entity, PMID,
                                                  Title, Sentence, similarity.score]