From 410cbaf01fcb1d457684f7053f5f171d2c459438 Mon Sep 17 00:00:00 2001
From: memgonzales <gonzales.markedward@gmail.com>
Date: Tue, 5 Sep 2023 12:04:07 +0800
Subject: [PATCH 1/2] Display only unique PubMed articles

---
 callbacks/text_mining/callbacks.py | 10 ++--------
 callbacks/text_mining/util.py      | 12 ++++++++++++
 2 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/callbacks/text_mining/callbacks.py b/callbacks/text_mining/callbacks.py
index c2e54cf5..b2223921 100644
--- a/callbacks/text_mining/callbacks.py
+++ b/callbacks/text_mining/callbacks.py
@@ -120,19 +120,13 @@ def display_text_mining_results(text_mining_is_submitted, homepage_submitted, te
                 columns = [{'id': x, 'name': x, 'presentation': 'markdown'}
                            for x in text_mining_results_df.columns]
 
-                num_entries = get_num_entries(text_mining_results_df, "PMID")
                 num_unique_entries = get_num_unique_entries(
                     text_mining_results_df, "PMID")
 
-                if num_entries == 1:
-                    stats = f'Found {num_entries} match '
-                else:
-                    stats = f'Found {num_entries} matches '
-
                 if num_unique_entries == 1:
-                    stats += f'across {num_unique_entries} publication'
+                    stats = f'Found matches across {num_unique_entries} publication'
                 else:
-                    stats += f'across {num_unique_entries} publications'
+                    stats = f'Found matches across {num_unique_entries} publications'
 
                 return text_mining_results_df.to_dict('records'), columns, stats
 
diff --git a/callbacks/text_mining/util.py b/callbacks/text_mining/util.py
index 0cf2e0a4..b8667704 100644
--- a/callbacks/text_mining/util.py
+++ b/callbacks/text_mining/util.py
@@ -49,6 +49,7 @@ def addl_sanitize_gene(text):
 
 def addl_sanitize_for_bold(text):
     text = re.sub(r'</b$', '</b>', text)
+    text = re.sub(r'<</b>\s+/\s+i>', '</b></i>', text)
 
     return text
 
@@ -80,6 +81,7 @@ def text_mining_query_search(query_string):
         return pd.read_csv(text_mining_path)
 
     df = pd.DataFrame(columns=COLNAMES)
+    pubmed_matches = set()
 
     with open(Constants.TEXT_MINING_ANNOTATED_ABSTRACTS, 'r', encoding='utf8') as f:
         for line in f:
@@ -107,6 +109,7 @@ def text_mining_query_search(query_string):
                     try:
                         PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map(
                             sanitize_text, line.split('\t'))
+
                     except Exception as e:
                         while True:
                             # Sometimes there is a newline in the abstract, which causes a literal line break
@@ -121,10 +124,14 @@ def text_mining_query_search(query_string):
                             try:
                                 PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map(
                                     sanitize_text, line.split('\t'))
+
                                 break
                             except:
                                 pass
 
+                    if PMID in pubmed_matches:
+                        continue
+
                     Entity = addl_sanitize_gene(Entity)
                     Title = Title[:-1]
 
@@ -134,9 +141,14 @@ def text_mining_query_search(query_string):
                     Entity = addl_sanitize_for_bold(Entity)
                     Type = addl_sanitize_for_bold(Type)
 
+                    if PMID == '34199720':
+                        print(Sentence)
+
                     if Type == 'Gene':
                         if Sentence == 'None':
                             Sentence = Title
+
+                        pubmed_matches.add(PMID)
                         df.loc[len(df.index)] = [Entity, PMID,
                                                  Title, Sentence, similarity.score]
 

From 35cc1d95069f44ca3b7539bd4d47dc924cae2f15 Mon Sep 17 00:00:00 2001
From: memgonzales <gonzales.markedward@gmail.com>
Date: Tue, 5 Sep 2023 12:45:59 +0800
Subject: [PATCH 2/2] Display second-best match if best match is not in title,
 sentence, or entity

---
 callbacks/text_mining/util.py | 59 +++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/callbacks/text_mining/util.py b/callbacks/text_mining/util.py
index b8667704..870d740e 100644
--- a/callbacks/text_mining/util.py
+++ b/callbacks/text_mining/util.py
@@ -68,6 +68,25 @@ def find_index_space_after(index, text):
     return index
 
 
+def display_aligned_substring_in_bold(text, similarity):
+    """Return text with the span aligned by similarity wrapped in <b> tags.
+
+    The span runs from find_index_space_before(similarity.dest_start, text)
+    to find_index_space_after(similarity.dest_end, text).
+    """
+    start = find_index_space_before(similarity.dest_start, text)
+    end = find_index_space_after(similarity.dest_end, text)
+    before_match, match, after_match = text[:start], text[start:end], text[end:]
+
+    # startswith/endswith tolerate an empty string: the span may touch either
+    # end of a short field (e.g. an Entity equal to the query), where indexing
+    # [0] or [-1] would raise IndexError.
+    lead = ' ' if before_match.endswith(' ') or match.startswith(' ') else ''
+    trail = ' ' if after_match.startswith(' ') else ''
+
+    return f'{before_match}{lead}<b>{match}</b>{trail}{after_match}'
+
+
 def text_mining_query_search(query_string):
     # Make case-insensitive and remove starting and trailing spaces
     query_string = query_string.lower().strip()
@@ -90,20 +109,7 @@ def text_mining_query_search(query_string):
 
             try:
                 # Display the matching substring in bold
-                before_match = line[:find_index_space_before(
-                    similarity.dest_start, line)]
-                match = line[find_index_space_before(
-                    similarity.dest_start, line):find_index_space_after(similarity.dest_end, line)]
-                after_match = line[find_index_space_after(
-                    similarity.dest_end, line):]
-
-                line = before_match
-                if before_match[:-1] == ' ' or match[0] == ' ':
-                    line += ' '
-                line += f'<b>{match}</b>'
-                if after_match[0] == ' ':
-                    line += ' '
-                line += after_match
+                line = display_aligned_substring_in_bold(line, similarity)
 
                 if similarity.score > 0:
                     try:
@@ -141,13 +147,32 @@ def text_mining_query_search(query_string):
                     Entity = addl_sanitize_for_bold(Entity)
                     Type = addl_sanitize_for_bold(Type)
 
-                    if PMID == '34199720':
-                        print(Sentence)
-
                     if Type == 'Gene':
                         if Sentence == 'None':
                             Sentence = Title
 
+                        # If the Type is the one containing the aligned substring, then no bold substring is displayed to the user
+                        # We resort to parsing the string again, but, this time, excluding the Type
+                        if '<b>' not in Title and '<b>' not in Sentence and '<b>' not in Entity:
+                            title_sim = rapidfuzz.fuzz.partial_ratio_alignment(
+                                query_string, Title.lower(), score_cutoff=SIMILARITY_CUTOFF)
+                            sentence_sim = rapidfuzz.fuzz.partial_ratio_alignment(
+                                query_string, Sentence.lower(), score_cutoff=SIMILARITY_CUTOFF)
+                            entity_sim = rapidfuzz.fuzz.partial_ratio_alignment(
+                                query_string, Entity.lower(), score_cutoff=SIMILARITY_CUTOFF)
+
+                            max_sim = max(title_sim.score,
+                                          sentence_sim.score, entity_sim.score)
+                            if max_sim == title_sim.score:
+                                Title = display_aligned_substring_in_bold(
+                                    Title, title_sim)
+                            elif max_sim == sentence_sim.score:
+                                Sentence = display_aligned_substring_in_bold(
+                                    Sentence, sentence_sim)
+                            else:
+                                Entity = display_aligned_substring_in_bold(
+                                    Entity, entity_sim)
+
                         pubmed_matches.add(PMID)
                         df.loc[len(df.index)] = [Entity, PMID,
                                                  Title, Sentence, similarity.score]