Allow fuzzy matching for text mining results

bioinfodlsu · Sep 2, 2023 · 5de6852 · 5de6852
1 parent 8a26042
commit 5de6852
Showing 1 changed file with 68 additions and 16 deletions.
diff --git a/callbacks/text_mining/util.py b/callbacks/text_mining/util.py
@@ -4,10 +4,13 @@
 from ..links_util import *
 import regex as re
 import ftfy
+import rapidfuzz
+
 from ..file_util import *
 
 
 COLNAMES = ['Gene', 'PMID', 'Title', 'Sentence', 'Score']
+# Passed as score_cutoff to rapidfuzz.fuzz.partial_ratio_alignment when each
+# line of the annotated abstracts is compared with the query.
+# NOTE(review): presumably on rapidfuzz's 0-100 similarity scale - confirm
+# against the rapidfuzz documentation.
+SIMILARITY_CUTOFF = 70
 
 
 def sanitize_text(text):
@@ -43,7 +46,30 @@ def addl_sanitize_gene(text):
     return text
 
 
+def addl_sanitize_for_bold(text):
+    """Repair a closing bold tag that lost its final '>' at the end of text.
+
+    A trailing '</b' is rewritten to '</b>' (the '$' anchor also matches just
+    before a final newline); any other text is returned unchanged.
+    """
+    return re.sub(r'</b$', '</b>', text)
+
+
+def find_index_space_before(index, text):
+    """Return the position of the nearest space at or before index.
+
+    Scans text leftwards starting at index. Position 0 is never inspected:
+    when no space is found at positions index..1 the result is 0. An index
+    that is already 0 or negative is returned unchanged.
+    """
+    for position in range(index, 0, -1):
+        if text[position] == ' ':
+            return position
+
+    # No space found: a positive start collapses to 0, anything else is
+    # handed back as-is.
+    return min(index, 0)
+
+
+def find_index_space_after(index, text):
+    """Return the position of the nearest space at or after index.
+
+    Scans text rightwards starting at index. The last character is never
+    inspected: when no space is found before it, the result is
+    len(text) - 1. An index already at or past the last position is
+    returned unchanged.
+    """
+    last_position = len(text) - 1
+    for position in range(index, last_position):
+        if text[position] == ' ':
+            return position
+
+    # No space found: stop on the last character, or keep a start that was
+    # already at or beyond it.
+    return max(index, last_position)
+
+
 def text_mining_query_search(query_string):
+    # Make case-insensitive and remove starting and trailing spaces
+    query_string = query_string.lower().strip()
+
     text_mining_path = get_path_to_text_mining_temp(Constants.TEMP_TEXT_MINING)
     make_dir(text_mining_path)
 
@@ -52,31 +78,57 @@ def text_mining_query_search(query_string):
     if path_exists(text_mining_path):
         return pd.read_csv(text_mining_path)
 
-    query_string = query_string.strip()
     df = pd.DataFrame(columns=COLNAMES)
-    query_regex = re.compile(re.escape(query_string), re.IGNORECASE)
+
     with open(Constants.TEXT_MINING_ANNOTATED_ABSTRACTS, 'r', encoding='utf8') as f:
         for line in f:
-            if re.search(query_regex, line):
-                PMID, Title, Sentence, IsInTitle, Entity, Annotations, Type, start_pos, end_pos, score = map(sanitize_text, line.split(
-                    '\t'))
-                Entity = addl_sanitize_gene(Entity)
-                Title = Title[:-1]
-
-                if Type == 'Gene':
-                    if Sentence == 'None':
-                        Sentence = Title
-                    df.loc[len(df.index)] = [
-                        Entity, PMID, Title, Sentence, score]
+            similarity = rapidfuzz.fuzz.partial_ratio_alignment(
+                query_string, line.lower(), score_cutoff=SIMILARITY_CUTOFF)
+
+            try:
+                # Display the matching substring in bold
+                before_match = line[:find_index_space_before(
+                    similarity.dest_start, line)]
+                match = line[find_index_space_before(
+                    similarity.dest_start, line):find_index_space_after(similarity.dest_end, line)]
+                after_match = line[find_index_space_after(
+                    similarity.dest_end, line):]
+
+                line = before_match
+                if before_match[:-1] == ' ' or match[0] == ' ':
+                    line += ' '
+                line += f'<b>{match}</b>'
+                if after_match[0] == ' ':
+                    line += ' '
+                line += after_match
+
+                if similarity.score > 0:
+                    PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map(
+                        sanitize_text, line.split('\t'))
+                    Entity = addl_sanitize_gene(Entity)
+                    Title = Title[:-1]
+
+                    PMID = addl_sanitize_for_bold(PMID)
+                    Title = addl_sanitize_for_bold(Title)
+                    Sentence = addl_sanitize_for_bold(Sentence)
+                    Entity = addl_sanitize_for_bold(Entity)
+                    Type = addl_sanitize_for_bold(Type)
+
+                    if Type == 'Gene':
+                        if Sentence == 'None':
+                            Sentence = Title
+                        df.loc[len(df.index)] = [Entity, PMID,
+                                                 Title, Sentence, similarity.score]
+            except:
+                pass
 
     df['PMID'] = get_pubmed_link(df, 'PMID')
-
     df = df.sort_values('Score', ascending=False)
-
-    display_cols_in_fixed_dec_places(df, ['Score'])
+    df = df.drop(columns=['Score'])
 
     if len(df.index) == 0:
         df = create_empty_df_with_cols(COLNAMES)
+        df = df.drop(columns=['Score'])
 
     df.to_csv(f'{text_mining_path}', index=False)