diff --git a/callbacks/text_mining/callbacks.py b/callbacks/text_mining/callbacks.py index c2e54cf5..b2223921 100644 --- a/callbacks/text_mining/callbacks.py +++ b/callbacks/text_mining/callbacks.py @@ -120,19 +120,13 @@ def display_text_mining_results(text_mining_is_submitted, homepage_submitted, te columns = [{'id': x, 'name': x, 'presentation': 'markdown'} for x in text_mining_results_df.columns] - num_entries = get_num_entries(text_mining_results_df, "PMID") num_unique_entries = get_num_unique_entries( text_mining_results_df, "PMID") - if num_entries == 1: - stats = f'Found {num_entries} match ' - else: - stats = f'Found {num_entries} matches ' - if num_unique_entries == 1: - stats += f'across {num_unique_entries} publication' + stats = f'Found matches across {num_unique_entries} publication' else: - stats += f'across {num_unique_entries} publications' + stats = f'Found matches across {num_unique_entries} publications' return text_mining_results_df.to_dict('records'), columns, stats diff --git a/callbacks/text_mining/util.py b/callbacks/text_mining/util.py index 0cf2e0a4..870d740e 100644 --- a/callbacks/text_mining/util.py +++ b/callbacks/text_mining/util.py @@ -49,6 +49,7 @@ def addl_sanitize_gene(text): def addl_sanitize_for_bold(text): text = re.sub(r'', text) + text = re.sub(r'<\s+/\s+i>', '', text) return text @@ -67,6 +68,25 @@ def find_index_space_after(index, text): return index +def display_aligned_substring_in_bold(text, similarity): + before_match = text[:find_index_space_before( + similarity.dest_start, text)] + match = text[find_index_space_before( + similarity.dest_start, text):find_index_space_after(similarity.dest_end, text)] + after_match = text[find_index_space_after( + similarity.dest_end, text):] + + text = before_match + if before_match[:-1] == ' ' or match[0] == ' ': + text += ' ' + text += f'{match}' + if after_match[0] == ' ': + text += ' ' + text += after_match + + return text + + def text_mining_query_search(query_string): # Make case-insensitive and remove starting and trailing spaces query_string = query_string.lower().strip() @@ -80,6 +100,7 @@ def text_mining_query_search(query_string): return pd.read_csv(text_mining_path) df = pd.DataFrame(columns=COLNAMES) + pubmed_matches = set() with open(Constants.TEXT_MINING_ANNOTATED_ABSTRACTS, 'r', encoding='utf8') as f: for line in f: @@ -88,25 +109,13 @@ def text_mining_query_search(query_string): try: # Display the matching substring in bold - before_match = line[:find_index_space_before( - similarity.dest_start, line)] - match = line[find_index_space_before( - similarity.dest_start, line):find_index_space_after(similarity.dest_end, line)] - after_match = line[find_index_space_after( - similarity.dest_end, line):] - - line = before_match - if before_match[:-1] == ' ' or match[0] == ' ': - line += ' ' - line += f'{match}' - if after_match[0] == ' ': - line += ' ' - line += after_match + line = display_aligned_substring_in_bold(line, similarity) if similarity.score > 0: try: PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map( sanitize_text, line.split('\t')) + except Exception as e: while True: # Sometimes there is a newline in the abstract, which causes a literal line break @@ -121,10 +130,14 @@ def text_mining_query_search(query_string): try: PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map( sanitize_text, line.split('\t')) + break except: pass + if PMID in pubmed_matches: + continue + Entity = addl_sanitize_gene(Entity) Title = Title[:-1] @@ -137,6 +150,30 @@ def text_mining_query_search(query_string): if Type == 'Gene': if Sentence == 'None': Sentence = Title + + # If the Type is the one containing the aligned substring, then no bold substring is displayed to the user + # We resort to parsing the string again, but, this time, excluding the Type + if '' not in Title and '' not in Sentence and '' not in Entity: + title_sim = rapidfuzz.fuzz.partial_ratio_alignment( + query_string, Title.lower(), score_cutoff=SIMILARITY_CUTOFF) + sentence_sim = rapidfuzz.fuzz.partial_ratio_alignment( + query_string, Sentence.lower(), score_cutoff=SIMILARITY_CUTOFF) + entity_sim = rapidfuzz.fuzz.partial_ratio_alignment( + query_string, Entity.lower(), score_cutoff=SIMILARITY_CUTOFF) + + max_sim = max(title_sim.score, + sentence_sim.score, entity_sim.score) + if max_sim == title_sim.score: + Title = display_aligned_substring_in_bold( + Title, title_sim) + elif max_sim == sentence_sim.score: + Sentence = display_aligned_substring_in_bold( + Sentence, sentence_sim) + else: + Entity = display_aligned_substring_in_bold( + Entity, entity_sim) + + pubmed_matches.add(PMID) df.loc[len(df.index)] = [Entity, PMID, Title, Sentence, similarity.score]