From 410cbaf01fcb1d457684f7053f5f171d2c459438 Mon Sep 17 00:00:00 2001 From: memgonzales Date: Tue, 5 Sep 2023 12:04:07 +0800 Subject: [PATCH 1/2] Display only unique PubMed articles --- callbacks/text_mining/callbacks.py | 10 ++-------- callbacks/text_mining/util.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/callbacks/text_mining/callbacks.py b/callbacks/text_mining/callbacks.py index c2e54cf5..b2223921 100644 --- a/callbacks/text_mining/callbacks.py +++ b/callbacks/text_mining/callbacks.py @@ -120,19 +120,13 @@ def display_text_mining_results(text_mining_is_submitted, homepage_submitted, te columns = [{'id': x, 'name': x, 'presentation': 'markdown'} for x in text_mining_results_df.columns] - num_entries = get_num_entries(text_mining_results_df, "PMID") num_unique_entries = get_num_unique_entries( text_mining_results_df, "PMID") - if num_entries == 1: - stats = f'Found {num_entries} match ' - else: - stats = f'Found {num_entries} matches ' - if num_unique_entries == 1: - stats += f'across {num_unique_entries} publication' + stats = f'Found matches across {num_unique_entries} publication' else: - stats += f'across {num_unique_entries} publications' + stats = f'Found matches across {num_unique_entries} publications' return text_mining_results_df.to_dict('records'), columns, stats diff --git a/callbacks/text_mining/util.py b/callbacks/text_mining/util.py index 0cf2e0a4..b8667704 100644 --- a/callbacks/text_mining/util.py +++ b/callbacks/text_mining/util.py @@ -49,6 +49,7 @@ def addl_sanitize_gene(text): def addl_sanitize_for_bold(text): text = re.sub(r'', text) + text = re.sub(r'<\s+/\s+i>', '', text) return text @@ -80,6 +81,7 @@ def text_mining_query_search(query_string): return pd.read_csv(text_mining_path) df = pd.DataFrame(columns=COLNAMES) + pubmed_matches = set() with open(Constants.TEXT_MINING_ANNOTATED_ABSTRACTS, 'r', encoding='utf8') as f: for line in f: @@ -107,6 +109,7 @@ def text_mining_query_search(query_string): try: PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map( sanitize_text, line.split('\t')) + except Exception as e: while True: # Sometimes there is a newline in the abstract, which causes a literal line break @@ -121,10 +124,14 @@ def text_mining_query_search(query_string): try: PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map( sanitize_text, line.split('\t')) + break except: pass + if PMID in pubmed_matches: + continue + Entity = addl_sanitize_gene(Entity) Title = Title[:-1] @@ -134,9 +141,14 @@ def text_mining_query_search(query_string): Entity = addl_sanitize_for_bold(Entity) Type = addl_sanitize_for_bold(Type) + if PMID == '34199720': + print(Sentence) + if Type == 'Gene': if Sentence == 'None': Sentence = Title + + pubmed_matches.add(PMID) df.loc[len(df.index)] = [Entity, PMID, Title, Sentence, similarity.score] From 35cc1d95069f44ca3b7539bd4d47dc924cae2f15 Mon Sep 17 00:00:00 2001 From: memgonzales Date: Tue, 5 Sep 2023 12:45:59 +0800 Subject: [PATCH 2/2] Display second-best match if best match is not in title, sentence, or entity --- callbacks/text_mining/util.py | 59 +++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/callbacks/text_mining/util.py b/callbacks/text_mining/util.py index b8667704..870d740e 100644 --- a/callbacks/text_mining/util.py +++ b/callbacks/text_mining/util.py @@ -68,6 +68,25 @@ def find_index_space_after(index, text): return index +def display_aligned_substring_in_bold(text, similarity): + before_match = text[:find_index_space_before( + similarity.dest_start, text)] + match = text[find_index_space_before( + similarity.dest_start, text):find_index_space_after(similarity.dest_end, text)] + after_match = text[find_index_space_after( + similarity.dest_end, text):] + + text = before_match + if before_match[:-1] == ' ' or match[0] == ' ': + text += ' ' + text += f'{match}' + if after_match[0] == ' ': + text += ' ' + text += after_match + + return text + + def text_mining_query_search(query_string): # Make case-insensitive and remove starting and trailing spaces query_string = query_string.lower().strip() @@ -90,20 +109,7 @@ def text_mining_query_search(query_string): try: # Display the matching substring in bold - before_match = line[:find_index_space_before( - similarity.dest_start, line)] - match = line[find_index_space_before( - similarity.dest_start, line):find_index_space_after(similarity.dest_end, line)] - after_match = line[find_index_space_after( - similarity.dest_end, line):] - - line = before_match - if before_match[:-1] == ' ' or match[0] == ' ': - line += ' ' - line += f'{match}' - if after_match[0] == ' ': - line += ' ' - line += after_match + line = display_aligned_substring_in_bold(line, similarity) if similarity.score > 0: try: @@ -141,13 +147,32 @@ def text_mining_query_search(query_string): Entity = addl_sanitize_for_bold(Entity) Type = addl_sanitize_for_bold(Type) - if PMID == '34199720': - print(Sentence) - if Type == 'Gene': if Sentence == 'None': Sentence = Title + # If the Type is the one containing the aligned substring, then no bold substring is displayed to the user + # We resort to parsing the string again, but, this time, excluding the Type + if '' not in Title and '' not in Sentence and '' not in Entity: + title_sim = rapidfuzz.fuzz.partial_ratio_alignment( + query_string, Title.lower(), score_cutoff=SIMILARITY_CUTOFF) + sentence_sim = rapidfuzz.fuzz.partial_ratio_alignment( + query_string, Sentence.lower(), score_cutoff=SIMILARITY_CUTOFF) + entity_sim = rapidfuzz.fuzz.partial_ratio_alignment( + query_string, Entity.lower(), score_cutoff=SIMILARITY_CUTOFF) + + max_sim = max(title_sim.score, + sentence_sim.score, entity_sim.score) + if max_sim == title_sim.score: + Title = display_aligned_substring_in_bold( + Title, title_sim) + elif max_sim == sentence_sim.score: + Sentence = display_aligned_substring_in_bold( + Sentence, sentence_sim) + else: + Entity = display_aligned_substring_in_bold( + Entity, entity_sim) + pubmed_matches.add(PMID) df.loc[len(df.index)] = [Entity, PMID, Title, Sentence, similarity.score]