diff --git a/callbacks/text_mining/util.py b/callbacks/text_mining/util.py index 96276911..6707fdfb 100644 --- a/callbacks/text_mining/util.py +++ b/callbacks/text_mining/util.py @@ -4,10 +4,13 @@ from ..links_util import * import regex as re import ftfy +import rapidfuzz + from ..file_util import * COLNAMES = ['Gene', 'PMID', 'Title', 'Sentence', 'Score'] +SIMILARITY_CUTOFF = 70 def sanitize_text(text): @@ -43,7 +46,30 @@ def addl_sanitize_gene(text): return text +def addl_sanitize_for_bold(text): + text = re.sub(r'', text) + + return text + + +def find_index_space_before(index, text): + while index > 0 and text[index] != ' ': + index -= 1 + + return index + + +def find_index_space_after(index, text): + while index < len(text) - 1 and text[index] != ' ': + index += 1 + + return index + + def text_mining_query_search(query_string): + # Make case-insensitive and remove starting and trailing spaces + query_string = query_string.lower().strip() + text_mining_path = get_path_to_text_mining_temp(Constants.TEMP_TEXT_MINING) make_dir(text_mining_path) @@ -52,31 +78,57 @@ def text_mining_query_search(query_string): if path_exists(text_mining_path): return pd.read_csv(text_mining_path) - query_string = query_string.strip() df = pd.DataFrame(columns=COLNAMES) - query_regex = re.compile(re.escape(query_string), re.IGNORECASE) + with open(Constants.TEXT_MINING_ANNOTATED_ABSTRACTS, 'r', encoding='utf8') as f: for line in f: - if re.search(query_regex, line): - PMID, Title, Sentence, IsInTitle, Entity, Annotations, Type, start_pos, end_pos, score = map(sanitize_text, line.split( - '\t')) - Entity = addl_sanitize_gene(Entity) - Title = Title[:-1] - - if Type == 'Gene': - if Sentence == 'None': - Sentence = Title - df.loc[len(df.index)] = [ - Entity, PMID, Title, Sentence, score] + similarity = rapidfuzz.fuzz.partial_ratio_alignment( + query_string, line.lower(), score_cutoff=SIMILARITY_CUTOFF) + + try: + # Display the matching substring in bold + before_match = line[:find_index_space_before( + similarity.dest_start, line)] + match = line[find_index_space_before( + similarity.dest_start, line):find_index_space_after(similarity.dest_end, line)] + after_match = line[find_index_space_after( + similarity.dest_end, line):] + + line = before_match + if before_match[:-1] == ' ' or match[0] == ' ': + line += ' ' + line += f'{match}' + if after_match[0] == ' ': + line += ' ' + line += after_match + + if similarity.score > 0: + PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map( + sanitize_text, line.split('\t')) + Entity = addl_sanitize_gene(Entity) + Title = Title[:-1] + + PMID = addl_sanitize_for_bold(PMID) + Title = addl_sanitize_for_bold(Title) + Sentence = addl_sanitize_for_bold(Sentence) + Entity = addl_sanitize_for_bold(Entity) + Type = addl_sanitize_for_bold(Type) + + if Type == 'Gene': + if Sentence == 'None': + Sentence = Title + df.loc[len(df.index)] = [Entity, PMID, + Title, Sentence, similarity.score] + except: + pass df['PMID'] = get_pubmed_link(df, 'PMID') - df = df.sort_values('Score', ascending=False) - - display_cols_in_fixed_dec_places(df, ['Score']) + df = df.drop(columns=['Score']) if len(df.index) == 0: df = create_empty_df_with_cols(COLNAMES) + df = df.drop(columns=['Score']) df.to_csv(f'{text_mining_path}', index=False)