Skip to content

Commit

Permalink
Allow fuzzy matching for text mining results
Browse files Browse the repository at this point in the history
  • Loading branch information
memgonzales committed Sep 2, 2023
1 parent 8a26042 commit 5de6852
Showing 1 changed file with 68 additions and 16 deletions.
84 changes: 68 additions & 16 deletions callbacks/text_mining/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
from ..links_util import *
import regex as re
import ftfy
import rapidfuzz

from ..file_util import *


# Column labels of the results DataFrame built by text_mining_query_search
# (also used to create the empty placeholder frame when nothing matches).
COLNAMES = ['Gene', 'PMID', 'Title', 'Sentence', 'Score']
# Passed as score_cutoff to rapidfuzz.fuzz.partial_ratio_alignment when the
# query is fuzzily matched against each line of the annotated abstracts.
SIMILARITY_CUTOFF = 70


def sanitize_text(text):
Expand Down Expand Up @@ -43,7 +46,30 @@ def addl_sanitize_gene(text):
return text


def addl_sanitize_for_bold(text):
    """Repair a closing bold tag that was cut off at the end of *text*.

    If *text* ends with ``</b`` (the final ``>`` is missing), the ``>`` is
    appended so the tag becomes ``</b>``. Any other text is returned
    unchanged, including text that already ends with a complete ``</b>`` or
    that contains ``</b`` somewhere other than the end.

    Args:
        text: Field value that may carry injected ``<b>...</b>`` markup.

    Returns:
        The text with a trailing ``</b`` completed to ``</b>``.
    """
    # `$` anchors at the end of the string (or just before a final newline),
    # so only a truncated tag in the last position is touched.
    return re.sub(r'</b$', '</b>', text)


def find_index_space_before(index, text):
    """Return the position of the nearest space at or before *index*.

    The search walks left from *index*. Position 0 is always accepted as a
    stopping point, whether or not it holds a space, so the result is 0 when
    no space is found.

    Args:
        index: Position in *text* to start searching from.
        text: String to search.

    Returns:
        The largest position ``p`` with ``1 <= p <= index`` such that
        ``text[p] == ' '``; 0 if there is none. A non-positive *index* is
        returned unchanged.
    """
    if index <= 0:
        return index

    # str.rfind clamps its end bound to len(text), so an index at or beyond
    # the end of the text (e.g. an exclusive end offset) is searched from the
    # last character instead of raising IndexError. This matches how
    # find_index_space_after already tolerates such an index.
    space_pos = text.rfind(' ', 1, index + 1)

    # rfind returns -1 when no space exists in [1, index]; fall back to 0.
    return max(space_pos, 0)


def find_index_space_after(index, text):
    """Return the position of the nearest space at or after *index*.

    The search walks right from *index* but never past the last character.
    If no space is found before the last position, the index of the last
    character is returned (not ``len(text)``), whether or not that character
    is a space.

    Args:
        index: Position in *text* to start searching from.
        text: String to search.

    Returns:
        The smallest position ``p`` with ``index <= p < len(text) - 1`` such
        that ``text[p] == ' '``; otherwise ``len(text) - 1``. An *index*
        already at or beyond the last position is returned unchanged.
    """
    last_pos = len(text) - 1

    for pos in range(index, last_pos):
        if text[pos] == ' ':
            return pos

    # No space before the last position. The scan stops on the last
    # character, unless the starting index was already at or past it.
    return max(index, last_pos)


def text_mining_query_search(query_string):
# Make case-insensitive and remove starting and trailing spaces
query_string = query_string.lower().strip()

text_mining_path = get_path_to_text_mining_temp(Constants.TEMP_TEXT_MINING)
make_dir(text_mining_path)

Expand All @@ -52,31 +78,57 @@ def text_mining_query_search(query_string):
if path_exists(text_mining_path):
return pd.read_csv(text_mining_path)

query_string = query_string.strip()
df = pd.DataFrame(columns=COLNAMES)
query_regex = re.compile(re.escape(query_string), re.IGNORECASE)

with open(Constants.TEXT_MINING_ANNOTATED_ABSTRACTS, 'r', encoding='utf8') as f:
for line in f:
if re.search(query_regex, line):
PMID, Title, Sentence, IsInTitle, Entity, Annotations, Type, start_pos, end_pos, score = map(sanitize_text, line.split(
'\t'))
Entity = addl_sanitize_gene(Entity)
Title = Title[:-1]

if Type == 'Gene':
if Sentence == 'None':
Sentence = Title
df.loc[len(df.index)] = [
Entity, PMID, Title, Sentence, score]
similarity = rapidfuzz.fuzz.partial_ratio_alignment(
query_string, line.lower(), score_cutoff=SIMILARITY_CUTOFF)

try:
# Display the matching substring in bold
before_match = line[:find_index_space_before(
similarity.dest_start, line)]
match = line[find_index_space_before(
similarity.dest_start, line):find_index_space_after(similarity.dest_end, line)]
after_match = line[find_index_space_after(
similarity.dest_end, line):]

line = before_match
if before_match[:-1] == ' ' or match[0] == ' ':
line += ' '
line += f'<b>{match}</b>'
if after_match[0] == ' ':
line += ' '
line += after_match

if similarity.score > 0:
PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map(
sanitize_text, line.split('\t'))
Entity = addl_sanitize_gene(Entity)
Title = Title[:-1]

PMID = addl_sanitize_for_bold(PMID)
Title = addl_sanitize_for_bold(Title)
Sentence = addl_sanitize_for_bold(Sentence)
Entity = addl_sanitize_for_bold(Entity)
Type = addl_sanitize_for_bold(Type)

if Type == 'Gene':
if Sentence == 'None':
Sentence = Title
df.loc[len(df.index)] = [Entity, PMID,
Title, Sentence, similarity.score]
except:
pass

df['PMID'] = get_pubmed_link(df, 'PMID')

df = df.sort_values('Score', ascending=False)

display_cols_in_fixed_dec_places(df, ['Score'])
df = df.drop(columns=['Score'])

if len(df.index) == 0:
df = create_empty_df_with_cols(COLNAMES)
df = df.drop(columns=['Score'])

df.to_csv(f'{text_mining_path}', index=False)

Expand Down

0 comments on commit 5de6852

Please sign in to comment.