Skip to content

Commit

Permalink
Handle malformed lines in CSV file
Browse files: browse the repository at this point in the history
  • Loading branch information
memgonzales committed Sep 2, 2023
1 parent 3b5f80f commit fc2066d
Showing 1 changed file with 22 additions and 3 deletions.
25 changes: 22 additions & 3 deletions callbacks/text_mining/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


COLNAMES = ['Gene', 'PMID', 'Title', 'Sentence', 'Score']
SIMILARITY_CUTOFF = 70
SIMILARITY_CUTOFF = 75


def sanitize_text(text):
Expand Down Expand Up @@ -103,8 +103,27 @@ def text_mining_query_search(query_string):
line += after_match

if similarity.score > 0:
PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map(
sanitize_text, line.split('\t'))
try:
PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map(
sanitize_text, line.split('\t'))
except Exception as e:
while True:
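# Sometimes the abstract contains a newline, which splits a single record across
# several physical lines of the tab-separated file; keep appending the following
# line(s) until the record unpacks into the expected fields or the file ends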
prev_line = line
try:
next_line = next(f)
line = prev_line.strip() + ' ' + next_line.strip()
except StopIteration:
break

try:
PMID, Title, Sentence, _, Entity, _, Type, _, _, _ = map(
sanitize_text, line.split('\t'))
break
except:
pass

Entity = addl_sanitize_gene(Entity)
Title = Title[:-1]

Expand Down

0 comments on commit fc2066d

Please sign in to comment.