Skip to content

Commit

Permalink
Merge pull request #71 from evelynnchen-cmu/remove-digits
Browse files Browse the repository at this point in the history
#23: Remove digits from the start and end of questions
  • Loading branch information
woodthom2 authored Dec 6, 2024
2 parents fb0b065 + f28f9ab commit 2f85c05
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 1 deletion.
13 changes: 12 additions & 1 deletion src/harmony/parsing/text_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,16 @@
re_number_column = re.compile(r'(?i)(?:number|\bno)')


def remove_numbers(question_text):
    """Strip a numeric label from the start and from the end of a question.

    A label is a run of digits, optionally preceded by whitespace and/or
    opening parentheses and optionally followed by any mix of whitespace,
    full stops, closing parentheses and hyphens, e.g. ``1``, ``1.``, ``1)``,
    ``(1)`` or ``1 -``.  Digits in the middle of the text are left alone.

    :param question_text: the raw question string.
    :return: the text with the edge labels removed and surrounding
        whitespace stripped; this may be an empty string.
    """
    # NOTE(review): the leading pattern consumes only the first digit run and
    # its trailing separators, so "3.5 hours" comes back as "5 hours".
    edge_number_patterns = (
        r'^[\s\(]*\d+[\s\.\)\-]*',  # label anchored at the start of the text
        r'[\s\(]*\d+[\s\.\)\-]*$',  # label anchored at the end of the text
    )

    # Order matters: the leading label is removed before the trailing one.
    stripped = question_text
    for pattern in edge_number_patterns:
        stripped = re.sub(pattern, '', stripped)

    return stripped.strip()


def convert_text_to_instruments(file: RawFile) -> List[Instrument]:
if file.file_type == FileType.txt or file.file_type == FileType.csv: # text files not binary
page_text = file.content
Expand Down Expand Up @@ -96,6 +106,7 @@ def convert_text_to_instruments(file: RawFile) -> List[Instrument]:
question_no = "Q" + str(len(questions) + 1).zfill(3)

question_text = df[question_column].iloc[idx].strip()
question_text = remove_numbers(question_text)
if options_column is not None:
options = df[options_column].iloc[idx].split("/")
else:
Expand All @@ -114,7 +125,7 @@ def convert_text_to_instruments(file: RawFile) -> List[Instrument]:
continue
line = re.sub(r'\s+', ' ', line)
question_no = "Q" + str(len(questions) + 1).zfill(3)
question_text = line.strip()
question_text = remove_numbers(line.strip())
if question_text == "":
continue
question = Question(question_no=question_no, question_intro="", question_text=question_text,
Expand Down
86 changes: 86 additions & 0 deletions tests/test_convert_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

import sys
import unittest
from harmony.parsing.text_parser import convert_text_to_instruments
from harmony.schemas.requests.text import RawFile, FileType

sys.path.append("../src")

Expand All @@ -42,6 +44,54 @@
}
)

# Fixtures for the digit-removal tests. Each one is a two-line text payload
# declared with file_type "csv", one question per line.

# Each line starts with a bare number followed by a space.
leading_digits_csv = RawFile.model_validate({
"file_id": "b89800ob990a",
"file_name": "leading.csv",
"file_type": "csv",
"content": """1 I feel nervous
2 I feel afraid"""
})

# Each line ends with a bare number preceded by a space.
trailing_digits_csv = RawFile.model_validate({
"file_id": "obas2333of",
"file_name": "trailing.csv",
"file_type": "csv",
"content": """I feel sad 2
I feel hopeless 2"""
})

# Each line starts with a number wrapped in parentheses, e.g. "(1)".
parentheses_digits_csv = RawFile.model_validate({
"file_id": "parentheses_digits_csv",
"file_name": "parentheses.csv",
"file_type": "csv",
"content": """(1) I feel tired
(2) I feel weak"""
})

# Each line starts with a number followed by a full stop, e.g. "1.".
period_digits_csv = RawFile.model_validate({
"file_id": "period_digits_csv",
"file_name": "period.csv",
"file_type": "csv",
"content": """1. I feel angry
2. I feel upset"""
})

# The two lines use different label styles: "1)" and "(2)".
mixed_format_digits_csv = RawFile.model_validate({
"file_id": "mixed_format_digits_csv",
"file_name": "mixed.csv",
"file_type": "csv",
"content": """1) How do you feel
(2) Are you okay"""
})

# Each line carries a numeric label at both the start and the end.
both_ends_digits_csv = RawFile.model_validate({
"file_id": "both_ends_digits_csv",
"file_name": "bothends.csv",
"file_type": "csv",
"content": """1. How are you today (2)
(1) Are you feeling better 2."""
})


class TestConvertTxt(unittest.TestCase):

Expand All @@ -51,6 +101,42 @@ def test_single_instrument(self):
def test_two_questions(self):
    # The first instrument parsed from the fixture must hold exactly two questions.
    parsed_instruments = convert_text_to_instruments(txt_gad_7_2_questions)
    first_instrument = parsed_instruments[0]
    self.assertEqual(2, len(first_instrument.questions))

def test_remove_leading_digits_from_csv(self):
    # Bare numbers at the start of each line must not appear in the question text.
    parsed = convert_text_to_instruments(leading_digits_csv)[0].questions
    expected_texts = ("I feel nervous", "I feel afraid")
    for position, expected in enumerate(expected_texts):
        self.assertEqual(expected, parsed[position].question_text)

def test_remove_trailing_digits_from_csv(self):
    # Bare numbers at the end of each line must not appear in the question text.
    parsed = convert_text_to_instruments(trailing_digits_csv)[0].questions
    expected_texts = ("I feel sad", "I feel hopeless")
    for position, expected in enumerate(expected_texts):
        self.assertEqual(expected, parsed[position].question_text)

def test_remove_parentheses_digits_from_csv(self):
    # Labels of the form "(n)" at the start of each line must be removed.
    parsed = convert_text_to_instruments(parentheses_digits_csv)[0].questions
    expected_texts = ("I feel tired", "I feel weak")
    for position, expected in enumerate(expected_texts):
        self.assertEqual(expected, parsed[position].question_text)

def test_remove_period_digits_from_csv(self):
    # Labels of the form "n." at the start of each line must be removed.
    parsed = convert_text_to_instruments(period_digits_csv)[0].questions
    expected_texts = ("I feel angry", "I feel upset")
    for position, expected in enumerate(expected_texts):
        self.assertEqual(expected, parsed[position].question_text)

def test_remove_mixed_format_digits_from_csv(self):
    # Different label styles ("1)" and "(2)") in one file must both be removed.
    parsed = convert_text_to_instruments(mixed_format_digits_csv)[0].questions
    expected_texts = ("How do you feel", "Are you okay")
    for position, expected in enumerate(expected_texts):
        self.assertEqual(expected, parsed[position].question_text)

def test_remove_both_ends_digits_from_csv(self):
    # Labels at the start and at the end of the same line must both be removed.
    parsed = convert_text_to_instruments(both_ends_digits_csv)[0].questions
    expected_texts = ("How are you today", "Are you feeling better")
    for position, expected in enumerate(expected_texts):
        self.assertEqual(expected, parsed[position].question_text)


if __name__ == '__main__':
unittest.main()

0 comments on commit 2f85c05

Please sign in to comment.