diff --git a/src/harmony/matching/matcher.py b/src/harmony/matching/matcher.py index d6394d6..3e00cf7 100644 --- a/src/harmony/matching/matcher.py +++ b/src/harmony/matching/matcher.py @@ -29,11 +29,12 @@ from typing import List, Callable import numpy as np +from numpy import dot, mat, matmul, ndarray +from numpy.linalg import norm + from harmony.matching.negator import negate from harmony.schemas.requests.text import Instrument from harmony.schemas.text_vector import TextVector -from numpy import dot, mat, matmul, ndarray -from numpy.linalg import norm def cosine_similarity(vec1: ndarray, vec2: ndarray) -> ndarray: @@ -45,13 +46,13 @@ def cosine_similarity(vec1: ndarray, vec2: ndarray) -> ndarray: def match_instruments_with_function( - instruments: List[Instrument], - query: str, - vectorisation_function: Callable, - mhc_questions: List = [], - mhc_all_metadatas: List = [], - mhc_embeddings: np.ndarray = np.zeros((0, 0)), - texts_cached_vectors: dict[str, List[float]] = {}, + instruments: List[Instrument], + query: str, + vectorisation_function: Callable, + mhc_questions: List = [], + mhc_all_metadatas: List = [], + mhc_embeddings: np.ndarray = np.zeros((0, 0)), + texts_cached_vectors: dict[str, List[float]] = {}, ) -> tuple: """ Match instruments @@ -70,6 +71,9 @@ def match_instruments_with_function( text_vectors: List[TextVector] = [] for instrument in instruments: for question in instrument.questions: + if question.question_text is None or question.question_text.strip() == "": + continue # skip empty questions + question.instrument_id = instrument.instrument_id all_questions.append(question) diff --git a/src/harmony/parsing/excel_parser.py b/src/harmony/parsing/excel_parser.py index 7988293..1f851a9 100644 --- a/src/harmony/parsing/excel_parser.py +++ b/src/harmony/parsing/excel_parser.py @@ -25,8 +25,8 @@ ''' +import re import traceback -import uuid from typing import List import numpy as np @@ -37,6 +37,8 @@ from harmony.schemas.requests.text import Question from harmony.schemas.requests.text import RawFile, Instrument +re_header_column = re.compile(r'(?i)(?:question|text|pergunta)') + def clean_option_no(option_could_be_int): if option_could_be_int is None \ @@ -89,9 +91,9 @@ def convert_excel_to_instruments(file: RawFile) -> List[Instrument]: rows_to_delete = [] for i in range(len(df_questions)): if df_questions.question.iloc[i] is None or type(df_questions.question.iloc[i]) is not str or \ - df_questions.question.iloc[i].lower() in ["question", "text", - "pergunta", "texto"]: + re_header_column.match(df_questions.question.iloc[i]): rows_to_delete.append(i) + break if len(rows_to_delete) > 0: df_questions.drop(rows_to_delete, inplace=True) diff --git a/src/harmony/parsing/text_parser.py b/src/harmony/parsing/text_parser.py index 631d48b..9f59c67 100644 --- a/src/harmony/parsing/text_parser.py +++ b/src/harmony/parsing/text_parser.py @@ -27,19 +27,24 @@ import re import traceback +from io import StringIO from typing import List +import pandas as pd from langdetect import detect from harmony.parsing.text_extraction.ensemble_named_entity_recogniser import extract_questions from harmony.schemas.enums.file_types import FileType from harmony.schemas.requests.text import RawFile, Instrument, Question +re_question_text_column = re.compile(r'(?i)(?:question|text|pergunta)') +re_number_column = re.compile(r'(?i)(?:number|\bno)') + def convert_text_to_instruments(file: RawFile) -> List[Instrument]: - if file.file_type == FileType.txt: + if file.file_type == FileType.txt or file.file_type == FileType.csv: # text files not binary page_text = file.content - else: + else: # any binary format page_text = file.text_content if file.file_id is None: @@ -53,14 +58,67 @@ def convert_text_to_instruments(file: RawFile) -> List[Instrument]: traceback.print_exc() traceback.print_stack() - # TODO: replace this with smarter logic - if file.file_type == FileType.txt: + csv_sep = None + if file.file_type == FileType.csv: + first_line, _ = page_text.split("\n", 1) + if "\t" in first_line: + csv_sep = "\t" + elif "," in first_line: + csv_sep = "," + + string_io = StringIO(page_text) + df = pd.read_csv(string_io, sep=csv_sep) + df.fillna("", inplace=True) + + # Pick the column with the longest text as the question column + col_lengths = {} + for col in df.columns: + col_lengths[col] = df[col].apply(lambda x: len(x) if type(x) is str else 0).sum() + question_column = max(col_lengths, key=col_lengths.get) + + for col in df.columns: + if re_question_text_column.match(col) and not re_number_column.findall(col): + question_column = col + break + options_column = None + for col in df.columns: + if "options" in col.lower(): + options_column = col + break + numbers_column = None + if question_column != df.columns[0]: + numbers_column = df.columns[0] + + questions = [] + for idx in range(len(df)): + if numbers_column is not None: + question_no = str(df[numbers_column].iloc[idx]) + else: + question_no = "Q" + str(len(questions) + 1).zfill(3) + + question_text = df[question_column].iloc[idx].strip() + if options_column is not None: + options = df[options_column].iloc[idx].split("/") + else: + options = [] + if question_text == "": + continue + question = Question(question_no=question_no, question_intro="", question_text=question_text, + options=options) + questions.append(question) + + if file.file_type == FileType.txt or (file.file_type == FileType.csv and csv_sep is None): + # Either txt file, or CSV file where no separating character was found in the first line questions = [] for line in page_text.split("\n"): if line.strip() == "": continue line = re.sub(r'\s+', ' ', line) - question = Question(question_no=len(questions) + 1, question_intro="", question_text=line.strip(), + question_no = "Q" + str(len(questions) + 1).zfill(3) + question_text = line.strip() + if question_text == "": + continue + question = Question(question_no=question_no, question_intro="", question_text=question_text, options=[]) questions.append(question) else: diff --git a/src/harmony/parsing/wrapper_all_parsers.py b/src/harmony/parsing/wrapper_all_parsers.py index 0e11724..b691448 100644 --- a/src/harmony/parsing/wrapper_all_parsers.py +++ b/src/harmony/parsing/wrapper_all_parsers.py @@ -37,7 +37,7 @@ def _get_instruments_from_file(file): if file.file_type == FileType.pdf or file.file_type == FileType.docx: instruments_from_this_file = convert_pdf_to_instruments(file) - elif file.file_type == FileType.txt: + elif file.file_type == FileType.txt or file.file_type == FileType.csv: instruments_from_this_file = convert_text_to_instruments(file) elif file.file_type == FileType.xlsx: instruments_from_this_file = convert_excel_to_instruments(file) diff --git a/src/harmony/schemas/enums/file_types.py b/src/harmony/schemas/enums/file_types.py index e7eba9a..e45c294 100644 --- a/src/harmony/schemas/enums/file_types.py +++ b/src/harmony/schemas/enums/file_types.py @@ -32,4 +32,5 @@ class FileType(str, Enum): pdf: str = 'pdf' xlsx: str = 'xlsx' txt: str = 'txt' - docx: str = 'docx' \ No newline at end of file + csv: str = 'csv' + docx: str = 'docx'