Merge pull request #6 from harmonydata/parserfixesdec2023

Parserfixesdec2023
harmonydata · Dec 11, 2023 · 5664bde
2 parents 259090e + 7b576b9
commit 5664bde
Show file tree

Hide file tree

Showing 5 changed files with 84 additions and 19 deletions.
diff --git a/src/harmony/matching/matcher.py b/src/harmony/matching/matcher.py
@@ -29,11 +29,12 @@
 from typing import List, Callable
 
 import numpy as np
+from numpy import dot, mat, matmul, ndarray
+from numpy.linalg import norm
+
 from harmony.matching.negator import negate
 from harmony.schemas.requests.text import Instrument
 from harmony.schemas.text_vector import TextVector
-from numpy import dot, mat, matmul, ndarray
-from numpy.linalg import norm
 
 
 def cosine_similarity(vec1: ndarray, vec2: ndarray) -> ndarray:
@@ -45,13 +46,13 @@ def cosine_similarity(vec1: ndarray, vec2: ndarray) -> ndarray:
 
 
 def match_instruments_with_function(
-    instruments: List[Instrument],
-    query: str,
-    vectorisation_function: Callable,
-    mhc_questions: List = [],
-    mhc_all_metadatas: List = [],
-    mhc_embeddings: np.ndarray = np.zeros((0, 0)),
-    texts_cached_vectors: dict[str, List[float]] = {},
+        instruments: List[Instrument],
+        query: str,
+        vectorisation_function: Callable,
+        mhc_questions: List = [],
+        mhc_all_metadatas: List = [],
+        mhc_embeddings: np.ndarray = np.zeros((0, 0)),
+        texts_cached_vectors: dict[str, List[float]] = {},
 ) -> tuple:
     """
     Match instruments
@@ -70,6 +71,9 @@ def match_instruments_with_function(
     text_vectors: List[TextVector] = []
     for instrument in instruments:
         for question in instrument.questions:
+            if question.question_text is None or question.question_text.strip() == "":
+                continue  # skip empty questions
+
             question.instrument_id = instrument.instrument_id
             all_questions.append(question)
 

diff --git a/src/harmony/parsing/excel_parser.py b/src/harmony/parsing/excel_parser.py
@@ -25,8 +25,8 @@
 
 '''
 
+import re
 import traceback
-import uuid
 from typing import List
 
 import numpy as np
@@ -37,6 +37,8 @@
 from harmony.schemas.requests.text import Question
 from harmony.schemas.requests.text import RawFile, Instrument
 
+re_header_column = re.compile(r'(?i)(?:question|text|pergunta)')
+
 
 def clean_option_no(option_could_be_int):
     if option_could_be_int is None \
@@ -89,9 +91,9 @@ def convert_excel_to_instruments(file: RawFile) -> List[Instrument]:
         rows_to_delete = []
         for i in range(len(df_questions)):
             if df_questions.question.iloc[i] is None or type(df_questions.question.iloc[i]) is not str or \
-                    df_questions.question.iloc[i].lower() in ["question", "text",
-                                                              "pergunta", "texto"]:
+                    re_header_column.match(df_questions.question.iloc[i]):
                 rows_to_delete.append(i)
+                break
 
         if len(rows_to_delete) > 0:
             df_questions.drop(rows_to_delete, inplace=True)

diff --git a/src/harmony/parsing/text_parser.py b/src/harmony/parsing/text_parser.py
@@ -27,19 +27,24 @@
 
 import re
 import traceback
+from io import StringIO
 from typing import List
 
+import pandas as pd
 from langdetect import detect
 
 from harmony.parsing.text_extraction.ensemble_named_entity_recogniser import extract_questions
 from harmony.schemas.enums.file_types import FileType
 from harmony.schemas.requests.text import RawFile, Instrument, Question
 
+re_question_text_column = re.compile(r'(?i)(?:question|text|pergunta)')
+re_number_column = re.compile(r'(?i)(?:number|\bno)')
+
 
 def convert_text_to_instruments(file: RawFile) -> List[Instrument]:
-    if file.file_type == FileType.txt:
+    if file.file_type == FileType.txt or file.file_type == FileType.csv:  # txt and csv are plain text, not binary
         page_text = file.content
-    else:
+    else:  # any binary format
         page_text = file.text_content
 
     if file.file_id is None:
@@ -53,14 +58,67 @@ def convert_text_to_instruments(file: RawFile) -> List[Instrument]:
         traceback.print_exc()
         traceback.print_stack()
 
-    # TODO: replace this with smarter logic
-    if file.file_type == FileType.txt:
+    csv_sep = None
+    if file.file_type == FileType.csv:
+        first_line, _ = page_text.split("\n", 1)
+        if "\t" in first_line:
+            csv_sep = "\t"
+        elif "," in first_line:
+            csv_sep = ","
+
+        string_io = StringIO(page_text)
+        df = pd.read_csv(string_io, sep=csv_sep)
+        df.fillna("", inplace=True)
+
+        # Default to the column with the greatest total text length; a header-name match below takes precedence
+        col_lengths = {}
+        for col in df.columns:
+            col_lengths[col] = df[col].apply(lambda x: len(x) if type(x) is str else 0).sum()
+        question_column = max(col_lengths, key=col_lengths.get)
+
+        for col in df.columns:
+            if re_question_text_column.match(col) and not re_number_column.findall(col):
+                question_column = col
+                break
+        options_column = None
+        for col in df.columns:
+            if "options" in col.lower():
+                options_column = col
+                break
+        numbers_column = None
+        if question_column != df.columns[0]:
+            numbers_column = df.columns[0]
+
+        questions = []
+        for idx in range(len(df)):
+            if numbers_column is not None:
+                question_no = str(df[numbers_column].iloc[idx])
+            else:
+                question_no = "Q" + str(len(questions) + 1).zfill(3)
+
+            question_text = df[question_column].iloc[idx].strip()
+            if options_column is not None:
+                options = df[options_column].iloc[idx].split("/")
+            else:
+                options = []
+            if question_text == "":
+                continue
+            question = Question(question_no=question_no, question_intro="", question_text=question_text,
+                                options=options)
+            questions.append(question)
+
+    if file.file_type == FileType.txt or (file.file_type == FileType.csv and csv_sep is None):
+        # Either txt file, or CSV file where no separating character was found in the first line
         questions = []
         for line in page_text.split("\n"):
             if line.strip() == "":
                 continue
             line = re.sub(r'\s+', ' ', line)
-            question = Question(question_no=len(questions) + 1, question_intro="", question_text=line.strip(),
+            question_no = "Q" + str(len(questions) + 1).zfill(3)
+            question_text = line.strip()
+            if question_text == "":
+                continue
+            question = Question(question_no=question_no, question_intro="", question_text=question_text,
                                 options=[])
             questions.append(question)
     else:

diff --git a/src/harmony/parsing/wrapper_all_parsers.py b/src/harmony/parsing/wrapper_all_parsers.py
@@ -37,7 +37,7 @@
 def _get_instruments_from_file(file):
     if file.file_type == FileType.pdf or file.file_type == FileType.docx:
         instruments_from_this_file = convert_pdf_to_instruments(file)
-    elif file.file_type == FileType.txt:
+    elif file.file_type == FileType.txt or file.file_type == FileType.csv:
         instruments_from_this_file = convert_text_to_instruments(file)
     elif file.file_type == FileType.xlsx:
         instruments_from_this_file = convert_excel_to_instruments(file)

diff --git a/src/harmony/schemas/enums/file_types.py b/src/harmony/schemas/enums/file_types.py
@@ -32,4 +32,5 @@ class FileType(str, Enum):
     pdf: str = 'pdf'
     xlsx: str = 'xlsx'
     txt: str = 'txt'
-    docx: str = 'docx'
+    csv: str = 'csv'
+    docx: str = 'docx'