Skip to content

Commit

Permalink
Merge pull request #6 from harmonydata/parserfixesdec2023
Browse files: browse the repository at this point in the history
Parserfixesdec2023
  • Loading branch information
woodthom2 authored Dec 11, 2023
2 parents 259090e + 7b576b9 commit 5664bde
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 19 deletions.
22 changes: 13 additions & 9 deletions src/harmony/matching/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,12 @@
from typing import List, Callable

import numpy as np
from numpy import dot, mat, matmul, ndarray
from numpy.linalg import norm

from harmony.matching.negator import negate
from harmony.schemas.requests.text import Instrument
from harmony.schemas.text_vector import TextVector
from numpy import dot, mat, matmul, ndarray
from numpy.linalg import norm


def cosine_similarity(vec1: ndarray, vec2: ndarray) -> ndarray:
Expand All @@ -45,13 +46,13 @@ def cosine_similarity(vec1: ndarray, vec2: ndarray) -> ndarray:


def match_instruments_with_function(
instruments: List[Instrument],
query: str,
vectorisation_function: Callable,
mhc_questions: List = [],
mhc_all_metadatas: List = [],
mhc_embeddings: np.ndarray = np.zeros((0, 0)),
texts_cached_vectors: dict[str, List[float]] = {},
instruments: List[Instrument],
query: str,
vectorisation_function: Callable,
mhc_questions: List = [],
mhc_all_metadatas: List = [],
mhc_embeddings: np.ndarray = np.zeros((0, 0)),
texts_cached_vectors: dict[str, List[float]] = {},
) -> tuple:
"""
Match instruments
Expand All @@ -70,6 +71,9 @@ def match_instruments_with_function(
text_vectors: List[TextVector] = []
for instrument in instruments:
for question in instrument.questions:
if question.question_text is None or question.question_text.strip() == "":
continue # skip empty questions

question.instrument_id = instrument.instrument_id
all_questions.append(question)

Expand Down
8 changes: 5 additions & 3 deletions src/harmony/parsing/excel_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
'''

import re
import traceback
import uuid
from typing import List

import numpy as np
Expand All @@ -37,6 +37,8 @@
from harmony.schemas.requests.text import Question
from harmony.schemas.requests.text import RawFile, Instrument

re_header_column = re.compile(r'(?i)(?:question|text|pergunta)')


def clean_option_no(option_could_be_int):
if option_could_be_int is None \
Expand Down Expand Up @@ -89,9 +91,9 @@ def convert_excel_to_instruments(file: RawFile) -> List[Instrument]:
rows_to_delete = []
for i in range(len(df_questions)):
if df_questions.question.iloc[i] is None or type(df_questions.question.iloc[i]) is not str or \
df_questions.question.iloc[i].lower() in ["question", "text",
"pergunta", "texto"]:
re_header_column.match(df_questions.question.iloc[i]):
rows_to_delete.append(i)
break

if len(rows_to_delete) > 0:
df_questions.drop(rows_to_delete, inplace=True)
Expand Down
68 changes: 63 additions & 5 deletions src/harmony/parsing/text_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,24 @@

import re
import traceback
from io import StringIO
from typing import List

import pandas as pd
from langdetect import detect

from harmony.parsing.text_extraction.ensemble_named_entity_recogniser import extract_questions
from harmony.schemas.enums.file_types import FileType
from harmony.schemas.requests.text import RawFile, Instrument, Question

re_question_text_column = re.compile(r'(?i)(?:question|text|pergunta)')
re_number_column = re.compile(r'(?i)(?:number|\bno)')


def convert_text_to_instruments(file: RawFile) -> List[Instrument]:
if file.file_type == FileType.txt:
if file.file_type == FileType.txt or file.file_type == FileType.csv: # text files not binary
page_text = file.content
else:
else: # any binary format
page_text = file.text_content

if file.file_id is None:
Expand All @@ -53,14 +58,67 @@ def convert_text_to_instruments(file: RawFile) -> List[Instrument]:
traceback.print_exc()
traceback.print_stack()

# TODO: replace this with smarter logic
if file.file_type == FileType.txt:
csv_sep = None
if file.file_type == FileType.csv:
first_line, _ = page_text.split("\n", 1)
if "\t" in first_line:
csv_sep = "\t"
elif "," in first_line:
csv_sep = ","

string_io = StringIO(page_text)
df = pd.read_csv(string_io, sep=csv_sep)
df.fillna("", inplace=True)

# Pick the column with the longest text as the question column
col_lengths = {}
for col in df.columns:
col_lengths[col] = df[col].apply(lambda x: len(x) if type(x) is str else 0).sum()
question_column = max(col_lengths, key=col_lengths.get)

for col in df.columns:
if re_question_text_column.match(col) and not re_number_column.findall(col):
question_column = col
break
options_column = None
for col in df.columns:
if "options" in col.lower():
options_column = col
break
numbers_column = None
if question_column != df.columns[0]:
numbers_column = df.columns[0]

questions = []
for idx in range(len(df)):
if numbers_column is not None:
question_no = str(df[numbers_column].iloc[idx])
else:
question_no = "Q" + str(len(questions) + 1).zfill(3)

question_text = df[question_column].iloc[idx].strip()
if options_column is not None:
options = df[options_column].iloc[idx].split("/")
else:
options = []
if question_text == "":
continue
question = Question(question_no=question_no, question_intro="", question_text=question_text,
options=options)
questions.append(question)

if file.file_type == FileType.txt or (file.file_type == FileType.csv and csv_sep is None):
# Either txt file, or CSV file where no separating character was found in the first line
questions = []
for line in page_text.split("\n"):
if line.strip() == "":
continue
line = re.sub(r'\s+', ' ', line)
question = Question(question_no=len(questions) + 1, question_intro="", question_text=line.strip(),
question_no = "Q" + str(len(questions) + 1).zfill(3)
question_text = line.strip()
if question_text == "":
continue
question = Question(question_no=question_no, question_intro="", question_text=question_text,
options=[])
questions.append(question)
else:
Expand Down
2 changes: 1 addition & 1 deletion src/harmony/parsing/wrapper_all_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
def _get_instruments_from_file(file):
if file.file_type == FileType.pdf or file.file_type == FileType.docx:
instruments_from_this_file = convert_pdf_to_instruments(file)
elif file.file_type == FileType.txt:
elif file.file_type == FileType.txt or file.file_type == FileType.csv:
instruments_from_this_file = convert_text_to_instruments(file)
elif file.file_type == FileType.xlsx:
instruments_from_this_file = convert_excel_to_instruments(file)
Expand Down
3 changes: 2 additions & 1 deletion src/harmony/schemas/enums/file_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@ class FileType(str, Enum):
pdf: str = 'pdf'
xlsx: str = 'xlsx'
txt: str = 'txt'
docx: str = 'docx'
csv: str = 'csv'
docx: str = 'docx'

0 comments on commit 5664bde

Please sign in to comment.