Skip to content

Commit

Permalink
Random Forest to address PDF parsing issues. #23 #28 #11 #4 #39
Browse files Browse the repository at this point in the history
  • Loading branch information
woodthom2 committed Jul 19, 2024
1 parent 6608f58 commit a1c4561
Show file tree
Hide file tree
Showing 5 changed files with 183 additions and 153 deletions.
Binary file not shown.
Binary file removed src/harmony/parsing/crf_text_model.pkl
Binary file not shown.
193 changes: 46 additions & 147 deletions src/harmony/parsing/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,32 +24,59 @@
SOFTWARE.
'''

import pathlib
import pickle as pkl
import re

import numpy as np

import harmony
from harmony.parsing.util.feature_extraction import convert_text_to_features
from harmony.parsing.util.tika_wrapper import parse_pdf_to_plain_text
# from harmony.parsing.util.tesseract_wrapper import parse_image_pdf_to_plain_text
# from harmony.parsing.util.camelot_wrapper import parse_pdf_to_tables
from harmony.schemas.requests.text import RawFile, Instrument

re_initial_num = re.compile(r'(^\d+)')
re_initial_num_dot = re.compile(r'(^\d+\.)')
re_word = re.compile(r'(?i)(\b[\w\']+\b)')
re_alpha = re.compile(r'(^[a-zA-Z]+)')
re_bracket = re.compile(r'(?:\(|\))')
import pathlib

model_containing_folder = pathlib.Path(__file__).parent.resolve()

with open(f"{model_containing_folder}/rf_table_model.pkl", "rb") as f:
rf_table_model = pkl.load(f)

with open(f"{model_containing_folder}/crf_text_model.pkl", "rb") as f:
with open(f"{model_containing_folder}/20240719_pdf_question_extraction_sklearn_crf_model.pkl", "rb") as f:
crf_text_model = pkl.load(f)

# Predict method is taken from the training repo. Use the training repo as the master copy of the predict method.
# All training code is in https://github.com/harmonydata/pdf-questionnaire-extraction
def predict(test_text):
    """Extract question strings from plain text using the CRF token tagger.

    The text is tokenised and featurised by ``convert_text_to_features``, the
    module-level ``crf_text_model`` assigns one label per token, and each run
    of labelled tokens is cut out of ``test_text`` as one question.

    Label handling, as implemented here:
      * ``"O"`` - the token is not part of any question.
      * ``"B"`` - the token starts a new question, ending any current one.
      * anything else - the token continues the question in progress, or
        starts one if the previous token was ``"O"``.

    :param test_text: plain text to search for questions.
    :return: list of question strings, whitespace-collapsed and stripped, in
        order of appearance.
    """
    _, span_starts, span_ends, features = convert_text_to_features(test_text)

    model_output = crf_text_model.predict([features])
    token_count = len(features)
    # Only index into the prediction when there is at least one token.
    labels = model_output[0] if token_count else []

    extracted = []
    pos = 0
    while pos < token_count:
        if labels[pos] == "O":
            pos += 1
            continue

        # A question starts at `pos`; absorb the following tokens until the
        # tagger says "O" (outside) or "B" (a new question begins).
        stop = pos + 1
        while stop < token_count and labels[stop] != "O" and labels[stop] != "B":
            stop += 1

        if stop < token_count:
            char_end = span_ends[stop - 1]
        else:
            # The question runs to the last token: take the rest of the text.
            char_end = len(test_text)

        snippet = re.sub(r'\s+', ' ', test_text[span_starts[pos]:char_end])
        extracted.append(snippet.strip())
        pos = stop

    return extracted


def convert_pdf_to_instruments(file: RawFile) -> Instrument:
# file is an object containing these properties:
Expand All @@ -60,136 +87,8 @@ def convert_pdf_to_instruments(file: RawFile) -> Instrument:
if not file.text_content:
file.text_content = parse_pdf_to_plain_text(file.content) # call Tika to convert the PDF to plain text

# TODO: New PDF parsing algorithm should go here, together with return statement.

table_cell_texts = []
page_tables = file.tables
questions_from_tables = []
if len(page_tables) > 0:
for page_table in page_tables:
tables = page_table['tables']
for row in tables:
for item in row:
if len(item.strip()) > 0:
table_cell_texts.append(item)

X = []
for idx in range(len(table_cell_texts)):
t = table_cell_texts[idx]
features = [len(t),
len(re_initial_num.findall(t)),
len(re_initial_num_dot.findall(t))]
X.append(features)

if len(X) > 0:
X = np.asarray(X)

y_pred = rf_table_model.predict(X)

questions_from_tables = []
for idx in range(len(table_cell_texts)):
if y_pred[idx] == 1:
questions_from_tables.append(table_cell_texts[idx])


if True: # text CRF model
questions_from_text = []
X = []

token_texts = []
token_properties = []

text = file.text_content
char_indices_of_newlines = set()
for idx, c in enumerate(text):
if c == "\n":
char_indices_of_newlines.add(idx)

char_indices_of_question_marks = set()
for idx, c in enumerate(text):
if c == "?":
char_indices_of_question_marks.add(idx)

tokens = list(re_word.finditer(text))

last_token_properties = {}

for token in tokens:
is_number = len(re_initial_num.findall(token.group()))
is_number_dot = len(re_initial_num_dot.findall(token.group()))
is_alpha = len(re_alpha.findall(token.group()))

dist_to_newline = token.start()
for c in range(token.start(), 1, -1):
if c in char_indices_of_newlines:
dist_to_newline = token.start() - c
break

dist_to_question_mark = len(text) - token.start()
for c in range(token.start(), len(text)):
if c in char_indices_of_question_marks:
dist_to_question_mark = c - token.start()
break

is_capital = int(token.group()[0] != token.group()[0].lower())

this_token_properties = {"length": len(token.group()), "is_number": is_number,
"is_alpha": is_alpha,
"is_capital": is_capital,
"is_number_dot": is_number_dot,
"dist_to_newline": dist_to_newline, "dist_to_question_mark": dist_to_question_mark,
"char_index": token.start()}

this_token_properties["prev_length"] = last_token_properties.get("length", 0)
this_token_properties["prev_is_alpha"] = last_token_properties.get("is_alpha", 0)
this_token_properties["prev_is_number"] = last_token_properties.get("is_number", 0)
this_token_properties["prev_is_number_dot"] = last_token_properties.get("is_number_dot", 0)
this_token_properties["prev_is_capital"] = last_token_properties.get("is_capital", 0)

this_token_properties["prev_prev_length"] = last_token_properties.get("prev_length", 0)
this_token_properties["prev_prev_is_alpha"] = last_token_properties.get("prev_is_alpha", 0)
this_token_properties["prev_prev_is_number"] = last_token_properties.get("prev_is_number", 0)
this_token_properties["prev_prev_is_number_dot"] = last_token_properties.get("prev_is_number_dot", 0)
this_token_properties["prev_prev_is_capital"] = last_token_properties.get("prev_is_capital", 0)

token_texts.append(token.group())

token_properties.append(this_token_properties)

last_token_properties = this_token_properties

X.append(token_properties)

y_pred = crf_text_model.predict(X)

last_token_category = "O"
for idx in range(len(X[0])):

if y_pred[0][idx] != "O":
if last_token_category == "O" or y_pred[0][idx] == "B":
start_idx = tokens[idx].start()
end_idx = len(text)
for j in range(idx + 1, len(X[0])):
if y_pred[0][j] == "O" or y_pred[0][j] == "B":
end_idx = tokens[j - 1].end()
break

question_text = text[start_idx:end_idx]
question_text = re.sub(r'\s+', ' ', question_text)
question_text = question_text.strip()
questions_from_text.append(question_text)

last_token_category = y_pred[0][idx]
questions_from_text = predict(file.text_content)

if len(questions_from_text) > len(questions_from_tables):
print ("Source of parsing was text CRF")
instrument = harmony.create_instrument_from_list(questions_from_text, instrument_name=file.file_name, file_name=file.file_name)
print(instrument)
return [instrument]
elif len(questions_from_tables) > 0:
instrument = harmony.create_instrument_from_list(questions_from_tables, instrument_name=file.file_name, file_name=file.file_name)
return [instrument]
else:
return []

# return convert_text_to_instruments(file)
instrument = harmony.create_instrument_from_list(questions_from_text, instrument_name=file.file_name,
file_name=file.file_name)
return [instrument]
127 changes: 127 additions & 0 deletions src/harmony/parsing/util/feature_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
'''
MIT License
Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk).
Project: Harmony (https://harmonydata.ac.uk)
Maintainer: Thomas Wood (https://fastdatascience.com)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''

import json
import re

re_word = re.compile(r'(?i)(\S+)')

re_initial_num = re.compile(r'(^\d+)')
re_contains_num = re.compile(r'\d')
re_initial_num_dot = re.compile(r'(^\d+\.)')
re_alpha = re.compile(r'(^[a-zA-Z]+)')
re_bracket = re.compile(r'(?:\(|\))')


def convert_text_to_features(text):
    """Tokenise ``text`` on whitespace and build a feature dict per token.

    Every maximal run of non-whitespace characters is one token. For each
    token the following base features are computed:

      * ``length`` - number of characters in the token.
      * ``is_number`` - 1 if the token starts with a digit, else 0.
      * ``is_alpha`` - 1 if the token starts with an ASCII letter, else 0.
      * ``is_capital`` - 1 if lower-casing the first character changes it.
      * ``is_letters_and_numbers`` - 1 if ``is_alpha`` and the token contains
        at least one digit.
      * ``is_bracket`` - number of ``(`` / ``)`` characters in the token.
      * ``is_number_dot`` - 1 if the token starts with digits followed by ``.``.
      * ``num_nums`` - number of digit characters in the token.
      * ``dist_to_prev_newline`` - characters back from the token start to the
        nearest preceding newline, or the token start index if none is found.
      * ``dist_to_next_question_mark`` - characters forward from the token
        start to the next ``?``, or the distance to the end of the text if
        there is none.
      * ``char_index`` - start offset of the token in ``text``.

    Each dict is then extended with the base features of the previous and next
    token under the keys ``<name>_-1`` and ``<name>_1`` (0 when there is no
    such neighbour).

    :param text: the plain text to featurise.
    :return: a 4-tuple of parallel lists ``(token_texts,
        token_start_char_indices, token_end_char_indices, token_properties)``.
    """
    token_texts = []
    token_start_char_indices = []
    token_end_char_indices = []
    token_properties = []

    text_length = len(text)

    # Tokens are visited in increasing start order, so both position searches
    # can resume where the previous token left off instead of rescanning the
    # text for every token.
    #
    # NOTE: the newline search never looks at indices 0 and 1. That mirrors
    # the original backwards scan, which stopped at index 2, and is kept so
    # the feature values stay exactly the same. Presumably the trained model
    # relies on that - confirm against the training code before changing it.
    newline_search_from = 2
    prev_newline = -1  # latest newline found so far, -1 if none
    next_question_mark = text.find("?")  # -1 once no "?" lies ahead

    for token in re_word.finditer(text):
        token_text = token.group()
        start = token.start()

        is_number = len(re_initial_num.findall(token_text))
        is_number_dot = len(re_initial_num_dot.findall(token_text))
        num_nums = len(re_contains_num.findall(token_text))
        is_alpha = len(re_alpha.findall(token_text))
        is_bracket = len(re_bracket.findall(token_text))

        # Only the stretch of text not yet covered can hold a newer newline.
        found = text.rfind("\n", newline_search_from, start + 1)
        if found != -1:
            prev_newline = found
        newline_search_from = max(newline_search_from, start + 1)
        dist_to_prev_newline = start if prev_newline == -1 else start - prev_newline

        # The cached "?" stays valid until a token starts beyond it.
        if next_question_mark != -1 and next_question_mark < start:
            next_question_mark = text.find("?", start)
        if next_question_mark == -1:
            dist_to_next_question_mark = text_length - start
        else:
            dist_to_next_question_mark = next_question_mark - start

        is_capital = int(token_text[0] != token_text[0].lower())

        is_letters_and_numbers = int(is_alpha and num_nums > 0)

        token_texts.append(token_text)
        token_start_char_indices.append(start)
        token_end_char_indices.append(token.end())
        token_properties.append({"length": len(token_text), "is_number": is_number,
                                 "is_alpha": is_alpha,
                                 "is_capital": is_capital,
                                 "is_letters_and_numbers": is_letters_and_numbers,
                                 "is_bracket": is_bracket,
                                 "is_number_dot": is_number_dot,
                                 "num_nums": num_nums,
                                 "dist_to_prev_newline": dist_to_prev_newline,
                                 "dist_to_next_question_mark": dist_to_next_question_mark,
                                 "char_index": start})

    # Names of the base features, captured before any neighbour keys are added.
    base_property_names = sorted(token_properties[-1]) if token_properties else []
    token_count = len(token_properties)

    for idx, focus_dict in enumerate(token_properties):
        # Generate features including prev and next token.
        # There was no increase in performance associated with increasing this window. (TW 19/07/2024)
        for offset in (-1, 1):
            j = idx + offset
            offset_dict = token_properties[j] if 0 <= j < token_count else {}

            for property_name in base_property_names:
                focus_dict[f"{property_name}_{offset}"] = offset_dict.get(property_name, 0)

    return token_texts, token_start_char_indices, token_end_char_indices, token_properties


if __name__ == "__main__":
    # Manual smoke run: featurise a short sample and dump each of the four
    # returned lists (token texts, start offsets, end offsets, feature dicts).
    test_text = "this is a test123 a)"
    extraction = convert_text_to_features(test_text)
    for positional_output in extraction[:3]:
        print(positional_output)
    print(json.dumps(extraction[3], indent=4))
16 changes: 10 additions & 6 deletions tests/test_pdf_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,16 @@

class TestConvertPdfTables(unittest.TestCase):

def test_empty_pdf(self):

self.assertEqual(0, len(convert_pdf_to_instruments(pdf_empty_table)))

def test_two_questions(self):
self.assertEqual(2, len(convert_pdf_to_instruments(pdf_non_empty_table)[0].questions))
pass

# Not using tables at the moment
#
# def test_empty_pdf(self):
#
# self.assertEqual(0, len(convert_pdf_to_instruments(pdf_empty_table)))
#
# def test_two_questions(self):
# self.assertEqual(2, len(convert_pdf_to_instruments(pdf_non_empty_table)[0].questions))


if __name__ == '__main__':
Expand Down

0 comments on commit a1c4561

Please sign in to comment.