Skip to content

Commit

Permalink
Adding pos changes
Browse files Browse the repository at this point in the history
  • Loading branch information
rajashekar committed Oct 11, 2023
1 parent 9d405dd commit 0a1ecba
Show file tree
Hide file tree
Showing 8 changed files with 58 additions and 30 deletions.
19 changes: 14 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,23 @@ The general API is as follows:
df: DataFrame containing the names to parse (must have a column named 'name')

# example
df = pd.DataFrame({'name': ['Jan Petersen', 'Piet', 'Janssen']})
df = pd.DataFrame({'name': ['Jan', 'Nicholas Turner', 'Petersen', 'Nichols Richard', 'Piet',
'John Smith', 'Janssen', 'Kim Yeon']})
df = ParseNames.parse(df)
print(df.to_markdown())

::

name parsed_name
0 Jan Petersen [{'name': 'Jan', 'type': 'first', 'prob': 0.6769440174102783}, {'name': 'Petersen', 'type': 'last', 'prob': 0.5342262387275696}]
1 Piet [{'name': 'Piet', 'type': 'first', 'prob': 0.5381495952606201}]
2 Janssen [{'name': 'Janssen', 'type': 'first', 'prob': 0.5929554104804993}]
| | name | parsed_name |
|---:|:----------------|:------------------------------------------------------------------------------|
| 0 | Jan | {'name': 'Jan', 'type': 'first', 'prob': 0.6769440174102783} |
| 1 | Nicholas Turner | {'name': 'Nicholas Turner', 'type': 'first_last', 'prob': 0.9990382194519043} |
| 2 | Petersen | {'name': 'Petersen', 'type': 'last', 'prob': 0.5342262387275696} |
| 3 | Nichols Richard | {'name': 'Nichols Richard', 'type': 'last_first', 'prob': 0.9998832941055298} |
| 4 | Piet | {'name': 'Piet', 'type': 'first', 'prob': 0.5381495952606201} |
| 5 | John Smith | {'name': 'John Smith', 'type': 'first_last', 'prob': 0.9975730776786804} |
| 6 | Janssen | {'name': 'Janssen', 'type': 'first', 'prob': 0.5929554104804993} |
| 7 | Kim Yeon | {'name': 'Kim Yeon', 'type': 'last_first', 'prob': 0.9987115859985352} |


Data
Expand Down
11 changes: 6 additions & 5 deletions gradio_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,22 @@
pd.set_option('display.max_colwidth', None)
from parsernaam.parse import ParseNames


def parse_names(names):
given_names = names.split(",")
df = pd.DataFrame({'name': given_names})
df = ParseNames.parse(df)
print(df)
output = ""
for parsed_name in df['parsed_name']:
for name_dict in parsed_name:
name = name_dict['name']
name_type = name_dict['type']
prob = name_dict['prob']
output += f"{name} (type: {name_type}, score: {prob:.2f})\n"
name = parsed_name['name']
name_type = parsed_name['type']
prob = parsed_name['prob']
output += f"{name} (type: {name_type}, score: {prob:.2f})\n"
output += "\n"
return output


iface = gr.Interface(
fn=parse_names,
inputs=gr.components.Textbox(lines=10, label="Names"),
Expand Down
1 change: 1 addition & 0 deletions parsernaam/model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import torch
import torch.nn as nn


class LSTM(nn.Module):
def __init__(self, input_size, hidden_size, output_size, num_layers=1):
super(LSTM, self).__init__()
Expand Down
Binary file added parsernaam/models/parsernaam_pos.pt
Binary file not shown.
25 changes: 19 additions & 6 deletions parsernaam/naam.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,29 @@

from .model import LSTM


class Parsernaam:
"""
Parse names
"""


@staticmethod
def parse(df: pd.DataFrame, model_fn: str, vocab_fn: str) -> pd.DataFrame:
def parse(df: pd.DataFrame, model_fn: str, model_fn_pos: str, vocab_fn: str) -> pd.DataFrame:
"""
Parse names
"""
MODEL = resource_filename(__name__, model_fn)
MODEL_POS = resource_filename(__name__, model_fn_pos)
VOCAB = resource_filename(__name__, vocab_fn)

vectorizer = joblib.load(VOCAB)
vocab = list(vectorizer.get_feature_names_out())
n_letters = len(vocab)
all_letters = ''.join(vocab)
oob = n_letters + 1

all_categories_pos = ['last_first', 'first_last']
all_categories = ['last', 'first']
n_categories = len(all_categories)

Expand All @@ -40,6 +44,10 @@ def parse(df: pd.DataFrame, model_fn: str, vocab_fn: str) -> pd.DataFrame:
model.load_state_dict(torch.load(MODEL, map_location=device))
model.to(device)

model_pos = LSTM(vocab_size, n_hidden, len(all_categories_pos), num_layers=2)
model_pos.load_state_dict(torch.load(MODEL_POS, map_location=device))
model_pos.to(device)

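# set the single-name model to evaluation mode
# NOTE(review): model_pos is not switched to eval mode in the lines shown here — confirm whether model_pos.eval() is also needed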
model.eval()

Expand All @@ -56,16 +64,21 @@ def lineToTensor(line):
return tensor

def name_parser(name):
name_types = []
names = name.split()
for n in names:
name_tokens = lineToTensor(n)
name_tokens = lineToTensor(name)
# a single-token name is classified as 'first' or 'last' by the base model;
# a multi-token name is classified by word order ('first_last' or 'last_first') by the positional model
if len(names) == 1:
out = model(name_tokens.unsqueeze(0).to(device))
probs = torch.exp(out)
out = torch.argmax(probs)
name_type = all_categories[out.item()]
name_types.append({'name': n, 'type': name_type, 'prob': probs[0][out].item()})
return name_types
else:
out = model_pos(name_tokens.unsqueeze(0).to(device))
probs = torch.exp(out)
out = torch.argmax(probs)
name_type = all_categories_pos[out.item()]
return {'name': name, 'type': name_type, 'prob': probs[0][out].item()}

df['parsed_name'] = df['name'].apply(name_parser)
return df
7 changes: 5 additions & 2 deletions parsernaam/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
from .naam import Parsernaam
from .utils import get_args


class ParseNames(Parsernaam):
"""
Parse names
"""

MODEL_FN = "models/parsernaam.pt"
MODEL_POS_FN = "models/parsernaam_pos.pt"
VOCAB_FN = "models/parsernaam.joblib"

@classmethod
Expand All @@ -25,12 +27,13 @@ def parse(cls, df: pd.DataFrame) -> pd.DataFrame:
Returns:
DataFrame with parsed names
"""
return super().parse(df, cls.MODEL_FN, cls.VOCAB_FN)
"""
return super().parse(df, cls.MODEL_FN , cls.MODEL_POS_FN, cls.VOCAB_FN)


parse_names = ParseNames.parse


def main() -> None:
"""
Main method to parse names
Expand Down
23 changes: 12 additions & 11 deletions parsernaam/tests/test_010_name_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from parsernaam.parse import ParseNames


class TestParseNames(unittest.TestCase):
"""
TestParseNames
Expand All @@ -19,26 +20,26 @@ def setUp(self) -> None:
"""
Set up
"""
self.df = pd.DataFrame({'name': ['Jan Petersen', 'Piet', 'Janssen']})
self.expected = ["first", "last", "first", "first"]
self.df = pd.DataFrame({'name': ['Jan', 'Nicholas Turner', 'Petersen', 'Nichols Richard', 'Piet',
'John Smith', 'Janssen', 'Kim Yeon']})
self.expected = ["first", "first_last", "last", "last_first", "first", "first_last", "first", "last_first"]

def tearDown(self) -> None:
return super().tearDown()

def test_parse(self) -> None:
"""
Test parse
Test parse pos
"""
df = ParseNames.parse(self.df)
print(df.to_markdown())
for parsed_name in df['parsed_name']:
for name_dict in parsed_name:
name_type = name_dict['type']
prob = name_dict['prob']
expected_type = self.expected.pop(0)
self.assertEqual(name_type, expected_type)
self.assertGreater(prob, 0.5)

name_type = parsed_name['type']
prob = parsed_name['prob']
expected_type = self.expected.pop(0)
self.assertEqual(name_type, expected_type)
self.assertGreater(prob, 0.5)


if __name__ == '__main__':
unittest.main()
unittest.main()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def run_tests(self):

setup(
name="parsernaam",
version="0.0.3",
version="0.0.4",
description=("Name parser"),
long_description=long_description,
# The project's main homepage.
Expand Down

0 comments on commit 0a1ecba

Please sign in to comment.