diff --git a/README.rst b/README.rst index ecad5b1..7839662 100644 --- a/README.rst +++ b/README.rst @@ -36,14 +36,23 @@ The general API is as follows: df dataframe with Names to parse (with column name 'name') # example - df = pd.DataFrame({'name': ['Jan Petersen', 'Piet', 'Janssen']}) + df = pd.DataFrame({'name': ['Jan', 'Nicholas Turner', 'Petersen', 'Nichols Richard', 'Piet', + 'John Smith', 'Janssen', 'Kim Yeon']}) + df = ParseNames.parse(df) + print(df.to_markdown()) :: - name parsed_name - 0 Jan Petersen [{'name': 'Jan', 'type': 'first', 'prob': 0.6769440174102783}, {'name': 'Petersen', 'type': 'last', 'prob': 0.5342262387275696}] - 1 Piet [{'name': 'Piet', 'type': 'first', 'prob': 0.5381495952606201}] - 2 Janssen [{'name': 'Janssen', 'type': 'first', 'prob': 0.5929554104804993}] + | | name | parsed_name | + |---:|:----------------|:------------------------------------------------------------------------------| + | 0 | Jan | {'name': 'Jan', 'type': 'first', 'prob': 0.6769440174102783} | + | 1 | Nicholas Turner | {'name': 'Nicholas Turner', 'type': 'first_last', 'prob': 0.9990382194519043} | + | 2 | Petersen | {'name': 'Petersen', 'type': 'last', 'prob': 0.5342262387275696} | + | 3 | Nichols Richard | {'name': 'Nichols Richard', 'type': 'last_first', 'prob': 0.9998832941055298} | + | 4 | Piet | {'name': 'Piet', 'type': 'first', 'prob': 0.5381495952606201} | + | 5 | John Smith | {'name': 'John Smith', 'type': 'first_last', 'prob': 0.9975730776786804} | + | 6 | Janssen | {'name': 'Janssen', 'type': 'first', 'prob': 0.5929554104804993} | + | 7 | Kim Yeon | {'name': 'Kim Yeon', 'type': 'last_first', 'prob': 0.9987115859985352} | Data diff --git a/gradio_app.py b/gradio_app.py index 8a21321..8e9a9a0 100644 --- a/gradio_app.py +++ b/gradio_app.py @@ -4,6 +4,7 @@ pd.set_option('display.max_colwidth', None) from parsernaam.parse import ParseNames + def parse_names(names): given_names = names.split(",") df = pd.DataFrame({'name': given_names}) @@ -11,14 +12,14 @@ def parse_names(names): print(df) output = "" for parsed_name in df['parsed_name']: - for name_dict in parsed_name: - name = name_dict['name'] - name_type = name_dict['type'] - prob = name_dict['prob'] - output += f"{name} (type: {name_type}, score: {prob:.2f})\n" + name = parsed_name['name'] + name_type = parsed_name['type'] + prob = parsed_name['prob'] + output += f"{name} (type: {name_type}, score: {prob:.2f})\n" output += "\n" return output + iface = gr.Interface( fn=parse_names, inputs=gr.components.Textbox(lines=10, label="Names"), diff --git a/parsernaam/model.py b/parsernaam/model.py index 68509b0..26a44b6 100644 --- a/parsernaam/model.py +++ b/parsernaam/model.py @@ -1,6 +1,7 @@ import torch import torch.nn as nn + class LSTM(nn.Module): def __init__(self, input_size, hidden_size, output_size, num_layers=1): super(LSTM, self).__init__() diff --git a/parsernaam/models/parsernaam_pos.pt b/parsernaam/models/parsernaam_pos.pt new file mode 100644 index 0000000..389f311 Binary files /dev/null and b/parsernaam/models/parsernaam_pos.pt differ diff --git a/parsernaam/naam.py b/parsernaam/naam.py index 0f0ec2a..73e169f 100644 --- a/parsernaam/naam.py +++ b/parsernaam/naam.py @@ -8,6 +8,7 @@ from .model import LSTM + class Parsernaam: """ Parse names @@ -15,11 +16,12 @@ class Parsernaam: @staticmethod - def parse(df: pd.DataFrame, model_fn: str, vocab_fn: str) -> pd.DataFrame: + def parse(df: pd.DataFrame, model_fn: str, model_fn_pos: str, vocab_fn: str) -> pd.DataFrame: """ Parse names """ MODEL = resource_filename(__name__, model_fn) + MODEL_POS = resource_filename(__name__, model_fn_pos) VOCAB = resource_filename(__name__, vocab_fn) vectorizer = joblib.load(VOCAB) @@ -27,6 +29,8 @@ def parse(df: pd.DataFrame, model_fn: str, vocab_fn: str) -> pd.DataFrame: n_letters = len(vocab) all_letters = ''.join(vocab) oob = n_letters + 1 + + all_categories_pos = ['last_first', 'first_last'] all_categories = ['last', 'first'] n_categories = len(all_categories) @@ -40,6 +44,10 @@ def parse(df: pd.DataFrame, model_fn: str, vocab_fn: str) -> pd.DataFrame: model.load_state_dict(torch.load(MODEL, map_location=device)) model.to(device) + model_pos = LSTM(vocab_size, n_hidden, len(all_categories_pos), num_layers=2) + model_pos.load_state_dict(torch.load(MODEL_POS, map_location=device)) + model_pos.to(device) + # set the model to evaluation mode model.eval() @@ -56,16 +64,21 @@ def lineToTensor(line): return tensor def name_parser(name): - name_types = [] names = name.split() - for n in names: - name_tokens = lineToTensor(n) + name_tokens = lineToTensor(name) + # if there is only one name, use the non-pos model + # otherwise use the pos model + if len(names) == 1: out = model(name_tokens.unsqueeze(0).to(device)) probs = torch.exp(out) out = torch.argmax(probs) name_type = all_categories[out.item()] - name_types.append({'name': n, 'type': name_type, 'prob': probs[0][out].item()}) - return name_types + else: + out = model_pos(name_tokens.unsqueeze(0).to(device)) + probs = torch.exp(out) + out = torch.argmax(probs) + name_type = all_categories_pos[out.item()] + return {'name': name, 'type': name_type, 'prob': probs[0][out].item()} df['parsed_name'] = df['name'].apply(name_parser) return df diff --git a/parsernaam/parse.py b/parsernaam/parse.py index a0fee82..7383c53 100644 --- a/parsernaam/parse.py +++ b/parsernaam/parse.py @@ -7,12 +7,14 @@ from .naam import Parsernaam from .utils import get_args + class ParseNames(Parsernaam): """ Parse names """ MODEL_FN = "models/parsernaam.pt" + MODEL_POS_FN = "models/parsernaam_pos.pt" VOCAB_FN = "models/parsernaam.joblib" @classmethod @@ -25,12 +27,13 @@ def parse(cls, df: pd.DataFrame) -> pd.DataFrame: Returns: DataFrame with parsed names - """ - return super().parse(df, cls.MODEL_FN, cls.VOCAB_FN) + """ + return super().parse(df, cls.MODEL_FN , cls.MODEL_POS_FN, cls.VOCAB_FN) parse_names = ParseNames.parse + def main() -> None: """ Main method to parse names diff --git a/parsernaam/tests/test_010_name_parser.py b/parsernaam/tests/test_010_name_parser.py index a41ef92..9aaabbe 100644 --- a/parsernaam/tests/test_010_name_parser.py +++ b/parsernaam/tests/test_010_name_parser.py @@ -10,6 +10,7 @@ from parsernaam.parse import ParseNames + class TestParseNames(unittest.TestCase): """ TestParseNames @@ -19,26 +20,26 @@ def setUp(self) -> None: """ Set up """ - self.df = pd.DataFrame({'name': ['Jan Petersen', 'Piet', 'Janssen']}) - self.expected = ["first", "last", "first", "first"] + self.df = pd.DataFrame({'name': ['Jan', 'Nicholas Turner', 'Petersen', 'Nichols Richard', 'Piet', + 'John Smith', 'Janssen', 'Kim Yeon']}) + self.expected = ["first", "first_last", "last", "last_first", "first", "first_last", "first", "last_first"] def tearDown(self) -> None: return super().tearDown() def test_parse(self) -> None: """ - Test parse + Test parse pos """ df = ParseNames.parse(self.df) + print(df.to_markdown()) for parsed_name in df['parsed_name']: - for name_dict in parsed_name: - name_type = name_dict['type'] - prob = name_dict['prob'] - expected_type = self.expected.pop(0) - self.assertEqual(name_type, expected_type) - self.assertGreater(prob, 0.5) - + name_type = parsed_name['type'] + prob = parsed_name['prob'] + expected_type = self.expected.pop(0) + self.assertEqual(name_type, expected_type) + self.assertGreater(prob, 0.5) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/setup.py b/setup.py index 5ca9657..ed16553 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ def run_tests(self): setup( name="parsernaam", - version="0.0.3", + version="0.0.4", description=("Name parser"), long_description=long_description, # The project's main homepage.