Adding pos changes

appeler · Oct 11, 2023 · 0a1ecba · 0a1ecba
1 parent 9d405dd
commit 0a1ecba
Show file tree

Hide file tree

Showing 8 changed files with 58 additions and 30 deletions.
diff --git a/README.rst b/README.rst
@@ -36,14 +36,23 @@ The general API is as follows:
       df                 dataframe with Names to parse (with column name 'name')
 
     # example
-    df = pd.DataFrame({'name': ['Jan Petersen', 'Piet', 'Janssen']})
+    df = pd.DataFrame({'name': ['Jan', 'Nicholas Turner', 'Petersen', 'Nichols Richard', 'Piet',
+                                         'John Smith', 'Janssen', 'Kim Yeon']})
+    df = ParseNames.parse(df)
+    print(df.to_markdown())
 
 ::
 
-               name                                                                                                                       parsed_name
-    0  Jan Petersen  [{'name': 'Jan', 'type': 'first', 'prob': 0.6769440174102783}, {'name': 'Petersen', 'type': 'last', 'prob': 0.5342262387275696}]
-    1          Piet                                                                   [{'name': 'Piet', 'type': 'first', 'prob': 0.5381495952606201}]
-    2       Janssen                                                                [{'name': 'Janssen', 'type': 'first', 'prob': 0.5929554104804993}]
+    |    | name            | parsed_name                                                                   |
+    |---:|:----------------|:------------------------------------------------------------------------------|
+    |  0 | Jan             | {'name': 'Jan', 'type': 'first', 'prob': 0.6769440174102783}                  |
+    |  1 | Nicholas Turner | {'name': 'Nicholas Turner', 'type': 'first_last', 'prob': 0.9990382194519043} |
+    |  2 | Petersen        | {'name': 'Petersen', 'type': 'last', 'prob': 0.5342262387275696}              |
+    |  3 | Nichols Richard | {'name': 'Nichols Richard', 'type': 'last_first', 'prob': 0.9998832941055298} |
+    |  4 | Piet            | {'name': 'Piet', 'type': 'first', 'prob': 0.5381495952606201}                 |
+    |  5 | John Smith      | {'name': 'John Smith', 'type': 'first_last', 'prob': 0.9975730776786804}      |
+    |  6 | Janssen         | {'name': 'Janssen', 'type': 'first', 'prob': 0.5929554104804993}              |
+    |  7 | Kim Yeon        | {'name': 'Kim Yeon', 'type': 'last_first', 'prob': 0.9987115859985352}        |
 
 
 Data

diff --git a/gradio_app.py b/gradio_app.py
@@ -4,21 +4,22 @@
 pd.set_option('display.max_colwidth', None)
 from parsernaam.parse import ParseNames
 
+
 def parse_names(names):
     given_names = names.split(",")
     df = pd.DataFrame({'name': given_names})
     df = ParseNames.parse(df)
     print(df)
     output = ""
     for parsed_name in df['parsed_name']:
-        for name_dict in parsed_name:
-            name = name_dict['name']
-            name_type = name_dict['type']
-            prob = name_dict['prob']
-            output += f"{name} (type: {name_type}, score: {prob:.2f})\n"
+        name = parsed_name['name']
+        name_type = parsed_name['type']
+        prob = parsed_name['prob']
+        output += f"{name} (type: {name_type}, score: {prob:.2f})\n"
         output += "\n"
     return output
 
+
 iface = gr.Interface(
     fn=parse_names,
     inputs=gr.components.Textbox(lines=10, label="Names"),

diff --git a/parsernaam/model.py b/parsernaam/model.py
@@ -1,6 +1,7 @@
 import torch
 import torch.nn as nn
 
+
 class LSTM(nn.Module):
     def __init__(self, input_size, hidden_size, output_size, num_layers=1):
         super(LSTM, self).__init__()

diff --git a/parsernaam/models/parsernaam_pos.pt b/parsernaam/models/parsernaam_pos.pt
diff --git a/parsernaam/naam.py b/parsernaam/naam.py
@@ -8,25 +8,29 @@
 
 from .model import LSTM
 
+
 class Parsernaam:
     """
     Parse names
     """
 
 
     @staticmethod
-    def parse(df: pd.DataFrame, model_fn: str, vocab_fn: str) -> pd.DataFrame:
+    def parse(df: pd.DataFrame, model_fn: str, model_fn_pos: str, vocab_fn: str) -> pd.DataFrame:
         """
         Parse names
         """
         MODEL = resource_filename(__name__, model_fn)
+        MODEL_POS = resource_filename(__name__, model_fn_pos)
         VOCAB = resource_filename(__name__, vocab_fn)
 
         vectorizer = joblib.load(VOCAB)
         vocab = list(vectorizer.get_feature_names_out())
         n_letters = len(vocab)
         all_letters = ''.join(vocab)
         oob = n_letters + 1
+
+        all_categories_pos = ['last_first', 'first_last']
         all_categories = ['last', 'first']
         n_categories = len(all_categories)
 
@@ -40,6 +44,10 @@ def parse(df: pd.DataFrame, model_fn: str, vocab_fn: str) -> pd.DataFrame:
         model.load_state_dict(torch.load(MODEL, map_location=device))
         model.to(device)
 
+        model_pos = LSTM(vocab_size, n_hidden, len(all_categories_pos), num_layers=2)
+        model_pos.load_state_dict(torch.load(MODEL_POS, map_location=device))
+        model_pos.to(device)
+
         # set the model to evaluation mode
         model.eval()
 
@@ -56,16 +64,21 @@ def lineToTensor(line):
             return tensor
 
         def name_parser(name):
-            name_types = []
             names = name.split()
-            for n in names:
-                name_tokens = lineToTensor(n)
+            name_tokens = lineToTensor(name)
+            # a name that splits into exactly one token goes to the per-token model (last/first);
+            # any other name goes to the positional model (last_first/first_last)
+            if len(names) == 1:
                 out = model(name_tokens.unsqueeze(0).to(device))
                 probs = torch.exp(out)
                 out = torch.argmax(probs)
                 name_type = all_categories[out.item()]
-                name_types.append({'name': n, 'type': name_type, 'prob': probs[0][out].item()})
-            return name_types
+            else:
+                out = model_pos(name_tokens.unsqueeze(0).to(device))
+                probs = torch.exp(out)
+                out = torch.argmax(probs)
+                name_type = all_categories_pos[out.item()]
+            return {'name': name, 'type': name_type, 'prob': probs[0][out].item()}
 
         df['parsed_name'] = df['name'].apply(name_parser)
         return df
diff --git a/parsernaam/parse.py b/parsernaam/parse.py
@@ -7,12 +7,14 @@
 from .naam import Parsernaam
 from .utils import get_args
 
+
 class ParseNames(Parsernaam):
     """
     Parse names
     """
 
     MODEL_FN = "models/parsernaam.pt"
+    MODEL_POS_FN = "models/parsernaam_pos.pt"
     VOCAB_FN = "models/parsernaam.joblib"
 
     @classmethod
@@ -25,12 +27,13 @@ def parse(cls, df: pd.DataFrame) -> pd.DataFrame:
 
         Returns:
             DataFrame with parsed names
-        """        
-        return super().parse(df, cls.MODEL_FN, cls.VOCAB_FN)
+        """
+        return super().parse(df, cls.MODEL_FN , cls.MODEL_POS_FN, cls.VOCAB_FN)
 
 
 parse_names = ParseNames.parse
 
+
 def main() -> None:
     """
     Main method to parse names

diff --git a/parsernaam/tests/test_010_name_parser.py b/parsernaam/tests/test_010_name_parser.py
@@ -10,6 +10,7 @@
 
 from parsernaam.parse import ParseNames
 
+
 class TestParseNames(unittest.TestCase):
     """
     TestParseNames
@@ -19,26 +20,26 @@ def setUp(self) -> None:
         """
         Set up
         """
-        self.df = pd.DataFrame({'name': ['Jan Petersen', 'Piet', 'Janssen']})
-        self.expected = ["first", "last", "first", "first"]
+        self.df = pd.DataFrame({'name': ['Jan', 'Nicholas Turner', 'Petersen', 'Nichols Richard', 'Piet',
+                                         'John Smith', 'Janssen', 'Kim Yeon']})
+        self.expected = ["first", "first_last", "last", "last_first", "first", "first_last", "first", "last_first"]
 
     def tearDown(self) -> None:
         return super().tearDown()
 
     def test_parse(self) -> None:
         """
-        Test parse
+        Test parsing of single-token names and multi-token (positional) names
         """
         df = ParseNames.parse(self.df)
+        print(df.to_markdown())
         for parsed_name in df['parsed_name']:
-            for name_dict in parsed_name:
-                name_type = name_dict['type']
-                prob = name_dict['prob']
-                expected_type = self.expected.pop(0)
-                self.assertEqual(name_type, expected_type)
-                self.assertGreater(prob, 0.5)
-
+            name_type = parsed_name['type']
+            prob = parsed_name['prob']
+            expected_type = self.expected.pop(0)
+            self.assertEqual(name_type, expected_type)
+            self.assertGreater(prob, 0.5)
 
 
 if __name__ == '__main__':
-    unittest.main()
+    unittest.main()
diff --git a/setup.py b/setup.py
@@ -55,7 +55,7 @@ def run_tests(self):
 
 setup(
     name="parsernaam",
-    version="0.0.3",
+    version="0.0.4",
     description=("Name parser"),
     long_description=long_description,
     # The project's main homepage.