Adding 2 methods to find lang based on name

appeler · Aug 18, 2024 · c1a0647 · c1a0647
1 parent c00156c
commit c1a0647
Show file tree

Hide file tree

Showing 17 changed files with 284 additions and 113 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (c) 2023 Atul Dhingra and Gaurav Sood
+Copyright (c) 2023 Atul Dhingra, Gaurav Sood and Rajashekar Chintalapati
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.rst b/README.rst
@@ -41,7 +41,7 @@ Examples
 API
 ----------
 
-instate exposes 3 functions. 
+instate exposes 5 functions. 
 
 - **last_state**
 
@@ -90,6 +90,35 @@ instate exposes 3 functions.
     1   sood        punjab      Punjabi
     2   gowda       andhra      Telugu
 
+
+- **lookup_lang**
+
+    - takes a pandas dataframe and the name of the column holding the last names, and returns the dataframe with one more column (``predicted_lang``), reflecting the most spoken language. This method finds the three nearest names in the bundled dataset by edit distance and then looks them up to find the language that scores highest across them.
+
+::
+    
+      from instate import lookup_lang
+      df = pd.DataFrame({'last_name': ['sood', 'chintalapati']})
+      lookup_lang(df, "last_name")
+      
+            last_name predicted_lang
+    0          sood          hindi
+    1  chintalapati         telugu
+
+- **predict_lang**
+
+    - takes a pandas dataframe and the name of the column holding the last names, and returns the dataframe with one more column (``predicted_lang``), holding a list of the three languages predicted for each name. This method predicts the languages from the characters of the name using a trained model.
+
+::
+    
+      from instate import predict_lang
+      df = pd.DataFrame({'last_name': ['sood', 'chintalapati']})
+      predict_lang(df, "last_name")
+      
+            last_name predicted_lang
+    0          sood   [hindi, punjabi, urdu]
+    1  chintalapati  [telugu, urdu, chenchu]
+
 Data
 ----
 
@@ -103,7 +132,7 @@ The model has a top-3 accuracy of 85.3\% on `unseen names <https://github.com/ap
 Authors
 -------
 
-Atul Dhingra and Gaurav Sood
+Atul Dhingra, Gaurav Sood and Rajashekar Chintalapati
 
 Contributor Code of Conduct
 ---------------------------------

diff --git a/data/notebooks/04_train_multi_label.ipynb b/data/notebooks/04_train_multi_label.ipynb
diff --git a/instate/data/char2idx.json b/instate/data/char2idx.json
@@ -0,0 +1 @@
+{"n": 1, "g": 2, "i": 3, "m": 4, "c": 5, "w": 6, "u": 7, "e": 8, "v": 9, "d": 10, "a": 11, "l": 12, "t": 13, "s": 14, "q": 15, "b": 16, "f": 17, "o": 18, "z": 19, "p": 20, "r": 21, "k": 22, "h": 23, "y": 24, "x": 25, "j": 26, "<PAD>": 0}
diff --git a/instate/data/langs.txt b/instate/data/langs.txt
@@ -0,0 +1,37 @@
+sindhi
+nepali
+kannada
+marathi
+mizo
+adi
+garo
+tagin
+assamese
+hindi
+odia
+french
+punjabi
+naga languages
+english
+chenchu
+urdu
+bengali
+maithili
+dogri
+kokborok
+santali
+kashmiri
+gujarati
+apatani
+tulu
+konkani
+telugu
+malayalam
+tamil
+meitei
+khasi
+gondi
+bodo
+nishi
+chakma
+pahari and kumauni
diff --git a/instate/data/lastname_langs_india.csv.tar.gz b/instate/data/lastname_langs_india.csv.tar.gz
diff --git a/instate/data/state_lang_labels.pt b/instate/data/state_lang_labels.pt
diff --git a/instate/instate.py b/instate/instate.py
@@ -1,15 +1,16 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import sys
 import os
+import tarfile
+import json
 
 import pandas as pd
 import torch
-import torch.nn as nn
+from Levenshtein import distance
 
 from typing import Union, List
-from pkg_resources import resource_filename
+from .models.model_lang import LanguagePredictor
 
 from .utils import column_exists, get_app_file_path, download_file, _load_model, _pred_last_state
 from .nnets import infer, GRU_net, GT_KEYS, n_letters, n_hidden
@@ -29,6 +30,19 @@ class InRollsLnData:
     __year = None
     __dataset = None
 
+
+    @staticmethod
+    def load_data(file_name: str) -> Union[str, os.PathLike]:
+        data_path = get_app_file_path("instate", file_name)
+        if not os.path.exists(data_path):
+            print(f"Downloading instate data from the server ({file_name})...")
+            if not download_file(IN_ROLLS_DATA[file_name], data_path):
+                print("ERROR: Cannot download instate data file")
+                return None
+        else:
+            print(f"Using cached instate data from local ({data_path})...")
+        return data_path
+
     @staticmethod
     def load_instate_data(dataset: str) -> Union[str, os.PathLike]:
         data_fn = f"instate_unique_ln_state_prop_{dataset}.csv.gz"
@@ -142,8 +156,98 @@ def list_states(dataset: str = "v1") -> List[str]:
         adf = pd.read_csv(data_path, usecols=["state"])
         return adf.state.unique()
 
+    @staticmethod
+    def lookup_lang(df: pd.DataFrame, lastnamecol: str) -> pd.DataFrame:
+        if not column_exists(df, lastnamecol):
+            return df
+        data_file_name = "lastname_langs_india"
+        data_path = get_app_file_path("instate", data_file_name)
+        if not os.path.exists(data_path+".csv"):
+            data_dir = os.path.dirname(__file__)
+            gz_path = os.path.join(data_dir, 'data', f'{data_file_name}.csv.tar.gz')  
+            print(f"Extracting {gz_path} to {data_path}")
+            with tarfile.open(gz_path, "r:gz") as tar:
+                tar.extract(f"{data_file_name}.csv", data_path)
+        name_to_lang = pd.read_csv(f"{data_path}/{data_file_name}.csv")
+        langs = name_to_lang.columns[1:]
+        final = []
+        for lastname in df[lastnamecol]:
+            # use edit distance find top 3 nearest names
+            distances = name_to_lang['last_name'].apply(lambda x: distance(lastname, x))
+            nearest_lang = name_to_lang.loc[distances.nsmallest(3).index, langs].sum().idxmax()
+            final.append(nearest_lang)
+
+        # append final to df
+        df['predicted_lang'] = final
+        return df
+
+
+    @staticmethod
+    # do inference based on last_name
+    def infer(lastname, char2idx, idx2lang, model, device):
+        with torch.no_grad():
+            last_name_indices = [char2idx[char] for char in lastname]
+            last_name_tensor = torch.tensor(last_name_indices, dtype=torch.long).unsqueeze(0).to(device)
+            lengths = torch.tensor([len(lastname)], dtype=torch.long)
+            out1, out2, out3 = model(last_name_tensor, lengths)
+            pred_first_lang = torch.argmax(out1, dim=1)
+            pred_second_lang = torch.argmax(out2, dim=1)
+            pred_third_lang = torch.argmax(out3, dim=1)
+            # if second lang matches first, go to the next argmax
+            if pred_second_lang == pred_first_lang:
+                pred_second_lang = torch.topk(out2, k=2, dim=1)[1][0][1]
+            if pred_third_lang == pred_first_lang or pred_third_lang == pred_second_lang:
+                pred_third_lang = torch.topk(out3, k=3, dim=1)[1][0][1]
+            return [idx2lang[pred_first_lang.item()], idx2lang[pred_second_lang.item()], idx2lang[pred_third_lang.item()]]
+
+
+    @staticmethod
+    def predict_lang(df: pd.DataFrame, lastnamecol: str) -> pd.DataFrame:
+        if not column_exists(df, lastnamecol):
+            return df
+
+        data_dir = os.path.dirname(__file__)
+        langs_file = os.path.join(data_dir, 'data', "langs.txt")
+        with open(langs_file) as f:
+            langs = f.read().splitlines()
+
+        char2idx_file = os.path.join(data_dir, 'data', "char2idx.json")
+        with open(char2idx_file) as f:
+            char2idx = json.load(f)
+
+        idx2char = {idx: char for char, idx in char2idx.items()}
+        lang2idx = {lang: idx for idx, lang in enumerate(langs)}
+        idx2lang = {idx: lang for lang, idx in lang2idx.items()}
+
+        vocab_size = len(char2idx)
+        embedding_dim = 50
+        hidden_dim = 128  # Number of features in the hidden state of LSTM
+        num_languages = len(langs)  
+
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        model = LanguagePredictor(vocab_size, embedding_dim, hidden_dim, num_languages)
+        model.to(device)
+
+        model_file = os.path.join(data_dir, 'data', "state_lang_labels.pt")
+        if torch.cuda.is_available():
+            model.load_state_dict(torch.load(model_file))
+        else:
+            model.load_state_dict(torch.load(model_file, map_location=torch.device('cpu')))
+        model.eval()
+
+        # for every last name, predict the language
+        pred_langs = []
+        for lastname in df[lastnamecol]:
+            pred_langs.append(InRollsLnData.infer(lastname, char2idx, idx2lang, model, device))
+
+        df['predicted_lang'] = pred_langs
+        return df
+
+
 last_state = InRollsLnData.last_state
 pred_last_state = InRollsLnData.pred_last_state
 state_to_lang = InRollsLnData.state_to_lang
 list_states = InRollsLnData.list_states
+lookup_lang = InRollsLnData.lookup_lang
+predict_lang = InRollsLnData.predict_lang
 
diff --git a/instate/models/model_lang.py b/instate/models/model_lang.py
@@ -0,0 +1,20 @@
+import torch.nn as nn
+
+class LanguagePredictor(nn.Module):
+    def __init__(self, num_chars, embedding_dim=64, lstm_hidden_dim=128, num_languages=37):
+        super(LanguagePredictor, self).__init__()
+        self.embedding = nn.Embedding(num_chars, embedding_dim)
+        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True)
+        self.fc1 = nn.Linear(lstm_hidden_dim, num_languages)
+        self.fc2 = nn.Linear(lstm_hidden_dim, num_languages)
+        self.fc3 = nn.Linear(lstm_hidden_dim, num_languages)
+
+    def forward(self, x, lengths):
+        x = self.embedding(x)
+        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
+        _, (h_n, _) = self.lstm(x)
+        h_n = h_n.squeeze(0)
+        out1 = self.fc1(h_n)
+        out2 = self.fc2(h_n)
+        out3 = self.fc3(h_n)
+        return out1, out2, out3
diff --git a/instate/models/model_lstm.py b/instate/models/model_lstm.py
@@ -0,0 +1,20 @@
+from torch import nn
+
+class LanguagePredictor(nn.Module):
+    def __init__(self, num_chars, embedding_dim=64, lstm_hidden_dim=128, num_languages=37):
+        super(LanguagePredictor, self).__init__()
+        self.embedding = nn.Embedding(num_chars, embedding_dim)
+        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True)
+        self.fc1 = nn.Linear(lstm_hidden_dim, num_languages)
+        self.fc2 = nn.Linear(lstm_hidden_dim, num_languages)
+        self.fc3 = nn.Linear(lstm_hidden_dim, num_languages)
+
+    def forward(self, x, lengths):
+        x = self.embedding(x)
+        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)  
+        _, (h_n, _) = self.lstm(x)
+        h_n = h_n.squeeze(0)
+        out1 = self.fc1(h_n)
+        out2 = self.fc2(h_n)
+        out3 = self.fc3(h_n)
+        return out1, out2, out3
diff --git a/instate/tests/test_010_in_rolls_ln.py b/instate/tests/test_010_in_rolls_ln.py
diff --git a/instate/tests/test_010_instate.py b/instate/tests/test_010_instate.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Tests for lookup_lang in instate.py
+
+"""
+
+import unittest
+import pandas as pd
+from instate.instate import lookup_lang
+
+class TestInRollsLn(unittest.TestCase):
+    def setUp(self):
+        names = [{"name": "sood"}, {"name": "chintalapati"}]
+        self.pred_lang = ["hindi", "telugu"]
+        self.df = pd.DataFrame(names)
+
+    def tearDown(self):
+        pass
+
+    def test_in_rolls_fn(self):
+        odf = lookup_lang(self.df, "name")
+        print(odf)
+        self.assertIn("name", odf.columns)
+        self.assertIn("predicted_lang", odf.columns)
+        # check predicted_lang matches with pred_lang
+        self.assertListEqual(odf["predicted_lang"].tolist(), self.pred_lang)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"n": 1, "g": 2, "i": 3, "m": 4, "c": 5, "w": 6, "u": 7, "e": 8, "v": 9, "d": 10, "a": 11, "l": 12, "t": 13, "s": 14, "q": 15, "b": 16, "f": 17, "o": 18, "z": 19, "p": 20, "r": 21, "k": 22, "h": 23, "y": 24, "x": 25, "j": 26, "<PAD>": 0}