Skip to content

Commit

Permalink
Adding 2 methods to find lang based on name
Browse files Browse the repository at this point in the history
  • Loading branch information
Rajashekar Chintalapati committed Aug 18, 2024
1 parent c00156c commit c1a0647
Show file tree
Hide file tree
Showing 17 changed files with 284 additions and 113 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
The MIT License (MIT)

Copyright (c) 2023 Atul Dhingra and Gaurav Sood
Copyright (c) 2023 Atul Dhingra, Gaurav Sood and Rajashekar Chintalapati

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
33 changes: 31 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ Examples
API
----------

instate exposes 3 functions.
instate exposes 5 functions.

- **last_state**

Expand Down Expand Up @@ -90,6 +90,35 @@ instate exposes 3 functions.
1 sood punjab Punjabi
2 gowda andhra Telugu


- **lookup_lang**

- takes a pandas dataframe and the name of the column with the last names, and produces a dataframe with 1 more column (predicted_lang), reflecting the language most associated with each last name. This method finds the nearest last names (by edit distance) in the bundled dataset and reports the most common language among them.

::
from instate import lookup_lang
df = pd.DataFrame({'last_name': ['sood', 'chintalapati']})
lookup_lang(df, "last_name")
last_name predicted_lang
0 sood hindi
1 chintalapati telugu

- **predict_lang**

- takes a pandas dataframe and the name of the column with the last names, and produces a dataframe with 1 more column (predicted_lang), holding a list of the three most likely languages for each last name. This method predicts the languages with a model applied to the characters of the name.

::
from instate import predict_lang
df = pd.DataFrame({'last_name': ['sood', 'chintalapati']})
predict_lang(df, "last_name")
last_name predicted_lang
0 sood [hindi, punjabi, urdu]
1 chintalapati [telugu, urdu, chenchu]

Data
----

Expand All @@ -103,7 +132,7 @@ The model has a top-3 accuracy of 85.3\% on `unseen names <https://github.com/ap
Authors
-------

Atul Dhingra and Gaurav Sood
Atul Dhingra, Gaurav Sood and Rajashekar Chintalapati

Contributor Code of Conduct
---------------------------------
Expand Down
1 change: 1 addition & 0 deletions data/notebooks/04_train_multi_label.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions instate/data/char2idx.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"n": 1, "g": 2, "i": 3, "m": 4, "c": 5, "w": 6, "u": 7, "e": 8, "v": 9, "d": 10, "a": 11, "l": 12, "t": 13, "s": 14, "q": 15, "b": 16, "f": 17, "o": 18, "z": 19, "p": 20, "r": 21, "k": 22, "h": 23, "y": 24, "x": 25, "j": 26, "<PAD>": 0}
37 changes: 37 additions & 0 deletions instate/data/langs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
sindhi
nepali
kannada
marathi
mizo
adi
garo
tagin
assamese
hindi
odia
french
punjabi
naga languages
english
chenchu
urdu
bengali
maithili
dogri
kokborok
santali
kashmiri
gujarati
apatani
tulu
konkani
telugu
malayalam
tamil
meitei
khasi
gondi
bodo
nishi
chakma
pahari and kumauni
Binary file added instate/data/lastname_langs_india.csv.tar.gz
Binary file not shown.
Binary file added instate/data/state_lang_labels.pt
Binary file not shown.
110 changes: 107 additions & 3 deletions instate/instate.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import os
import tarfile
import json

import pandas as pd
import torch
import torch.nn as nn
from Levenshtein import distance

from typing import Union, List
from pkg_resources import resource_filename
from .models.model_lang import LanguagePredictor

from .utils import column_exists, get_app_file_path, download_file, _load_model, _pred_last_state
from .nnets import infer, GRU_net, GT_KEYS, n_letters, n_hidden
Expand All @@ -29,6 +30,19 @@ class InRollsLnData:
__year = None
__dataset = None


@staticmethod
def load_data(file_name: str) -> Union[str, os.PathLike, None]:
    """Return the local path of an instate data file, downloading it if absent.

    Args:
        file_name: Name of the data file inside the instate application
            directory. It is also the key looked up in ``IN_ROLLS_DATA`` to
            obtain the download source handed to ``download_file``.

    Returns:
        The local path of the file, or ``None`` when ``download_file``
        reports a failure.
    """
    data_path = get_app_file_path("instate", file_name)

    # Fast path: a previous call already fetched the file.
    if os.path.exists(data_path):
        print(f"Using cached instate data from local ({data_path})...")
        return data_path

    print(f"Downloading instate data from the server ({file_name})...")
    if not download_file(IN_ROLLS_DATA[file_name], data_path):
        print("ERROR: Cannot download instate data file")
        return None
    return data_path

@staticmethod
def load_instate_data(dataset: str) -> Union[str, os.PathLike]:
data_fn = f"instate_unique_ln_state_prop_{dataset}.csv.gz"
Expand Down Expand Up @@ -142,8 +156,98 @@ def list_states(dataset: str = "v1") -> List[str]:
adf = pd.read_csv(data_path, usecols=["state"])
return adf.state.unique()

@staticmethod
def lookup_lang(df: pd.DataFrame, lastnamecol: str) -> pd.DataFrame:
    """Add a ``predicted_lang`` column using a nearest-name lookup.

    For every value in ``df[lastnamecol]`` the three reference last names with
    the smallest Levenshtein distance are selected, their per-language columns
    are summed, and the language with the largest total is reported.

    Args:
        df: Frame holding the last names. It is modified in place.
        lastnamecol: Name of the column in ``df`` that holds the last names.

    Returns:
        ``df`` itself with the extra ``predicted_lang`` column, or ``df``
        untouched when ``column_exists`` rejects ``lastnamecol``.
    """
    if not column_exists(df, lastnamecol):
        return df

    data_file_name = "lastname_langs_india"
    csv_name = f"{data_file_name}.csv"
    # ``data_path`` is used as the directory the bundled archive is unpacked into.
    data_path = get_app_file_path("instate", data_file_name)
    csv_path = os.path.join(data_path, csv_name)

    # Test for the exact file that is read below. Checking ``data_path + ".csv"``
    # never matched the extracted location, so the archive was unpacked again
    # on every call.
    if not os.path.exists(csv_path):
        data_dir = os.path.dirname(__file__)
        gz_path = os.path.join(data_dir, "data", f"{data_file_name}.csv.tar.gz")
        print(f"Extracting {gz_path} to {data_path}")
        with tarfile.open(gz_path, "r:gz") as tar:
            tar.extract(csv_name, data_path)

    name_to_lang = pd.read_csv(csv_path)
    # Every column after the first is treated as a language column.
    langs = name_to_lang.columns[1:]
    known_names = name_to_lang["last_name"]

    # Last names repeat often; scan the reference table once per distinct name.
    best_lang_by_name = {}
    for lastname in df[lastnamecol]:
        if lastname in best_lang_by_name:
            continue
        # Edit distance from this name to every reference name.
        distances = known_names.apply(
            lambda known, target=lastname: distance(target, known)
        )
        nearest_rows = distances.nsmallest(3).index
        best_lang_by_name[lastname] = (
            name_to_lang.loc[nearest_rows, langs].sum().idxmax()
        )

    df["predicted_lang"] = [best_lang_by_name[name] for name in df[lastnamecol]]
    return df


@staticmethod
def infer(lastname, char2idx, idx2lang, model, device):
    """Predict three distinct languages for a single last name.

    Args:
        lastname: The name to classify; each character is looked up in
            ``char2idx``.
        char2idx: Mapping from character to integer index.
        idx2lang: Mapping from class index to language name.
        model: Callable taking ``(indices, lengths)`` and returning three
            score tensors, one per ranked language slot.
        device: Device the index tensor is moved to before calling ``model``.

    Returns:
        A list of three language names, ordered first, second, third choice.

    Raises:
        KeyError: If ``lastname`` contains a character missing from
            ``char2idx``.
    """
    indices = [char2idx[char] for char in lastname]
    with torch.no_grad():
        name_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)
        # Lengths stay on the CPU; only the index tensor is moved to ``device``.
        lengths = torch.tensor([len(lastname)], dtype=torch.long)
        out1, out2, out3 = model(name_tensor, lengths)

        first = torch.argmax(out1, dim=1).item()

        second = torch.argmax(out2, dim=1).item()
        if second == first:
            # The top entry of out2 is ``first``, so its runner-up must differ.
            second = torch.topk(out2, k=2, dim=1)[1][0][1].item()

        third = torch.argmax(out3, dim=1).item()
        if third in (first, second):
            # Walk the top three of out3 and take the best one not already
            # used. Blindly taking the runner-up could repeat ``first`` or
            # ``second``; among three candidates at least one is unused.
            ranked = torch.topk(out3, k=3, dim=1)[1][0].tolist()
            third = next(idx for idx in ranked if idx not in (first, second))

    return [idx2lang[first], idx2lang[second], idx2lang[third]]


@staticmethod
def predict_lang(df: pd.DataFrame, lastnamecol: str) -> pd.DataFrame:
    """Add a ``predicted_lang`` column with three model-predicted languages.

    Loads the language list, the character vocabulary and the trained
    ``LanguagePredictor`` weights from the package ``data`` directory, then
    runs ``InRollsLnData.infer`` on every value of ``df[lastnamecol]``.

    Args:
        df: Frame holding the last names. It is modified in place.
        lastnamecol: Name of the column in ``df`` that holds the last names.

    Returns:
        ``df`` itself with the extra ``predicted_lang`` column (one list of
        three language names per row), or ``df`` untouched when
        ``column_exists`` rejects ``lastnamecol``.

    Raises:
        KeyError: Propagated from ``infer`` when a name contains a character
            that is not a key of ``char2idx.json``.
    """
    if not column_exists(df, lastnamecol):
        return df

    data_dir = os.path.join(os.path.dirname(__file__), "data")

    # One language per line; the line position is the class index.
    with open(os.path.join(data_dir, "langs.txt"), encoding="utf-8") as f:
        langs = f.read().splitlines()
    with open(os.path.join(data_dir, "char2idx.json"), encoding="utf-8") as f:
        char2idx = json.load(f)
    idx2lang = dict(enumerate(langs))

    # These sizes have to agree with the tensors stored in the checkpoint,
    # otherwise load_state_dict rejects it.
    vocab_size = len(char2idx)
    embedding_dim = 50
    hidden_dim = 128  # Number of features in the hidden state of LSTM
    num_languages = len(langs)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LanguagePredictor(vocab_size, embedding_dim, hidden_dim, num_languages)
    model.to(device)

    # map_location places the weights on whichever device was selected above,
    # so a single call covers both the CUDA and the CPU-only case.
    model_file = os.path.join(data_dir, "state_lang_labels.pt")
    model.load_state_dict(torch.load(model_file, map_location=device))
    model.eval()

    # For every last name, predict the languages.
    df["predicted_lang"] = [
        InRollsLnData.infer(lastname, char2idx, idx2lang, model, device)
        for lastname in df[lastnamecol]
    ]
    return df


# Module-level aliases for the InRollsLnData static methods, so each one can be
# imported and called as a plain function.
last_state = InRollsLnData.last_state
pred_last_state = InRollsLnData.pred_last_state
state_to_lang = InRollsLnData.state_to_lang
list_states = InRollsLnData.list_states
lookup_lang = InRollsLnData.lookup_lang
predict_lang = InRollsLnData.predict_lang

20 changes: 20 additions & 0 deletions instate/models/model_lang.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import torch.nn as nn

class LanguagePredictor(nn.Module):
    """Character-level LSTM encoder with three parallel classification heads.

    Each head maps the final LSTM hidden state to one score per language.
    """

    def __init__(self, num_chars, embedding_dim=64, lstm_hidden_dim=128, num_languages=37):
        """Build the embedding, the single-layer LSTM and the three heads.

        Args:
            num_chars: Size of the character vocabulary (embedding rows).
            embedding_dim: Width of each character embedding.
            lstm_hidden_dim: Number of features in the LSTM hidden state.
            num_languages: Number of scores produced by every head.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_chars, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True)
        # Attribute names double as state_dict keys, so they must not change.
        self.fc1 = nn.Linear(lstm_hidden_dim, num_languages)
        self.fc2 = nn.Linear(lstm_hidden_dim, num_languages)
        self.fc3 = nn.Linear(lstm_hidden_dim, num_languages)

    def forward(self, x, lengths):
        """Score a batch of index sequences.

        Args:
            x: Batch-first tensor of character indices.
            lengths: Number of real (unpadded) positions in each sequence.

        Returns:
            A tuple ``(out1, out2, out3)`` with the outputs of ``fc1``,
            ``fc2`` and ``fc3``.
        """
        # Packing makes the LSTM stop at each sequence's real length.
        packed = nn.utils.rnn.pack_padded_sequence(
            self.embedding(x), lengths, batch_first=True, enforce_sorted=False
        )
        _output, states = self.lstm(packed)
        # states[0] is h_n; drop its leading layer axis.
        final_hidden = states[0].squeeze(0)
        return tuple(
            head(final_hidden) for head in (self.fc1, self.fc2, self.fc3)
        )
20 changes: 20 additions & 0 deletions instate/models/model_lstm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from torch import nn

class LanguagePredictor(nn.Module):
    """Character-level LSTM encoder with three parallel classification heads.

    NOTE(review): this definition is a verbatim copy of the class in
    instate/models/model_lang.py; keep the two in sync or drop one of them.
    """

    def __init__(self, num_chars, embedding_dim=64, lstm_hidden_dim=128, num_languages=37):
        """Build the embedding, the single-layer LSTM and the three heads.

        Args:
            num_chars: Size of the character vocabulary (embedding rows).
            embedding_dim: Width of each character embedding.
            lstm_hidden_dim: Number of features in the LSTM hidden state.
            num_languages: Number of scores produced by every head.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_chars, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True)
        # Attribute names double as state_dict keys, so they must not change.
        self.fc1 = nn.Linear(lstm_hidden_dim, num_languages)
        self.fc2 = nn.Linear(lstm_hidden_dim, num_languages)
        self.fc3 = nn.Linear(lstm_hidden_dim, num_languages)

    def forward(self, x, lengths):
        """Score a batch of index sequences.

        Args:
            x: Batch-first tensor of character indices.
            lengths: Number of real (unpadded) positions in each sequence.

        Returns:
            A tuple ``(out1, out2, out3)`` with the outputs of ``fc1``,
            ``fc2`` and ``fc3``.
        """
        # Packing makes the LSTM stop at each sequence's real length.
        packed = nn.utils.rnn.pack_padded_sequence(
            self.embedding(x), lengths, batch_first=True, enforce_sorted=False
        )
        _output, states = self.lstm(packed)
        # states[0] is h_n; drop its leading layer axis.
        final_hidden = states[0].squeeze(0)
        return tuple(
            head(final_hidden) for head in (self.fc1, self.fc2, self.fc3)
        )
61 changes: 0 additions & 61 deletions instate/tests/test_010_in_rolls_ln.py

This file was deleted.

28 changes: 28 additions & 0 deletions instate/tests/test_010_instate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Tests for instate.py (lookup_lang)
"""

import unittest
import pandas as pd
from instate.instate import lookup_lang

class TestInRollsLn(unittest.TestCase):
    """Checks that lookup_lang adds the expected ``predicted_lang`` column."""

    def setUp(self):
        # One row per last name; expected languages are listed in the same order.
        self.df = pd.DataFrame({"name": ["sood", "chintalapati"]})
        self.pred_lang = ["hindi", "telugu"]

    def tearDown(self):
        pass

    def test_in_rolls_fn(self):
        odf = lookup_lang(self.df, "name")
        print(odf)
        # The input column is kept and the prediction column is added.
        for column in ("name", "predicted_lang"):
            self.assertIn(column, odf.columns)
        # Predictions must match the expected languages row for row.
        actual = odf["predicted_lang"].tolist()
        self.assertListEqual(actual, self.pred_lang)
Loading

0 comments on commit c1a0647

Please sign in to comment.