diff --git a/.gitignore b/.gitignore
index 629a99d..54172d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,6 @@ models/rf_vec_fullname_2M.joblib
 models/rf_vec_lastname.joblib
 models/rf_vec_lastname_1M.joblib
 models/rf_vec_lastname_2M.joblib
+ms/icwsm/name_race.aux
+ms/icwsm/name_race.bbl
+ms/icwsm/name_race.blg
diff --git a/notebooks/0.1_data_preprocessing_FullName.ipynb b/notebooks/0.1_data_preprocessing_FullName.ipynb
index 55afab9..1b0b6bd 100644
--- a/notebooks/0.1_data_preprocessing_FullName.ipynb
+++ b/notebooks/0.1_data_preprocessing_FullName.ipynb
@@ -1,258 +1,116 @@
 {
  "cells": [
   {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "71a4990d-d590-4f49-9d25-97982e6d58c0",
+   "cell_type": "markdown",
+   "id": "ce92ae6b-1cfb-47f9-b947-55f2448b7500",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "\n",
-    "from sklearn.model_selection import train_test_split"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "78ee21e5-efa5-4c7f-b2cc-32f8df219d08",
-   "metadata": {},
    "source": [
-    "# Preprocessing data"
+    "### Full Name Dataset (Train/Validation/Test)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
-   "id": "af05aa15-8218-4086-9626-adadd2552183",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 1,
+   "id": "a34fa1c0-cf2e-464a-bc56-73a4f7a38a55",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.read_csv('./data/fl_reg_name_race_2022.csv.gz')"
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
-   "id": "05503af7-99a2-4cf2-bdd4-b25d9015aa92",
+   "execution_count": 2,
+   "id": "af05aa15-8218-4086-9626-adadd2552183",
    "metadata": {
     "tags": []
    },
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Hessler-Smith</td>\n",
-       "      <td>Jason</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Rogers</td>\n",
-       "      <td>Renee</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Bartolome</td>\n",
-       "      <td>Crystal</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Bailey</td>\n",
-       "      <td>Donna</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Carlson</td>\n",
-       "      <td>Greggory</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "       name_last name_first      race\n",
-       "0  Hessler-Smith      Jason  nh_white\n",
-       "1         Rogers      Renee  nh_white\n",
-       "2      Bartolome    Crystal  nh_white\n",
-       "3         Bailey      Donna  nh_white\n",
-       "4        Carlson   Greggory  nh_white"
+       "(15455110, 3)"
       ]
      },
-     "execution_count": 46,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df.head()"
+    "df = pd.read_csv('data/fl_reg_name_race_2022.csv.gz')\n",
+    "df.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
-   "id": "b6b123c8-70ba-4ab4-841c-b5486a1ba69a",
+   "execution_count": 3,
+   "id": "05503af7-99a2-4cf2-bdd4-b25d9015aa92",
    "metadata": {
     "tags": []
    },
    "outputs": [
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>count</th>\n",
-       "      <td>15454992</td>\n",
-       "      <td>15455022</td>\n",
-       "      <td>15455110</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>unique</th>\n",
-       "      <td>1341195</td>\n",
-       "      <td>641103</td>\n",
-       "      <td>8</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>top</th>\n",
-       "      <td>Smith</td>\n",
-       "      <td>Michael</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>freq</th>\n",
-       "      <td>79362</td>\n",
-       "      <td>153753</td>\n",
-       "      <td>9446851</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       name_last name_first      race\n",
-       "count   15454992   15455022  15455110\n",
-       "unique   1341195     641103         8\n",
-       "top        Smith    Michael  nh_white\n",
-       "freq       79362     153753   9446851"
-      ]
-     },
-     "execution_count": 47,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Size after dropping missing first and last names: (15454979, 3)\n",
+      "Size after dropping unknown: (15009244, 3)\n",
+      "Size after dropping last names less than 2 chars: (14933334, 3)\n"
+     ]
     }
    ],
    "source": [
-    "df.describe()"
+    "# Drop rows with a missing first or last name\n",
+    "df.dropna(subset=['name_first', 'name_last'], inplace=True)\n",
+    "print(\"Size after dropping missing first and last names:\", df.shape)\n",
+    "\n",
+    "# We assume 'unknown' race is missing at random, so those rows are excluded\n",
+    "sdf = df[~df.race.isin(['unknown'])]\n",
+    "print(\"Size after dropping unknown:\", sdf.shape)\n",
+    "del df\n",
+    "\n",
+    "# Drop rows whose last name is shorter than 2 chars (checked before names are stripped in the next cell)\n",
+    "sdf = sdf.drop(sdf[sdf['name_last'].str.len() < 2].index)\n",
+    "print(\"Size after dropping last names less than 2 chars:\", sdf.shape)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
-   "id": "33b5941c-619c-485f-8767-27cdfa71ab27",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array(['nh_white', 'nh_black', 'other', 'hispanic', 'asian',\n",
-       "       'native_indian', 'unknown', 'multi_racial'], dtype=object)"
-      ]
-     },
-     "execution_count": 48,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df['race'].unique()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "65f441b5-3c7d-48f2-b6ba-352e897a72ae",
+   "execution_count": 4,
+   "id": "98a49e48-1dfc-4d1b-ad98-874ce9559d0d",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "## Drop None Values"
+    "# Normalize first/last names (strip whitespace, title-case) and build full_name as last + ' ' + first\n",
+    "sdf['name_first'] = sdf.name_first.str.strip().str.title()\n",
+    "sdf['name_last'] = sdf.name_last.str.strip().str.title()\n",
+    "sdf['full_name'] = sdf['name_last'] + ' ' + sdf['name_first']\n",
+    "# Remove every character except ASCII letters, apostrophes, spaces and hyphens\n",
+    "sdf['full_name'] = sdf['full_name'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
-   "id": "0522d1ca-b9bc-4028-b3f9-58ea8d143357",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 5,
+   "id": "9bfb3b34-bdea-4c60-bff9-ffb0a63aa265",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "df.dropna(subset=['name_first', 'name_last'], inplace=True)"
+    "# Recode race: fold 'multi_racial' and 'native_indian' into 'other'\n",
+    "mapping = {'multi_racial': 'other', 'native_indian': 'other'}\n",
+    "sdf['race'] = sdf['race'].replace(mapping)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
-   "id": "49bd34b0-6543-48bd-b0cd-99cb15bf2569",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 6,
+   "id": "041729a5-8518-405f-9cd5-5e6e1869cc2f",
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -274,796 +132,223 @@
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
        "      <th>race</th>\n",
+       "      <th>full_name</th>\n",
+       "      <th>asian</th>\n",
+       "      <th>hispanic</th>\n",
+       "      <th>nh_black</th>\n",
+       "      <th>nh_white</th>\n",
+       "      <th>other</th>\n",
+       "      <th>total_n</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>count</th>\n",
-       "      <td>15454908</td>\n",
-       "      <td>15454908</td>\n",
-       "      <td>15454908</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>unique</th>\n",
-       "      <td>1341176</td>\n",
-       "      <td>641095</td>\n",
-       "      <td>8</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>top</th>\n",
-       "      <td>Smith</td>\n",
-       "      <td>Michael</td>\n",
-       "      <td>nh_white</td>\n",
+       "      <th>0</th>\n",
+       "      <td>A Arup Erik</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>freq</th>\n",
-       "      <td>79362</td>\n",
-       "      <td>153753</td>\n",
-       "      <td>9446749</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       name_last name_first      race\n",
-       "count   15454908   15454908  15454908\n",
-       "unique   1341176     641095         8\n",
-       "top        Smith    Michael  nh_white\n",
-       "freq       79362     153753   9446749"
-      ]
-     },
-     "execution_count": 50,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.describe()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5a1c7014-99e8-4b29-b7e5-98ba16caa3b3",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## Drop Last name and first name of length 1"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 51,
-   "id": "42f1fd6c-ef4f-4538-bb6c-286db30c250a",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df = df.drop(df[df['name_last'].str.len() < 2].index)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 52,
-   "id": "1fc0d6fd-5ffc-4190-9a11-314f9c34535a",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df = df.drop(df[df['name_first'].str.len() < 2].index)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 53,
-   "id": "02479d63-5cf8-43f3-a208-9119df5b5457",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
+       "      <th>1</th>\n",
+       "      <td>A Bitang Ahmad</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
        "    <tr>\n",
-       "      <th>count</th>\n",
-       "      <td>15366690</td>\n",
-       "      <td>15366690</td>\n",
-       "      <td>15366690</td>\n",
+       "      <th>2</th>\n",
+       "      <td>A De Feria Graciela</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>unique</th>\n",
-       "      <td>1340617</td>\n",
-       "      <td>641055</td>\n",
-       "      <td>8</td>\n",
+       "      <th>3</th>\n",
+       "      <td>A F R Stephenson John Alexander</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>top</th>\n",
-       "      <td>Smith</td>\n",
-       "      <td>Michael</td>\n",
-       "      <td>nh_white</td>\n",
+       "      <th>4</th>\n",
+       "      <td>A Felix Noehmi</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>freq</th>\n",
-       "      <td>79297</td>\n",
-       "      <td>153752</td>\n",
-       "      <td>9383680</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       name_last name_first      race\n",
-       "count   15366690   15366690  15366690\n",
-       "unique   1340617     641055         8\n",
-       "top        Smith    Michael  nh_white\n",
-       "freq       79297     153752   9383680"
-      ]
-     },
-     "execution_count": 53,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.describe()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "058f71b5-0798-4ceb-89d7-e241000b7e1f",
-   "metadata": {},
-   "source": [
-    "## Make all names title case"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 54,
-   "id": "313b98b0-89f3-4ed8-9dd2-dad9bee428b1",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df['name_first'] = df['name_first'].str.title()\n",
-    "df['name_last'] = df['name_last'].str.title()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6a7d0983-9883-4777-a29e-6b9c913f9271",
-   "metadata": {},
-   "source": [
-    "## Remove Special Characters"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 56,
-   "id": "8c415ea0-1763-4fc2-b025-ba3cb5f7b786",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df['full_name'] = df['name_last'] + ' ' + df['name_first']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 57,
-   "id": "12dea730-e4dc-4bbd-91f6-286abf4c2fee",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df['full_name'] = df['full_name'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 58,
-   "id": "7d693324-de62-4ea3-8edd-ed3892f52a2f",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "      <th>full_name</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Hessler-Smith</td>\n",
-       "      <td>Jason</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Hessler-Smith Jason</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Rogers</td>\n",
-       "      <td>Renee</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Rogers Renee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Bartolome</td>\n",
-       "      <td>Crystal</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Bartolome Crystal</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Bailey</td>\n",
-       "      <td>Donna</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Bailey Donna</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Carlson</td>\n",
-       "      <td>Greggory</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Carlson Greggory</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       name_last name_first      race            full_name\n",
-       "0  Hessler-Smith      Jason  nh_white  Hessler-Smith Jason\n",
-       "1         Rogers      Renee  nh_white         Rogers Renee\n",
-       "2      Bartolome    Crystal  nh_white    Bartolome Crystal\n",
-       "3         Bailey      Donna  nh_white         Bailey Donna\n",
-       "4        Carlson   Greggory  nh_white     Carlson Greggory"
-      ]
-     },
-     "execution_count": 58,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7ce1807d-f228-482d-a2d8-43211e2806c1",
-   "metadata": {},
-   "source": [
-    "## Drop duplicates"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 59,
-   "id": "98363907-538a-4255-9cd6-35846c58d044",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "      <th>full_name</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>837</th>\n",
-       "      <td>Moser</td>\n",
-       "      <td>Patricia</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Moser Patricia</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>928</th>\n",
-       "      <td>Johnson</td>\n",
-       "      <td>Tiffany</td>\n",
-       "      <td>nh_black</td>\n",
-       "      <td>Johnson Tiffany</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1247</th>\n",
-       "      <td>Perry</td>\n",
-       "      <td>Charles</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Perry Charles</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2120</th>\n",
-       "      <td>Johnson</td>\n",
-       "      <td>Ashley</td>\n",
-       "      <td>nh_black</td>\n",
-       "      <td>Johnson Ashley</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2285</th>\n",
-       "      <td>Johnson</td>\n",
-       "      <td>Clayton</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Johnson Clayton</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15455104</th>\n",
-       "      <td>Ballentine</td>\n",
-       "      <td>Robert</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Ballentine Robert</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15455106</th>\n",
-       "      <td>Watts</td>\n",
-       "      <td>Mark</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Watts Mark</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15455107</th>\n",
-       "      <td>Mcrae</td>\n",
-       "      <td>Evelyn</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Mcrae Evelyn</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15455108</th>\n",
-       "      <td>Ward</td>\n",
-       "      <td>Stephanie</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Ward Stephanie</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15455109</th>\n",
-       "      <td>Edenfield</td>\n",
-       "      <td>Marcus</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Edenfield Marcus</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>5364911 rows × 4 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "           name_last name_first      race          full_name\n",
-       "837            Moser   Patricia  nh_white     Moser Patricia\n",
-       "928          Johnson    Tiffany  nh_black    Johnson Tiffany\n",
-       "1247           Perry    Charles  nh_white      Perry Charles\n",
-       "2120         Johnson     Ashley  nh_black     Johnson Ashley\n",
-       "2285         Johnson    Clayton  nh_white    Johnson Clayton\n",
-       "...              ...        ...       ...                ...\n",
-       "15455104  Ballentine     Robert  nh_white  Ballentine Robert\n",
-       "15455106       Watts       Mark  nh_white         Watts Mark\n",
-       "15455107       Mcrae     Evelyn  nh_white       Mcrae Evelyn\n",
-       "15455108        Ward  Stephanie  nh_white     Ward Stephanie\n",
-       "15455109   Edenfield     Marcus  nh_white   Edenfield Marcus\n",
-       "\n",
-       "[5364911 rows x 4 columns]"
-      ]
-     },
-     "execution_count": 59,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df[['full_name','race']].duplicated()]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 60,
-   "id": "5dcbb0d6-ede7-4b20-af15-4466f04d3fcd",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "      <th>full_name</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>136</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>550</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_black</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>263636</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1527456</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7563599</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7631191</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8383292</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8945658</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9402546</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10682106</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12427420</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12731429</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14637476</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         name_last name_first      race     full_name\n",
-       "136         Porter      Paula  nh_white  Porter Paula\n",
-       "550         Porter      Paula  nh_black  Porter Paula\n",
-       "263636      Porter      Paula  nh_white  Porter Paula\n",
-       "1527456     Porter      Paula  nh_white  Porter Paula\n",
-       "7563599     Porter      Paula  nh_white  Porter Paula\n",
-       "7631191     Porter      Paula  nh_white  Porter Paula\n",
-       "8383292     Porter      Paula  nh_white  Porter Paula\n",
-       "8945658     Porter      Paula  nh_white  Porter Paula\n",
-       "9402546     Porter      Paula  nh_white  Porter Paula\n",
-       "10682106    Porter      Paula  nh_white  Porter Paula\n",
-       "12427420    Porter      Paula  nh_white  Porter Paula\n",
-       "12731429    Porter      Paula  nh_white  Porter Paula\n",
-       "14637476    Porter      Paula  nh_white  Porter Paula"
-      ]
-     },
-     "execution_count": 60,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['full_name'] == \"Porter Paula\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 61,
-   "id": "68f7146a-45e7-40c6-8f2a-9e50bd914743",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df = df.drop_duplicates(['full_name','race'],keep= 'last')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 62,
-   "id": "ed066729-7970-47ea-ad65-9842bc71366a",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "      <th>full_name</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>550</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_black</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14637476</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Porter Paula</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         name_last name_first      race     full_name\n",
-       "550         Porter      Paula  nh_black  Porter Paula\n",
-       "14637476    Porter      Paula  nh_white  Porter Paula"
-      ]
-     },
-     "execution_count": 62,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['full_name'] == \"Porter Paula\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 63,
-   "id": "0d2504cf-d9f9-4eff-a53f-fa859f686ed1",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9018613</th>\n",
+       "      <td>Zyzdryn Krzysztof</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9018614</th>\n",
+       "      <td>Zyznomyrsky John</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9018615</th>\n",
+       "      <td>Zzaman Md</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9018616</th>\n",
+       "      <td>Zzaman Mohammad</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9018617</th>\n",
+       "      <td>Zzie Richard</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>9018618 rows × 7 columns</p>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "(10001779, 4)"
+       "race                           full_name  asian  hispanic  nh_black  nh_white  \\\n",
+       "0                            A Arup Erik    0.0       0.0       0.0       1.0   \n",
+       "1                         A Bitang Ahmad    0.0       0.0       1.0       0.0   \n",
+       "2                    A De Feria Graciela    0.0       1.0       0.0       0.0   \n",
+       "3        A F R Stephenson John Alexander    0.0       0.0       0.0       1.0   \n",
+       "4                         A Felix Noehmi    0.0       1.0       0.0       0.0   \n",
+       "...                                  ...    ...       ...       ...       ...   \n",
+       "9018613                Zyzdryn Krzysztof    0.0       0.0       0.0       1.0   \n",
+       "9018614                 Zyznomyrsky John    0.0       0.0       0.0       1.0   \n",
+       "9018615                        Zzaman Md    0.0       0.0       0.0       0.0   \n",
+       "9018616                  Zzaman Mohammad    0.0       0.0       0.0       0.0   \n",
+       "9018617                     Zzie Richard    0.0       0.0       0.0       1.0   \n",
+       "\n",
+       "race     other  total_n  \n",
+       "0          0.0      1.0  \n",
+       "1          0.0      1.0  \n",
+       "2          0.0      1.0  \n",
+       "3          0.0      1.0  \n",
+       "4          0.0      1.0  \n",
+       "...        ...      ...  \n",
+       "9018613    0.0      1.0  \n",
+       "9018614    0.0      1.0  \n",
+       "9018615    1.0      1.0  \n",
+       "9018616    1.0      1.0  \n",
+       "9018617    0.0      1.0  \n",
+       "\n",
+       "[9018618 rows x 7 columns]"
       ]
      },
-     "execution_count": 63,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df.shape"
+    "# Count the number of records for each (full name, race) combination\n",
+    "gdf = sdf.groupby(['full_name','race'], as_index=False)['race'].agg(['count'])\n",
+    "# Pivot so that each full name is one row, with one column per race holding that race's count\n",
+    "gdf = gdf.pivot_table(values='count', columns='race', index='full_name')\n",
+    "\n",
+    "# Convert NaN to zero: NaN means no one with that full name is recorded under that race\n",
+    "gdf = gdf.fillna(0)\n",
+    "\n",
+    "gdf['total_n'] = gdf.sum(axis=1)\n",
+    "gdf.reset_index(inplace=True)\n",
+    "gdf.iloc[:, 1:-1] = gdf.iloc[:, 1:-1].div(gdf.total_n, axis=0)\n",
+    "\n",
+    "gdf"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 64,
-   "id": "6351f866-44f2-4c5d-b9ec-ccbfb7278d9c",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 7,
+   "id": "3e953fb6-2fe8-4d34-be56-10cf6698dc35",
+   "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "10001779"
-      ]
-     },
-     "execution_count": 64,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['asian', 'hispanic', 'nh_black', 'nh_white', 'other']\n"
+     ]
     }
    ],
    "source": [
-    "len(df)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9830ca67-c7a7-4587-bbc1-db7e0f552ae5",
-   "metadata": {},
-   "source": [
-    "## Drop and merge columns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 65,
-   "id": "8c0468aa-dae8-4671-bbe5-88196c1b0fb1",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# dropping unknown column\n",
-    "df = df.drop(df[df['race'] == 'unknown'].index)"
+    "races = sorted(sdf.race.unique().tolist())\n",
+    "print(races)\n",
+    "\n",
+    "def get_race_idx(val, races):\n",
+    "    race_idx = races.index(val)\n",
+    "    return race_idx"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
-   "id": "9d21c416-ea30-48f4-8794-721dd65f6259",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 9,
+   "id": "db8c31ec-17d4-464a-8693-754a389c9bf5",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "# combine multi_racial and native_indian to other\n",
-    "mapping = {'multi_racial': 'other', 'native_indian': 'other'}\n",
-    "df['race'] = df['race'].replace(mapping)"
+    "# For one set of analyses, we define the 'true race/ethnicity' of a name as the race with the max probability (modal race = true race)\n",
+    "gdf['race'] = gdf[races].idxmax(axis=1)\n",
+    "gdf['race_code'] = gdf['race'].apply(lambda c: get_race_idx(c,races))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 67,
-   "id": "4aeb7f29-01df-4702-bbbe-156924242bb0",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 10,
+   "id": "734fc268-8d20-4120-8693-7da140e4c8ec",
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -1085,68 +370,116 @@
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
+       "      <th>race</th>\n",
        "      <th>full_name</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
+       "      <th>asian</th>\n",
+       "      <th>hispanic</th>\n",
+       "      <th>nh_black</th>\n",
+       "      <th>nh_white</th>\n",
+       "      <th>other</th>\n",
+       "      <th>total_n</th>\n",
        "      <th>race</th>\n",
-       "      <th></th>\n",
+       "      <th>race_code</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>asian</th>\n",
-       "      <td>278290</td>\n",
+       "      <th>0</th>\n",
+       "      <td>A Arup Erik</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>nh_white</td>\n",
+       "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>hispanic</th>\n",
-       "      <td>1690573</td>\n",
+       "      <th>1</th>\n",
+       "      <td>A Bitang Ahmad</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>nh_black</td>\n",
+       "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>nh_black</th>\n",
-       "      <td>1492989</td>\n",
+       "      <th>2</th>\n",
+       "      <td>A De Feria Graciela</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>hispanic</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>nh_white</th>\n",
-       "      <td>5734701</td>\n",
+       "      <th>3</th>\n",
+       "      <td>A F R Stephenson John Alexander</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>nh_white</td>\n",
+       "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>other</th>\n",
-       "      <td>390648</td>\n",
+       "      <th>4</th>\n",
+       "      <td>A Felix Noehmi</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>hispanic</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "          full_name\n",
-       "race               \n",
-       "asian        278290\n",
-       "hispanic    1690573\n",
-       "nh_black    1492989\n",
-       "nh_white    5734701\n",
-       "other        390648"
+       "race                        full_name  asian  hispanic  nh_black  nh_white  \\\n",
+       "0                         A Arup Erik    0.0       0.0       0.0       1.0   \n",
+       "1                      A Bitang Ahmad    0.0       0.0       1.0       0.0   \n",
+       "2                 A De Feria Graciela    0.0       1.0       0.0       0.0   \n",
+       "3     A F R Stephenson John Alexander    0.0       0.0       0.0       1.0   \n",
+       "4                      A Felix Noehmi    0.0       1.0       0.0       0.0   \n",
+       "\n",
+       "race  other  total_n      race  race_code  \n",
+       "0       0.0      1.0  nh_white          3  \n",
+       "1       0.0      1.0  nh_black          2  \n",
+       "2       0.0      1.0  hispanic          1  \n",
+       "3       0.0      1.0  nh_white          3  \n",
+       "4       0.0      1.0  hispanic          1  "
       ]
      },
-     "execution_count": 67,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df.groupby('race').agg({'full_name':'nunique'})"
+    "gdf.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
-   "id": "2e4d2735-03e0-4c3c-b049-216c662a84cb",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 11,
+   "id": "036bbb11-0d02-45db-801c-03a2873291c5",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "df['race_code'] = df.race.factorize()[0]"
+    "gdf.to_csv(\"train_validation_test/gdf_fullname.csv.gz\", index = False, compression=\"gzip\")"
    ]
   },
   {
@@ -1159,29 +492,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 13,
    "id": "3a11b216-fde5-48c7-b5ec-904dbde4bb29",
    "metadata": {},
    "outputs": [],
    "source": [
-    "train_df, rest_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['race_code'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 70,
-   "id": "f7d44e05-63d5-47cc-85db-c59fc3e169f1",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
+    "train_df, rest_df = train_test_split(gdf, test_size=0.2, random_state=42, stratify=gdf['race_code'])\n",
     "val_df, test_df = train_test_split(rest_df, test_size=0.5, random_state=42, stratify=rest_df['race_code'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 14,
    "id": "e47fa000-d58f-4360-9487-bd33c149433e",
    "metadata": {
     "tags": []
@@ -1195,7 +517,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": 15,
    "id": "fce465b6-0a17-437d-ad38-7d8036c74d0e",
    "metadata": {
     "tags": []
@@ -1205,9 +527,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "(7678780, 5)\n",
-      "(959847, 5)\n",
-      "(959848, 5)\n"
+      "(7214894, 9)\n",
+      "(901862, 9)\n",
+      "(901862, 9)\n"
      ]
     }
    ],
@@ -1219,87 +541,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 73,
-   "id": "c30f722b-995b-4269-bac7-38bc4ba64999",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>full_name</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>race</th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>asian</th>\n",
-       "      <td>278290</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>hispanic</th>\n",
-       "      <td>1690573</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>nh_black</th>\n",
-       "      <td>1492989</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>nh_white</th>\n",
-       "      <td>5734701</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>other</th>\n",
-       "      <td>390648</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "          full_name\n",
-       "race               \n",
-       "asian        278290\n",
-       "hispanic    1690573\n",
-       "nh_black    1492989\n",
-       "nh_white    5734701\n",
-       "other        390648"
-      ]
-     },
-     "execution_count": 73,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.groupby('race').agg({'full_name':'nunique'})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 16,
    "id": "66f97f4e-4fda-44bc-8d3c-65fbe6eea9b6",
    "metadata": {
     "tags": []
@@ -1325,7 +567,7 @@
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
+       "      <th>race</th>\n",
        "      <th>full_name</th>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1336,39 +578,39 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>asian</th>\n",
-       "      <td>222632</td>\n",
+       "      <td>206042</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>hispanic</th>\n",
-       "      <td>1352458</td>\n",
+       "      <td>1308198</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_black</th>\n",
-       "      <td>1194391</td>\n",
+       "      <td>1067770</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_white</th>\n",
-       "      <td>4587761</td>\n",
+       "      <td>4421898</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>other</th>\n",
-       "      <td>314216</td>\n",
+       "      <td>210986</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "          full_name\n",
+       "race      full_name\n",
        "race               \n",
-       "asian        222632\n",
-       "hispanic    1352458\n",
-       "nh_black    1194391\n",
-       "nh_white    4587761\n",
-       "other        314216"
+       "asian        206042\n",
+       "hispanic    1308198\n",
+       "nh_black    1067770\n",
+       "nh_white    4421898\n",
+       "other        210986"
       ]
      },
-     "execution_count": 74,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1379,7 +621,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 17,
    "id": "7d9a625b-fd10-4374-b744-e809620c86d5",
    "metadata": {
     "tags": []
@@ -1405,7 +647,7 @@
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
+       "      <th>race</th>\n",
        "      <th>full_name</th>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1416,39 +658,39 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>asian</th>\n",
-       "      <td>27829</td>\n",
+       "      <td>25755</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>hispanic</th>\n",
-       "      <td>169057</td>\n",
+       "      <td>163525</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_black</th>\n",
-       "      <td>149299</td>\n",
+       "      <td>133471</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_white</th>\n",
-       "      <td>573470</td>\n",
+       "      <td>552738</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>other</th>\n",
-       "      <td>40061</td>\n",
+       "      <td>26373</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "          full_name\n",
+       "race      full_name\n",
        "race               \n",
-       "asian         27829\n",
-       "hispanic     169057\n",
-       "nh_black     149299\n",
-       "nh_white     573470\n",
-       "other         40061"
+       "asian         25755\n",
+       "hispanic     163525\n",
+       "nh_black     133471\n",
+       "nh_white     552738\n",
+       "other         26373"
       ]
      },
-     "execution_count": 75,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1459,7 +701,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 18,
    "id": "e61fc0f4-1b6c-42a0-a1d5-cf7cbff8a290",
    "metadata": {
     "tags": []
@@ -1485,7 +727,7 @@
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
+       "      <th>race</th>\n",
        "      <th>full_name</th>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1496,39 +738,39 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>asian</th>\n",
-       "      <td>27829</td>\n",
+       "      <td>25756</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>hispanic</th>\n",
-       "      <td>169058</td>\n",
+       "      <td>163525</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_black</th>\n",
-       "      <td>149299</td>\n",
+       "      <td>133471</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_white</th>\n",
-       "      <td>573470</td>\n",
+       "      <td>552737</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>other</th>\n",
-       "      <td>40068</td>\n",
+       "      <td>26373</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "          full_name\n",
+       "race      full_name\n",
        "race               \n",
-       "asian         27829\n",
-       "hispanic     169058\n",
-       "nh_black     149299\n",
-       "nh_white     573470\n",
-       "other         40068"
+       "asian         25756\n",
+       "hispanic     163525\n",
+       "nh_black     133471\n",
+       "nh_white     552737\n",
+       "other         26373"
       ]
      },
-     "execution_count": 76,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1547,143 +789,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 19,
    "id": "ecc12a2c-bee0-49bd-b42d-ab8cb5589a15",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "train_df.to_csv(\"data/fl_2022_FullName_train.csv.gz\",index=False,compression=\"gzip\")\n",
-    "val_df.to_csv(\"data/fl_2022_FullName_val.csv.gz\",index=False,compression=\"gzip\")\n",
-    "test_df.to_csv(\"data/fl_2022_FullName_test.csv.gz\",index=False,compression=\"gzip\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 78,
-   "id": "aa9be3b5-ee0d-4935-9b21-14012c676235",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "13M\tdata/fl_2022_FullName_test.csv.gz\n",
-      "101M\tdata/fl_2022_FullName_train.csv.gz\n",
-      "13M\tdata/fl_2022_FullName_val.csv.gz\n"
-     ]
-    }
-   ],
-   "source": [
-    "!du -sh data/fl_2022_FullName_*"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 79,
-   "id": "2fb93bc9-448d-45e0-a976-9312bf94e708",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "      <th>full_name</th>\n",
-       "      <th>race_code</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Baxla</td>\n",
-       "      <td>Phyllis</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Baxla Phyllis</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Ludwin</td>\n",
-       "      <td>Ron</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Ludwin Ron</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Signer Welton</td>\n",
-       "      <td>Jessica</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Signer Welton Jessica</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Stamps</td>\n",
-       "      <td>Joshua</td>\n",
-       "      <td>nh_white</td>\n",
-       "      <td>Stamps Joshua</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Vassell</td>\n",
-       "      <td>Lillie</td>\n",
-       "      <td>nh_black</td>\n",
-       "      <td>Vassell Lillie</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       name_last name_first      race              full_name  race_code\n",
-       "0          Baxla    Phyllis  nh_white          Baxla Phyllis          0\n",
-       "1         Ludwin        Ron  nh_white             Ludwin Ron          0\n",
-       "2  Signer Welton    Jessica  nh_white  Signer Welton Jessica          0\n",
-       "3         Stamps     Joshua  nh_white          Stamps Joshua          0\n",
-       "4        Vassell     Lillie  nh_black         Vassell Lillie          1"
-      ]
-     },
-     "execution_count": 79,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "train_df.head()"
+    "train_df.to_csv(\"data/fl_2022_FullName_train.csv.gz\", index=False, compression=\"gzip\")\n",
+    "val_df.to_csv(\"data/fl_2022_FullName_val.csv.gz\", index=False, compression=\"gzip\")\n",
+    "test_df.to_csv(\"data/fl_2022_FullName_test.csv.gz\", index=False, compression=\"gzip\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b0a55dbb-c317-4ab8-b4a8-7ff0eb9d645a",
+   "id": "41d34d62-afb2-479e-a21b-040f1eb962ff",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -1705,7 +826,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.8.10"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/0.2_data_preprocessing_LastName.ipynb b/notebooks/0.2_data_preprocessing_LastName.ipynb
index 320c956..e1ecb38 100644
--- a/notebooks/0.2_data_preprocessing_LastName.ipynb
+++ b/notebooks/0.2_data_preprocessing_LastName.ipynb
@@ -1,969 +1,32 @@
 {
  "cells": [
   {
-   "cell_type": "code",
-   "execution_count": 34,
-   "id": "71a4990d-d590-4f49-9d25-97982e6d58c0",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "\n",
-    "from sklearn.model_selection import train_test_split"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "78ee21e5-efa5-4c7f-b2cc-32f8df219d08",
-   "metadata": {},
-   "source": [
-    "# Preprocessing data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 35,
-   "id": "af05aa15-8218-4086-9626-adadd2552183",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df = pd.read_csv('./data/fl_reg_name_race_2022.csv.gz')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "id": "05503af7-99a2-4cf2-bdd4-b25d9015aa92",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Hessler-Smith</td>\n",
-       "      <td>Jason</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Rogers</td>\n",
-       "      <td>Renee</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Bartolome</td>\n",
-       "      <td>Crystal</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Bailey</td>\n",
-       "      <td>Donna</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Carlson</td>\n",
-       "      <td>Greggory</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       name_last name_first      race\n",
-       "0  Hessler-Smith      Jason  nh_white\n",
-       "1         Rogers      Renee  nh_white\n",
-       "2      Bartolome    Crystal  nh_white\n",
-       "3         Bailey      Donna  nh_white\n",
-       "4        Carlson   Greggory  nh_white"
-      ]
-     },
-     "execution_count": 36,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "id": "b6b123c8-70ba-4ab4-841c-b5486a1ba69a",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>count</th>\n",
-       "      <td>15454992</td>\n",
-       "      <td>15455022</td>\n",
-       "      <td>15455110</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>unique</th>\n",
-       "      <td>1341195</td>\n",
-       "      <td>641103</td>\n",
-       "      <td>8</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>top</th>\n",
-       "      <td>Smith</td>\n",
-       "      <td>Michael</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>freq</th>\n",
-       "      <td>79362</td>\n",
-       "      <td>153753</td>\n",
-       "      <td>9446851</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       name_last name_first      race\n",
-       "count   15454992   15455022  15455110\n",
-       "unique   1341195     641103         8\n",
-       "top        Smith    Michael  nh_white\n",
-       "freq       79362     153753   9446851"
-      ]
-     },
-     "execution_count": 37,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "id": "33b5941c-619c-485f-8767-27cdfa71ab27",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array(['nh_white', 'nh_black', 'other', 'hispanic', 'asian',\n",
-       "       'native_indian', 'unknown', 'multi_racial'], dtype=object)"
-      ]
-     },
-     "execution_count": 38,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df['race'].unique()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "65f441b5-3c7d-48f2-b6ba-352e897a72ae",
-   "metadata": {},
-   "source": [
-    "## Drop None Values"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "id": "0522d1ca-b9bc-4028-b3f9-58ea8d143357",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df.dropna(subset=['name_first', 'name_last'], inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
-   "id": "49bd34b0-6543-48bd-b0cd-99cb15bf2569",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>count</th>\n",
-       "      <td>15454908</td>\n",
-       "      <td>15454908</td>\n",
-       "      <td>15454908</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>unique</th>\n",
-       "      <td>1341176</td>\n",
-       "      <td>641095</td>\n",
-       "      <td>8</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>top</th>\n",
-       "      <td>Smith</td>\n",
-       "      <td>Michael</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>freq</th>\n",
-       "      <td>79362</td>\n",
-       "      <td>153753</td>\n",
-       "      <td>9446749</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       name_last name_first      race\n",
-       "count   15454908   15454908  15454908\n",
-       "unique   1341176     641095         8\n",
-       "top        Smith    Michael  nh_white\n",
-       "freq       79362     153753   9446749"
-      ]
-     },
-     "execution_count": 40,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.describe()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5a1c7014-99e8-4b29-b7e5-98ba16caa3b3",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## Drop Last name and first name of length 1"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 41,
-   "id": "42f1fd6c-ef4f-4538-bb6c-286db30c250a",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df = df.drop(df[df['name_last'].str.len() < 2].index)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "id": "1fc0d6fd-5ffc-4190-9a11-314f9c34535a",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df = df.drop(df[df['name_first'].str.len() < 2].index)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "id": "02479d63-5cf8-43f3-a208-9119df5b5457",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>count</th>\n",
-       "      <td>15366690</td>\n",
-       "      <td>15366690</td>\n",
-       "      <td>15366690</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>unique</th>\n",
-       "      <td>1340617</td>\n",
-       "      <td>641055</td>\n",
-       "      <td>8</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>top</th>\n",
-       "      <td>Smith</td>\n",
-       "      <td>Michael</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>freq</th>\n",
-       "      <td>79297</td>\n",
-       "      <td>153752</td>\n",
-       "      <td>9383680</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       name_last name_first      race\n",
-       "count   15366690   15366690  15366690\n",
-       "unique   1340617     641055         8\n",
-       "top        Smith    Michael  nh_white\n",
-       "freq       79297     153752   9383680"
-      ]
-     },
-     "execution_count": 43,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.describe()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "22f6aa41-af9c-44a7-a9ce-b5faf2c36caf",
-   "metadata": {},
-   "source": [
-    "## Make all names title case"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "id": "96a27d5c-b423-43d8-bb10-fc1e4d4404cb",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df['name_first'] = df['name_first'].str.title()\n",
-    "df['name_last'] = df['name_last'].str.title()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6a7d0983-9883-4777-a29e-6b9c913f9271",
-   "metadata": {},
-   "source": [
-    "## Remove Special Characters"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 45,
-   "id": "12dea730-e4dc-4bbd-91f6-286abf4c2fee",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "df['name_last'] = df['name_last'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 46,
-   "id": "7d693324-de62-4ea3-8edd-ed3892f52a2f",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Hessler-Smith</td>\n",
-       "      <td>Jason</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Rogers</td>\n",
-       "      <td>Renee</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Bartolome</td>\n",
-       "      <td>Crystal</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Bailey</td>\n",
-       "      <td>Donna</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Carlson</td>\n",
-       "      <td>Greggory</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       name_last name_first      race\n",
-       "0  Hessler-Smith      Jason  nh_white\n",
-       "1         Rogers      Renee  nh_white\n",
-       "2      Bartolome    Crystal  nh_white\n",
-       "3         Bailey      Donna  nh_white\n",
-       "4        Carlson   Greggory  nh_white"
-      ]
-     },
-     "execution_count": 46,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7ce1807d-f228-482d-a2d8-43211e2806c1",
-   "metadata": {},
-   "source": [
-    "## Drop duplicates"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 47,
-   "id": "98363907-538a-4255-9cd6-35846c58d044",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>52</th>\n",
-       "      <td>Gruber</td>\n",
-       "      <td>Linda</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>122</th>\n",
-       "      <td>Taylor</td>\n",
-       "      <td>Robert</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>127</th>\n",
-       "      <td>Bailey</td>\n",
-       "      <td>Pamela</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>138</th>\n",
-       "      <td>Johnson</td>\n",
-       "      <td>Ashley</td>\n",
-       "      <td>nh_black</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>146</th>\n",
-       "      <td>Mobley</td>\n",
-       "      <td>Robert</td>\n",
-       "      <td>nh_black</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15455105</th>\n",
-       "      <td>Ballew</td>\n",
-       "      <td>Christina</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15455106</th>\n",
-       "      <td>Watts</td>\n",
-       "      <td>Mark</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15455107</th>\n",
-       "      <td>Mcrae</td>\n",
-       "      <td>Evelyn</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15455108</th>\n",
-       "      <td>Ward</td>\n",
-       "      <td>Stephanie</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15455109</th>\n",
-       "      <td>Edenfield</td>\n",
-       "      <td>Marcus</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>13894849 rows × 3 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "          name_last name_first      race\n",
-       "52           Gruber      Linda  nh_white\n",
-       "122          Taylor     Robert  nh_white\n",
-       "127          Bailey     Pamela  nh_white\n",
-       "138         Johnson     Ashley  nh_black\n",
-       "146          Mobley     Robert  nh_black\n",
-       "...             ...        ...       ...\n",
-       "15455105     Ballew  Christina  nh_white\n",
-       "15455106      Watts       Mark  nh_white\n",
-       "15455107      Mcrae     Evelyn  nh_white\n",
-       "15455108       Ward  Stephanie  nh_white\n",
-       "15455109  Edenfield     Marcus  nh_white\n",
-       "\n",
-       "[13894849 rows x 3 columns]"
-      ]
-     },
-     "execution_count": 47,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df[['name_last','race']].duplicated()]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 48,
-   "id": "5dcbb0d6-ede7-4b20-af15-4466f04d3fcd",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>136</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>550</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Paula</td>\n",
-       "      <td>nh_black</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7329</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Wendell</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7557</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Anthony</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9200</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Kevin</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15448598</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>William</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15448772</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Kyle</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15451135</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Jean</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15451767</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Annette</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15454870</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Ashley</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>7451 rows × 3 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         name_last name_first      race\n",
-       "136         Porter      Paula  nh_white\n",
-       "550         Porter      Paula  nh_black\n",
-       "7329        Porter    Wendell  nh_white\n",
-       "7557        Porter    Anthony  nh_white\n",
-       "9200        Porter      Kevin  nh_white\n",
-       "...            ...        ...       ...\n",
-       "15448598    Porter    William  nh_white\n",
-       "15448772    Porter       Kyle  nh_white\n",
-       "15451135    Porter       Jean  nh_white\n",
-       "15451767    Porter    Annette  nh_white\n",
-       "15454870    Porter     Ashley  nh_white\n",
-       "\n",
-       "[7451 rows x 3 columns]"
-      ]
-     },
-     "execution_count": 48,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['name_last'] == \"Porter\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "id": "68f7146a-45e7-40c6-8f2a-9e50bd914743",
+   "cell_type": "markdown",
+   "id": "6cc055eb-51ad-43c5-aacf-798983f0adfa",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
    "source": [
-    "df = df.drop_duplicates(['name_last','race'],keep= 'last')"
+    "### Last Name Preprocessing (Train/Validation/Test)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
-   "id": "ed066729-7970-47ea-ad65-9842bc71366a",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "      <th>name_first</th>\n",
-       "      <th>race</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>14952661</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Marisyd</td>\n",
-       "      <td>asian</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15029071</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Amber</td>\n",
-       "      <td>multi_racial</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15222442</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Anna</td>\n",
-       "      <td>other</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15337979</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Dennis</td>\n",
-       "      <td>unknown</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15369699</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Lila</td>\n",
-       "      <td>native_indian</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15378779</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Cristopher</td>\n",
-       "      <td>hispanic</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15438806</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Orrick</td>\n",
-       "      <td>nh_black</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15454870</th>\n",
-       "      <td>Porter</td>\n",
-       "      <td>Ashley</td>\n",
-       "      <td>nh_white</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         name_last  name_first           race\n",
-       "14952661    Porter     Marisyd          asian\n",
-       "15029071    Porter       Amber   multi_racial\n",
-       "15222442    Porter        Anna          other\n",
-       "15337979    Porter      Dennis        unknown\n",
-       "15369699    Porter        Lila  native_indian\n",
-       "15378779    Porter  Cristopher       hispanic\n",
-       "15438806    Porter      Orrick       nh_black\n",
-       "15454870    Porter      Ashley       nh_white"
-      ]
-     },
-     "execution_count": 50,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "execution_count": 1,
+   "id": "722b0940-de98-48ad-96b1-89abf710e3f1",
+   "metadata": {},
+   "outputs": [],
    "source": [
-    "df[df['name_last'] == \"Porter\"]"
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
-   "id": "0d2504cf-d9f9-4eff-a53f-fa859f686ed1",
+   "execution_count": 2,
+   "id": "af05aa15-8218-4086-9626-adadd2552183",
    "metadata": {
     "tags": []
    },
@@ -971,83 +34,80 @@
     {
      "data": {
       "text/plain": [
-       "(1471841, 3)"
+       "(15455110, 3)"
       ]
      },
-     "execution_count": 51,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "df = pd.read_csv('data/fl_reg_name_race_2022.csv.gz')\n",
     "df.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
-   "id": "6351f866-44f2-4c5d-b9ec-ccbfb7278d9c",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 3,
+   "id": "5c244d79-6807-4156-b318-10555d85de1e",
+   "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "1471841"
-      ]
-     },
-     "execution_count": 52,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Size after dropping missing first and last names: (15454979, 3)\n",
+      "Size after dropping unknown: (15009244, 3)\n",
+      "Size after dropping last names less than 2 chars: (14933334, 3)\n"
+     ]
     }
    ],
    "source": [
-    "len(df)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9830ca67-c7a7-4587-bbc1-db7e0f552ae5",
-   "metadata": {},
-   "source": [
-    "## Drop and merge columns"
+    "# Drop rows with a missing first or last name\n",
+    "df.dropna(subset=['name_first', 'name_last'], inplace=True)\n",
+    "print(\"Size after dropping missing first and last names:\", df.shape)\n",
+    "\n",
+    "# Drop rows whose race is 'unknown'; we assume these are missing at random\n",
+    "sdf = df[df.race.isin(['unknown']) == False]\n",
+    "print(\"Size after dropping unknown:\", sdf.shape)\n",
+    "del df\n",
+    "\n",
+    "# Drop cases where last name is less than 2 chars\n",
+    "sdf = sdf.drop(sdf[sdf['name_last'].str.len() < 2].index)\n",
+    "print(\"Size after dropping last names less than 2 chars:\", sdf.shape)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
-   "id": "8c0468aa-dae8-4671-bbe5-88196c1b0fb1",
+   "execution_count": 4,
+   "id": "96a27d5c-b423-43d8-bb10-fc1e4d4404cb",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "# dropping unknown column\n",
-    "df = df.drop(df[df['race'] == 'unknown'].index)"
+    "sdf['name_last'] = sdf['name_last'].str.title()\n",
+    "sdf['name_last'] = sdf['name_last'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
-   "id": "9d21c416-ea30-48f4-8794-721dd65f6259",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 5,
+   "id": "76cc90e1-9961-4ea4-bba0-003eb1c9965d",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "# combine multi_racial and native_indian to other\n",
+    "# Recode race: fold multi_racial and native_indian into other\n",
     "mapping = {'multi_racial': 'other', 'native_indian': 'other'}\n",
-    "df['race'] = df['race'].replace(mapping)"
+    "sdf['race'] = sdf['race'].replace(mapping)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
-   "id": "4aeb7f29-01df-4702-bbbe-156924242bb0",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 6,
+   "id": "bcf7903a-d84e-456f-a060-82ceec27b2a7",
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -1069,68 +129,212 @@
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
        "      <th>race</th>\n",
-       "      <th></th>\n",
+       "      <th>name_last</th>\n",
+       "      <th>asian</th>\n",
+       "      <th>hispanic</th>\n",
+       "      <th>nh_black</th>\n",
+       "      <th>nh_white</th>\n",
+       "      <th>other</th>\n",
+       "      <th>total_n</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>asian</th>\n",
-       "      <td>68672</td>\n",
+       "      <th>0</th>\n",
+       "      <td>A Arup</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>hispanic</th>\n",
-       "      <td>389609</td>\n",
+       "      <th>1</th>\n",
+       "      <td>A Bitang</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>nh_black</th>\n",
-       "      <td>137271</td>\n",
+       "      <th>2</th>\n",
+       "      <td>A De Feria</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>nh_white</th>\n",
-       "      <td>609707</td>\n",
+       "      <th>3</th>\n",
+       "      <td>A F R Stephenson</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>other</th>\n",
-       "      <td>115218</td>\n",
+       "      <th>4</th>\n",
+       "      <td>A Felix</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1056640</th>\n",
+       "      <td>Zyzanski</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1056641</th>\n",
+       "      <td>Zyzdryn</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1056642</th>\n",
+       "      <td>Zyznomyrsky</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1056643</th>\n",
+       "      <td>Zzaman</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1056644</th>\n",
+       "      <td>Zzie</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
+       "<p>1056645 rows × 7 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "          name_last\n",
-       "race               \n",
-       "asian         68672\n",
-       "hispanic     389609\n",
-       "nh_black     137271\n",
-       "nh_white     609707\n",
-       "other        115218"
+       "race            name_last  asian  hispanic  nh_black  nh_white  other  total_n\n",
+       "0                  A Arup    0.0       0.0       0.0       1.0    0.0      1.0\n",
+       "1                A Bitang    0.0       0.0       1.0       0.0    0.0      1.0\n",
+       "2              A De Feria    0.0       1.0       0.0       0.0    0.0      1.0\n",
+       "3        A F R Stephenson    0.0       0.0       0.0       1.0    0.0      1.0\n",
+       "4                 A Felix    0.0       1.0       0.0       0.0    0.0      1.0\n",
+       "...                   ...    ...       ...       ...       ...    ...      ...\n",
+       "1056640          Zyzanski    0.0       0.0       0.0       1.0    0.0      1.0\n",
+       "1056641           Zyzdryn    0.0       0.0       0.0       1.0    0.0      2.0\n",
+       "1056642       Zyznomyrsky    0.0       0.0       0.0       1.0    0.0      1.0\n",
+       "1056643            Zzaman    0.0       0.0       0.0       0.0    1.0      2.0\n",
+       "1056644              Zzie    0.0       0.0       0.0       1.0    0.0      1.0\n",
+       "\n",
+       "[1056645 rows x 7 columns]"
       ]
      },
-     "execution_count": 55,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df.groupby('race').agg({'name_last':'nunique'})"
+    "# Count the number of records for each (last name, race) combination\n",
+    "gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])\n",
+    "# Pivot so that each last name is one row, with one column per race holding the number of people of that race with that last name\n",
+    "gdf = gdf.pivot_table(values='count', columns='race', index='name_last')\n",
+    "\n",
+    "# Convert NaN to 0: a missing cell means no one with that last name is recorded under that race\n",
+    "gdf = gdf.fillna(0)\n",
+    "\n",
+    "gdf['total_n'] = gdf.sum(axis=1)\n",
+    "gdf.reset_index(inplace=True)\n",
+    "gdf.iloc[:, 1:-1] = gdf.iloc[:, 1:-1].div(gdf.total_n, axis=0)\n",
+    "gdf"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
-   "id": "98ff85be-2f1e-4621-8aa8-11eb3965ecd4",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 7,
+   "id": "25ebef7d-1af3-4039-8518-5ccef5f07c7c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['asian', 'hispanic', 'nh_black', 'nh_white', 'other']\n"
+     ]
+    }
+   ],
+   "source": [
+    "races = sorted(sdf.race.unique().tolist())\n",
+    "print(races)\n",
+    "\n",
+    "def get_race_idx(val, races):\n",
+    "    race_idx = races.index(val)\n",
+    "    return race_idx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "55c26fd4-55b4-4c9e-b918-06f124d4691c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For one set of analyses, we define the 'true' race/ethnicity of a name as the category with the highest proportion (i.e., modal race = true race)\n",
+    "gdf['race'] = gdf[races].idxmax(axis=1)\n",
+    "gdf['race_code'] = gdf['race'].apply(lambda c: get_race_idx(c,races))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "8704cd24-c082-453d-90cc-acd0101f6fd7",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "df['race_code'] = df.race.factorize()[0]"
+    "gdf.to_csv(\"train_validation_test/fl_2022_lastname.csv.gz\", index = False, compression=\"gzip\")"
    ]
   },
   {
@@ -1143,17 +347,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": 11,
    "id": "3a11b216-fde5-48c7-b5ec-904dbde4bb29",
    "metadata": {},
    "outputs": [],
    "source": [
-    "train_df, rest_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['race_code'])"
+    "train_df, rest_df = train_test_split(gdf, test_size=0.2, random_state=42, stratify=gdf['race_code'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 12,
    "id": "f7d44e05-63d5-47cc-85db-c59fc3e169f1",
    "metadata": {
     "tags": []
@@ -1165,7 +369,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": 13,
    "id": "e47fa000-d58f-4360-9487-bd33c149433e",
    "metadata": {
     "tags": []
@@ -1179,7 +383,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 14,
    "id": "fce465b6-0a17-437d-ad38-7d8036c74d0e",
    "metadata": {
     "tags": []
@@ -1189,9 +393,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "(1079199, 4)\n",
-      "(134900, 4)\n",
-      "(134900, 4)\n"
+      "(845316, 9)\n",
+      "(105664, 9)\n",
+      "(105665, 9)\n"
      ]
     }
    ],
@@ -1203,87 +407,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 61,
-   "id": "c30f722b-995b-4269-bac7-38bc4ba64999",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name_last</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>race</th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>asian</th>\n",
-       "      <td>68672</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>hispanic</th>\n",
-       "      <td>389609</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>nh_black</th>\n",
-       "      <td>137271</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>nh_white</th>\n",
-       "      <td>609707</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>other</th>\n",
-       "      <td>115218</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "          name_last\n",
-       "race               \n",
-       "asian         68672\n",
-       "hispanic     389609\n",
-       "nh_black     137271\n",
-       "nh_white     609707\n",
-       "other        115218"
-      ]
-     },
-     "execution_count": 61,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.groupby('race').agg({'name_last':'nunique'})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 15,
    "id": "66f97f4e-4fda-44bc-8d3c-65fbe6eea9b6",
    "metadata": {
     "tags": []
@@ -1309,7 +433,7 @@
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
+       "      <th>race</th>\n",
        "      <th>name_last</th>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1320,39 +444,39 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>asian</th>\n",
-       "      <td>54938</td>\n",
+       "      <td>29184</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>hispanic</th>\n",
-       "      <td>311687</td>\n",
+       "      <td>259689</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_black</th>\n",
-       "      <td>109817</td>\n",
+       "      <td>83227</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_white</th>\n",
-       "      <td>487765</td>\n",
+       "      <td>450098</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>other</th>\n",
-       "      <td>95708</td>\n",
+       "      <td>23118</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "          name_last\n",
+       "race      name_last\n",
        "race               \n",
-       "asian         54938\n",
-       "hispanic     311687\n",
-       "nh_black     109817\n",
-       "nh_white     487765\n",
-       "other         95708"
+       "asian         29184\n",
+       "hispanic     259689\n",
+       "nh_black      83227\n",
+       "nh_white     450098\n",
+       "other         23118"
       ]
      },
-     "execution_count": 62,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1363,7 +487,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": 16,
    "id": "7d9a625b-fd10-4374-b744-e809620c86d5",
    "metadata": {
     "tags": []
@@ -1389,7 +513,7 @@
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
+       "      <th>race</th>\n",
        "      <th>name_last</th>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1400,39 +524,39 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>asian</th>\n",
-       "      <td>6867</td>\n",
+       "      <td>3648</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>hispanic</th>\n",
-       "      <td>38961</td>\n",
+       "      <td>32461</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_black</th>\n",
-       "      <td>13727</td>\n",
+       "      <td>10403</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_white</th>\n",
-       "      <td>60971</td>\n",
+       "      <td>56262</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>other</th>\n",
-       "      <td>14068</td>\n",
+       "      <td>2890</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "          name_last\n",
+       "race      name_last\n",
        "race               \n",
-       "asian          6867\n",
-       "hispanic      38961\n",
-       "nh_black      13727\n",
-       "nh_white      60971\n",
-       "other         14068"
+       "asian          3648\n",
+       "hispanic      32461\n",
+       "nh_black      10403\n",
+       "nh_white      56262\n",
+       "other          2890"
       ]
      },
-     "execution_count": 63,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1443,7 +567,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": 17,
    "id": "e61fc0f4-1b6c-42a0-a1d5-cf7cbff8a290",
    "metadata": {
     "tags": []
@@ -1469,7 +593,7 @@
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
+       "      <th>race</th>\n",
        "      <th>name_last</th>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1480,39 +604,39 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>asian</th>\n",
-       "      <td>6867</td>\n",
+       "      <td>3648</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>hispanic</th>\n",
-       "      <td>38961</td>\n",
+       "      <td>32461</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_black</th>\n",
-       "      <td>13727</td>\n",
+       "      <td>10404</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>nh_white</th>\n",
-       "      <td>60971</td>\n",
+       "      <td>56262</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>other</th>\n",
-       "      <td>14060</td>\n",
+       "      <td>2890</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "          name_last\n",
+       "race      name_last\n",
        "race               \n",
-       "asian          6867\n",
-       "hispanic      38961\n",
-       "nh_black      13727\n",
-       "nh_white      60971\n",
-       "other         14060"
+       "asian          3648\n",
+       "hispanic      32461\n",
+       "nh_black      10404\n",
+       "nh_white      56262\n",
+       "other          2890"
       ]
      },
-     "execution_count": 64,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1531,7 +655,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": 18,
    "id": "ecc12a2c-bee0-49bd-b42d-ab8cb5589a15",
    "metadata": {
     "tags": []
@@ -1543,32 +667,10 @@
     "test_df.to_csv(\"data/fl_2022_LastName_test.csv.gz\",index=False,compression=\"gzip\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 66,
-   "id": "aa9be3b5-ee0d-4935-9b21-14012c676235",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1.3M\tdata/fl_2022_LastName_test.csv.gz\n",
-      "11M\tdata/fl_2022_LastName_train.csv.gz\n",
-      "1.3M\tdata/fl_2022_LastName_val.csv.gz\n"
-     ]
-    }
-   ],
-   "source": [
-    "!du -sh data/fl_2022_LastName_*"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2fb93bc9-448d-45e0-a976-9312bf94e708",
+   "id": "6b389e66-ee18-4ee6-b3d1-87c73859189c",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -1590,7 +692,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.8.10"
   }
  },
  "nbformat": 4,

	name_last	name_first	race
0	Hessler-Smith	Jason	nh_white
1	Rogers	Renee	nh_white
2	Bartolome	Crystal	nh_white
3	Bailey	Donna	nh_white
4	Carlson	Greggory	nh_white
	name_last	name_first	race
count	15454992	15455022	15455110
unique	1341195	641103	8
top	Smith	Michael	nh_white
freq	79362	153753	9446851
	name_last	name_first	race	full_name	asian	hispanic	nh_black	nh_white	other	total_n
count	15454908	15454908	15454908
unique	1341176	641095	8
top	Smith	Michael	nh_white	0	A Arup Erik	0.0	0.0	0.0	1.0	0.0	1.0
freq	79362	153753	9446749
	name_last	name_first	race	1	A Bitang Ahmad	0.0	0.0	1.0	0.0	0.0	1.0
count	15366690	15366690	15366690	2	A De Feria Graciela	0.0	1.0	0.0	0.0	0.0	1.0
unique	1340617	641055	8	3	A F R Stephenson John Alexander	0.0	0.0	0.0	1.0	0.0	1.0
top	Smith	Michael	nh_white	4	A Felix Noehmi	0.0	1.0	0.0	0.0	0.0	1.0
freq	79297	153752	9383680
	name_last	name_first	race	full_name
837	Moser	Patricia	nh_white	Moser Patricia
928	Johnson	Tiffany	nh_black	Johnson Tiffany
1247	Perry	Charles	nh_white	Perry Charles
2120	Johnson	Ashley	nh_black	Johnson Ashley
2285	Johnson	Clayton	nh_white	Johnson Clayton
...	...	...	...	...
15455104	Ballentine	Robert	nh_white	Ballentine Robert
15455106	Watts	Mark	nh_white	Watts Mark
15455107	Mcrae	Evelyn	nh_white	Mcrae Evelyn
15455108	Ward	Stephanie	nh_white	Ward Stephanie
15455109	Edenfield	Marcus	nh_white	Edenfield Marcus
	name_last	name_first	race	full_name
136	Porter	Paula	nh_white	Porter Paula
550	Porter	Paula	nh_black	Porter Paula
263636	Porter	Paula	nh_white	Porter Paula
1527456	Porter	Paula	nh_white	Porter Paula
7563599	Porter	Paula	nh_white	Porter Paula
7631191	Porter	Paula	nh_white	Porter Paula
8383292	Porter	Paula	nh_white	Porter Paula
8945658	Porter	Paula	nh_white	Porter Paula
9402546	Porter	Paula	nh_white	Porter Paula
10682106	Porter	Paula	nh_white	Porter Paula
12427420	Porter	Paula	nh_white	Porter Paula
12731429	Porter	Paula	nh_white	Porter Paula
14637476	Porter	Paula	nh_white	Porter Paula
	race	full_name
asian	hispanic	nh_black	nh_white	other	total_n	race		race_code
asian	278290	0	A Arup Erik	0.0	0.0	0.0	1.0	0.0	1.0	nh_white	3
hispanic	1690573	1	A Bitang Ahmad	0.0	0.0	1.0	0.0	0.0	1.0	nh_black	2
nh_black	1492989	2	A De Feria Graciela	0.0	1.0	0.0	0.0	0.0	1.0	hispanic	1
nh_white	5734701	3	A F R Stephenson John Alexander	0.0	0.0	0.0	1.0	0.0	1.0	nh_white	3
other	390648	4	A Felix Noehmi	0.0	1.0	0.0	0.0	0.0	1.0	hispanic	1
	race	full_name
asian	222632	206042
hispanic	1352458	1308198
nh_black	1194391	1067770
nh_white	4587761	4421898
other	314216	210986
	race	full_name
asian	27829	25755
hispanic	169057	163525
nh_black	149299	133471
nh_white	573470	552738
other	40061	26373
	race	full_name
asian	27829	25756
hispanic	169058	163525
nh_black	149299	133471
nh_white	573470	552737
other	40068	26373