diff --git a/.gitignore b/.gitignore
index 629a99d..54172d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,6 @@ models/rf_vec_fullname_2M.joblib
models/rf_vec_lastname.joblib
models/rf_vec_lastname_1M.joblib
models/rf_vec_lastname_2M.joblib
+ms/icwsm/name_race.aux
+ms/icwsm/name_race.bbl
+ms/icwsm/name_race.blg
diff --git a/notebooks/0.1_data_preprocessing_FullName.ipynb b/notebooks/0.1_data_preprocessing_FullName.ipynb
index 55afab9..1b0b6bd 100644
--- a/notebooks/0.1_data_preprocessing_FullName.ipynb
+++ b/notebooks/0.1_data_preprocessing_FullName.ipynb
@@ -1,258 +1,116 @@
{
"cells": [
{
- "cell_type": "code",
- "execution_count": 1,
- "id": "71a4990d-d590-4f49-9d25-97982e6d58c0",
+ "cell_type": "markdown",
+ "id": "ce92ae6b-1cfb-47f9-b947-55f2448b7500",
"metadata": {
"tags": []
},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import pandas as pd\n",
- "\n",
- "from sklearn.model_selection import train_test_split"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "78ee21e5-efa5-4c7f-b2cc-32f8df219d08",
- "metadata": {},
"source": [
- "# Preprocessing data"
+ "### Full Name Dataset (Train/Validation/Test)"
]
},
{
"cell_type": "code",
- "execution_count": 45,
- "id": "af05aa15-8218-4086-9626-adadd2552183",
- "metadata": {
- "tags": []
- },
+ "execution_count": 1,
+ "id": "a34fa1c0-cf2e-464a-bc56-73a4f7a38a55",
+ "metadata": {},
"outputs": [],
"source": [
- "df = pd.read_csv('./data/fl_reg_name_race_2022.csv.gz')"
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
- "execution_count": 46,
- "id": "05503af7-99a2-4cf2-bdd4-b25d9015aa92",
+ "execution_count": 2,
+ "id": "af05aa15-8218-4086-9626-adadd2552183",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Hessler-Smith | \n",
- " Jason | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Rogers | \n",
- " Renee | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Bartolome | \n",
- " Crystal | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Bailey | \n",
- " Donna | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Carlson | \n",
- " Greggory | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
"text/plain": [
- " name_last name_first race\n",
- "0 Hessler-Smith Jason nh_white\n",
- "1 Rogers Renee nh_white\n",
- "2 Bartolome Crystal nh_white\n",
- "3 Bailey Donna nh_white\n",
- "4 Carlson Greggory nh_white"
+ "(15455110, 3)"
]
},
- "execution_count": 46,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df.head()"
+ "df = pd.read_csv('data/fl_reg_name_race_2022.csv.gz')\n",
+ "df.shape"
]
},
{
"cell_type": "code",
- "execution_count": 47,
- "id": "b6b123c8-70ba-4ab4-841c-b5486a1ba69a",
+ "execution_count": 3,
+ "id": "05503af7-99a2-4cf2-bdd4-b25d9015aa92",
"metadata": {
"tags": []
},
"outputs": [
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " count | \n",
- " 15454992 | \n",
- " 15455022 | \n",
- " 15455110 | \n",
- "
\n",
- " \n",
- " unique | \n",
- " 1341195 | \n",
- " 641103 | \n",
- " 8 | \n",
- "
\n",
- " \n",
- " top | \n",
- " Smith | \n",
- " Michael | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " freq | \n",
- " 79362 | \n",
- " 153753 | \n",
- " 9446851 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race\n",
- "count 15454992 15455022 15455110\n",
- "unique 1341195 641103 8\n",
- "top Smith Michael nh_white\n",
- "freq 79362 153753 9446851"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Size after dropping missing first and last names: (15454979, 3)\n",
+ "Size after dropping unknown: (15009244, 3)\n",
+ "Size after dropping last names less than 2 chars: (14933334, 3)\n"
+ ]
}
],
"source": [
- "df.describe()"
+ "# Remove NA first/last\n",
+ "df.dropna(subset=['name_first', 'name_last'], inplace=True)\n",
+ "print(\"Size after dropping missing first and last names:\", df.shape)\n",
+ "\n",
+ "# Drop rows whose race is 'unknown'; we assume these are missing at random\n",
+ "sdf = df[df.race.isin(['unknown']) == False]\n",
+ "print(\"Size after dropping unknown:\", sdf.shape)\n",
+ "del df\n",
+ "\n",
+ "# Drop cases where last name is less than 2 chars\n",
+ "sdf = sdf.drop(sdf[sdf['name_last'].str.len() < 2].index)\n",
+ "print(\"Size after dropping last names less than 2 chars:\", sdf.shape)"
]
},
{
"cell_type": "code",
- "execution_count": 48,
- "id": "33b5941c-619c-485f-8767-27cdfa71ab27",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['nh_white', 'nh_black', 'other', 'hispanic', 'asian',\n",
- " 'native_indian', 'unknown', 'multi_racial'], dtype=object)"
- ]
- },
- "execution_count": 48,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df['race'].unique()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "65f441b5-3c7d-48f2-b6ba-352e897a72ae",
+ "execution_count": 4,
+ "id": "98a49e48-1dfc-4d1b-ad98-874ce9559d0d",
"metadata": {},
+ "outputs": [],
"source": [
- "## Drop None Values"
+ "# Normalize names (strip whitespace, title-case) and build full_name as 'Last First'\n",
+ "sdf['name_first'] = sdf.name_first.str.strip().str.title()\n",
+ "sdf['name_last'] = sdf.name_last.str.strip().str.title()\n",
+ "sdf['full_name'] = sdf['name_last'] + ' ' + sdf['name_first']\n",
+ "# Remove every character other than ASCII letters, apostrophes, spaces, and hyphens\n",
+ "sdf['full_name'] = sdf['full_name'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)"
]
},
{
"cell_type": "code",
- "execution_count": 49,
- "id": "0522d1ca-b9bc-4028-b3f9-58ea8d143357",
- "metadata": {
- "tags": []
- },
+ "execution_count": 5,
+ "id": "9bfb3b34-bdea-4c60-bff9-ffb0a63aa265",
+ "metadata": {},
"outputs": [],
"source": [
- "df.dropna(subset=['name_first', 'name_last'], inplace=True)"
+ "# Recode race: fold 'multi_racial' and 'native_indian' into 'other'\n",
+ "mapping = {'multi_racial': 'other', 'native_indian': 'other'}\n",
+ "sdf['race'] = sdf['race'].replace(mapping)"
]
},
{
"cell_type": "code",
- "execution_count": 50,
- "id": "49bd34b0-6543-48bd-b0cd-99cb15bf2569",
- "metadata": {
- "tags": []
- },
+ "execution_count": 6,
+ "id": "041729a5-8518-405f-9cd5-5e6e1869cc2f",
+ "metadata": {},
"outputs": [
{
"data": {
@@ -274,796 +132,223 @@
"\n",
" \n",
" \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
" race | \n",
+ " full_name | \n",
+ " asian | \n",
+ " hispanic | \n",
+ " nh_black | \n",
+ " nh_white | \n",
+ " other | \n",
+ " total_n | \n",
"
\n",
" \n",
" \n",
" \n",
- " count | \n",
- " 15454908 | \n",
- " 15454908 | \n",
- " 15454908 | \n",
- "
\n",
- " \n",
- " unique | \n",
- " 1341176 | \n",
- " 641095 | \n",
- " 8 | \n",
- "
\n",
- " \n",
- " top | \n",
- " Smith | \n",
- " Michael | \n",
- " nh_white | \n",
+ " 0 | \n",
+ " A Arup Erik | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
"
\n",
" \n",
- " freq | \n",
- " 79362 | \n",
- " 153753 | \n",
- " 9446749 | \n",
- "
\n",
- " \n",
- "
\n",
- ""
- ],
- "text/plain": [
- " name_last name_first race\n",
- "count 15454908 15454908 15454908\n",
- "unique 1341176 641095 8\n",
- "top Smith Michael nh_white\n",
- "freq 79362 153753 9446749"
- ]
- },
- "execution_count": 50,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.describe()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5a1c7014-99e8-4b29-b7e5-98ba16caa3b3",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Drop Last name and first name of length 1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "id": "42f1fd6c-ef4f-4538-bb6c-286db30c250a",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df = df.drop(df[df['name_last'].str.len() < 2].index)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "id": "1fc0d6fd-5ffc-4190-9a11-314f9c34535a",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df = df.drop(df[df['name_first'].str.len() < 2].index)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "id": "02479d63-5cf8-43f3-a208-9119df5b5457",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
+ " 1 | \n",
+ " A Bitang Ahmad | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
"
\n",
- " \n",
- " \n",
" \n",
- " count | \n",
- " 15366690 | \n",
- " 15366690 | \n",
- " 15366690 | \n",
+ " 2 | \n",
+ " A De Feria Graciela | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
"
\n",
" \n",
- " unique | \n",
- " 1340617 | \n",
- " 641055 | \n",
- " 8 | \n",
+ " 3 | \n",
+ " A F R Stephenson John Alexander | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
"
\n",
" \n",
- " top | \n",
- " Smith | \n",
- " Michael | \n",
- " nh_white | \n",
+ " 4 | \n",
+ " A Felix Noehmi | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
"
\n",
" \n",
- " freq | \n",
- " 79297 | \n",
- " 153752 | \n",
- " 9383680 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race\n",
- "count 15366690 15366690 15366690\n",
- "unique 1340617 641055 8\n",
- "top Smith Michael nh_white\n",
- "freq 79297 153752 9383680"
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.describe()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "058f71b5-0798-4ceb-89d7-e241000b7e1f",
- "metadata": {},
- "source": [
- "## Make all names title case"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "id": "313b98b0-89f3-4ed8-9dd2-dad9bee428b1",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df['name_first'] = df['name_first'].str.title()\n",
- "df['name_last'] = df['name_last'].str.title()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6a7d0983-9883-4777-a29e-6b9c913f9271",
- "metadata": {},
- "source": [
- "## Remove Special Characters"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "id": "8c415ea0-1763-4fc2-b025-ba3cb5f7b786",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df['full_name'] = df['name_last'] + ' ' + df['name_first']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "id": "12dea730-e4dc-4bbd-91f6-286abf4c2fee",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df['full_name'] = df['full_name'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "id": "7d693324-de62-4ea3-8edd-ed3892f52a2f",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- " full_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Hessler-Smith | \n",
- " Jason | \n",
- " nh_white | \n",
- " Hessler-Smith Jason | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Rogers | \n",
- " Renee | \n",
- " nh_white | \n",
- " Rogers Renee | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Bartolome | \n",
- " Crystal | \n",
- " nh_white | \n",
- " Bartolome Crystal | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Bailey | \n",
- " Donna | \n",
- " nh_white | \n",
- " Bailey Donna | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Carlson | \n",
- " Greggory | \n",
- " nh_white | \n",
- " Carlson Greggory | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race full_name\n",
- "0 Hessler-Smith Jason nh_white Hessler-Smith Jason\n",
- "1 Rogers Renee nh_white Rogers Renee\n",
- "2 Bartolome Crystal nh_white Bartolome Crystal\n",
- "3 Bailey Donna nh_white Bailey Donna\n",
- "4 Carlson Greggory nh_white Carlson Greggory"
- ]
- },
- "execution_count": 58,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7ce1807d-f228-482d-a2d8-43211e2806c1",
- "metadata": {},
- "source": [
- "## Drop duplicates"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "id": "98363907-538a-4255-9cd6-35846c58d044",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- " full_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 837 | \n",
- " Moser | \n",
- " Patricia | \n",
- " nh_white | \n",
- " Moser Patricia | \n",
- "
\n",
- " \n",
- " 928 | \n",
- " Johnson | \n",
- " Tiffany | \n",
- " nh_black | \n",
- " Johnson Tiffany | \n",
- "
\n",
- " \n",
- " 1247 | \n",
- " Perry | \n",
- " Charles | \n",
- " nh_white | \n",
- " Perry Charles | \n",
- "
\n",
- " \n",
- " 2120 | \n",
- " Johnson | \n",
- " Ashley | \n",
- " nh_black | \n",
- " Johnson Ashley | \n",
- "
\n",
- " \n",
- " 2285 | \n",
- " Johnson | \n",
- " Clayton | \n",
- " nh_white | \n",
- " Johnson Clayton | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 15455104 | \n",
- " Ballentine | \n",
- " Robert | \n",
- " nh_white | \n",
- " Ballentine Robert | \n",
- "
\n",
- " \n",
- " 15455106 | \n",
- " Watts | \n",
- " Mark | \n",
- " nh_white | \n",
- " Watts Mark | \n",
- "
\n",
- " \n",
- " 15455107 | \n",
- " Mcrae | \n",
- " Evelyn | \n",
- " nh_white | \n",
- " Mcrae Evelyn | \n",
- "
\n",
- " \n",
- " 15455108 | \n",
- " Ward | \n",
- " Stephanie | \n",
- " nh_white | \n",
- " Ward Stephanie | \n",
- "
\n",
- " \n",
- " 15455109 | \n",
- " Edenfield | \n",
- " Marcus | \n",
- " nh_white | \n",
- " Edenfield Marcus | \n",
- "
\n",
- " \n",
- "
\n",
- "
5364911 rows × 4 columns
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race full_name\n",
- "837 Moser Patricia nh_white Moser Patricia\n",
- "928 Johnson Tiffany nh_black Johnson Tiffany\n",
- "1247 Perry Charles nh_white Perry Charles\n",
- "2120 Johnson Ashley nh_black Johnson Ashley\n",
- "2285 Johnson Clayton nh_white Johnson Clayton\n",
- "... ... ... ... ...\n",
- "15455104 Ballentine Robert nh_white Ballentine Robert\n",
- "15455106 Watts Mark nh_white Watts Mark\n",
- "15455107 Mcrae Evelyn nh_white Mcrae Evelyn\n",
- "15455108 Ward Stephanie nh_white Ward Stephanie\n",
- "15455109 Edenfield Marcus nh_white Edenfield Marcus\n",
- "\n",
- "[5364911 rows x 4 columns]"
- ]
- },
- "execution_count": 59,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df[['full_name','race']].duplicated()]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "id": "5dcbb0d6-ede7-4b20-af15-4466f04d3fcd",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- " full_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 136 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 550 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_black | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 263636 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 1527456 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 7563599 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 7631191 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 8383292 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 8945658 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 9402546 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 10682106 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 12427420 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 12731429 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 14637476 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race full_name\n",
- "136 Porter Paula nh_white Porter Paula\n",
- "550 Porter Paula nh_black Porter Paula\n",
- "263636 Porter Paula nh_white Porter Paula\n",
- "1527456 Porter Paula nh_white Porter Paula\n",
- "7563599 Porter Paula nh_white Porter Paula\n",
- "7631191 Porter Paula nh_white Porter Paula\n",
- "8383292 Porter Paula nh_white Porter Paula\n",
- "8945658 Porter Paula nh_white Porter Paula\n",
- "9402546 Porter Paula nh_white Porter Paula\n",
- "10682106 Porter Paula nh_white Porter Paula\n",
- "12427420 Porter Paula nh_white Porter Paula\n",
- "12731429 Porter Paula nh_white Porter Paula\n",
- "14637476 Porter Paula nh_white Porter Paula"
- ]
- },
- "execution_count": 60,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['full_name'] == \"Porter Paula\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "id": "68f7146a-45e7-40c6-8f2a-9e50bd914743",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df = df.drop_duplicates(['full_name','race'],keep= 'last')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "id": "ed066729-7970-47ea-ad65-9842bc71366a",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- " full_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 550 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_black | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- " 14637476 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- " Porter Paula | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race full_name\n",
- "550 Porter Paula nh_black Porter Paula\n",
- "14637476 Porter Paula nh_white Porter Paula"
- ]
- },
- "execution_count": 62,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['full_name'] == \"Porter Paula\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "id": "0d2504cf-d9f9-4eff-a53f-fa859f686ed1",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " \n",
+ " \n",
+ " 9018613 | \n",
+ " Zyzdryn Krzysztof | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 9018614 | \n",
+ " Zyznomyrsky John | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 9018615 | \n",
+ " Zzaman Md | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 9018616 | \n",
+ " Zzaman Mohammad | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 9018617 | \n",
+ " Zzie Richard | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "\n",
+ "9018618 rows × 7 columns
\n",
+ ""
+ ],
"text/plain": [
- "(10001779, 4)"
+ "race full_name asian hispanic nh_black nh_white \\\n",
+ "0 A Arup Erik 0.0 0.0 0.0 1.0 \n",
+ "1 A Bitang Ahmad 0.0 0.0 1.0 0.0 \n",
+ "2 A De Feria Graciela 0.0 1.0 0.0 0.0 \n",
+ "3 A F R Stephenson John Alexander 0.0 0.0 0.0 1.0 \n",
+ "4 A Felix Noehmi 0.0 1.0 0.0 0.0 \n",
+ "... ... ... ... ... ... \n",
+ "9018613 Zyzdryn Krzysztof 0.0 0.0 0.0 1.0 \n",
+ "9018614 Zyznomyrsky John 0.0 0.0 0.0 1.0 \n",
+ "9018615 Zzaman Md 0.0 0.0 0.0 0.0 \n",
+ "9018616 Zzaman Mohammad 0.0 0.0 0.0 0.0 \n",
+ "9018617 Zzie Richard 0.0 0.0 0.0 1.0 \n",
+ "\n",
+ "race other total_n \n",
+ "0 0.0 1.0 \n",
+ "1 0.0 1.0 \n",
+ "2 0.0 1.0 \n",
+ "3 0.0 1.0 \n",
+ "4 0.0 1.0 \n",
+ "... ... ... \n",
+ "9018613 0.0 1.0 \n",
+ "9018614 0.0 1.0 \n",
+ "9018615 1.0 1.0 \n",
+ "9018616 1.0 1.0 \n",
+ "9018617 0.0 1.0 \n",
+ "\n",
+ "[9018618 rows x 7 columns]"
]
},
- "execution_count": 63,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df.shape"
+ "# Count the number of records for each full name & race combination\n",
+ "gdf = sdf.groupby(['full_name','race'], as_index=False)['race'].agg(['count'])\n",
+ "# Pivot so that each full name has one column per race holding the number of people with that full name\n",
+ "gdf = gdf.pivot_table(values='count', columns='race', index='full_name')\n",
+ "\n",
+ "# Convert NaN to zero, since NaN means no one with that full name identifies with that race\n",
+ "gdf = gdf.fillna(0)\n",
+ "\n",
+ "gdf['total_n'] = gdf.sum(axis=1)\n",
+ "gdf.reset_index(inplace=True)\n",
+ "gdf.iloc[:, 1:-1] = gdf.iloc[:, 1:-1].div(gdf.total_n, axis=0)\n",
+ "\n",
+ "gdf"
]
},
{
"cell_type": "code",
- "execution_count": 64,
- "id": "6351f866-44f2-4c5d-b9ec-ccbfb7278d9c",
- "metadata": {
- "tags": []
- },
+ "execution_count": 7,
+ "id": "3e953fb6-2fe8-4d34-be56-10cf6698dc35",
+ "metadata": {},
"outputs": [
{
- "data": {
- "text/plain": [
- "10001779"
- ]
- },
- "execution_count": 64,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['asian', 'hispanic', 'nh_black', 'nh_white', 'other']\n"
+ ]
}
],
"source": [
- "len(df)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9830ca67-c7a7-4587-bbc1-db7e0f552ae5",
- "metadata": {},
- "source": [
- "## Drop and merge columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 65,
- "id": "8c0468aa-dae8-4671-bbe5-88196c1b0fb1",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "# dropping unknown column\n",
- "df = df.drop(df[df['race'] == 'unknown'].index)"
+ "races = sorted(sdf.race.unique().tolist())\n",
+ "print(races)\n",
+ "\n",
+ "def get_race_idx(val, races):\n",
+ " race_idx = races.index(val)\n",
+ " return race_idx"
]
},
{
"cell_type": "code",
- "execution_count": 66,
- "id": "9d21c416-ea30-48f4-8794-721dd65f6259",
- "metadata": {
- "tags": []
- },
+ "execution_count": 9,
+ "id": "db8c31ec-17d4-464a-8693-754a389c9bf5",
+ "metadata": {},
"outputs": [],
"source": [
- "# combine multi_racial and native_indian to other\n",
- "mapping = {'multi_racial': 'other', 'native_indian': 'other'}\n",
- "df['race'] = df['race'].replace(mapping)"
+ "# For one set of analyses, we define the 'true' race/ethnicity as the category with the highest probability (i.e., modal race = true race)\n",
+ "gdf['race'] = gdf[races].idxmax(axis=1)\n",
+ "gdf['race_code'] = gdf['race'].apply(lambda c: get_race_idx(c,races))"
]
},
{
"cell_type": "code",
- "execution_count": 67,
- "id": "4aeb7f29-01df-4702-bbbe-156924242bb0",
- "metadata": {
- "tags": []
- },
+ "execution_count": 10,
+ "id": "734fc268-8d20-4120-8693-7da140e4c8ec",
+ "metadata": {},
"outputs": [
{
"data": {
@@ -1085,68 +370,116 @@
"\n",
" \n",
" \n",
- " | \n",
+ " race | \n",
" full_name | \n",
- "
\n",
- " \n",
+ " asian | \n",
+ " hispanic | \n",
+ " nh_black | \n",
+ " nh_white | \n",
+ " other | \n",
+ " total_n | \n",
" race | \n",
- " | \n",
+ " race_code | \n",
"
\n",
" \n",
" \n",
" \n",
- " asian | \n",
- " 278290 | \n",
+ " 0 | \n",
+ " A Arup Erik | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " nh_white | \n",
+ " 3 | \n",
"
\n",
" \n",
- " hispanic | \n",
- " 1690573 | \n",
+ " 1 | \n",
+ " A Bitang Ahmad | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " nh_black | \n",
+ " 2 | \n",
"
\n",
" \n",
- " nh_black | \n",
- " 1492989 | \n",
+ " 2 | \n",
+ " A De Feria Graciela | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " hispanic | \n",
+ " 1 | \n",
"
\n",
" \n",
- " nh_white | \n",
- " 5734701 | \n",
+ " 3 | \n",
+ " A F R Stephenson John Alexander | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " nh_white | \n",
+ " 3 | \n",
"
\n",
" \n",
- " other | \n",
- " 390648 | \n",
+ " 4 | \n",
+ " A Felix Noehmi | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " hispanic | \n",
+ " 1 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
- " full_name\n",
- "race \n",
- "asian 278290\n",
- "hispanic 1690573\n",
- "nh_black 1492989\n",
- "nh_white 5734701\n",
- "other 390648"
+ "race full_name asian hispanic nh_black nh_white \\\n",
+ "0 A Arup Erik 0.0 0.0 0.0 1.0 \n",
+ "1 A Bitang Ahmad 0.0 0.0 1.0 0.0 \n",
+ "2 A De Feria Graciela 0.0 1.0 0.0 0.0 \n",
+ "3 A F R Stephenson John Alexander 0.0 0.0 0.0 1.0 \n",
+ "4 A Felix Noehmi 0.0 1.0 0.0 0.0 \n",
+ "\n",
+ "race other total_n race race_code \n",
+ "0 0.0 1.0 nh_white 3 \n",
+ "1 0.0 1.0 nh_black 2 \n",
+ "2 0.0 1.0 hispanic 1 \n",
+ "3 0.0 1.0 nh_white 3 \n",
+ "4 0.0 1.0 hispanic 1 "
]
},
- "execution_count": 67,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df.groupby('race').agg({'full_name':'nunique'})"
+ "gdf.head()"
]
},
{
"cell_type": "code",
- "execution_count": 68,
- "id": "2e4d2735-03e0-4c3c-b049-216c662a84cb",
- "metadata": {
- "tags": []
- },
+ "execution_count": 11,
+ "id": "036bbb11-0d02-45db-801c-03a2873291c5",
+ "metadata": {},
"outputs": [],
"source": [
- "df['race_code'] = df.race.factorize()[0]"
+ "gdf.to_csv(\"train_validation_test/gdf_fullname.csv.gz\", index = False, compression=\"gzip\")"
]
},
{
@@ -1159,29 +492,18 @@
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": 13,
"id": "3a11b216-fde5-48c7-b5ec-904dbde4bb29",
"metadata": {},
"outputs": [],
"source": [
- "train_df, rest_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['race_code'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "id": "f7d44e05-63d5-47cc-85db-c59fc3e169f1",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
+ "train_df, rest_df = train_test_split(gdf, test_size=0.2, random_state=42, stratify=gdf['race_code'])\n",
"val_df, test_df = train_test_split(rest_df, test_size=0.5, random_state=42, stratify=rest_df['race_code'])"
]
},
{
"cell_type": "code",
- "execution_count": 71,
+ "execution_count": 14,
"id": "e47fa000-d58f-4360-9487-bd33c149433e",
"metadata": {
"tags": []
@@ -1195,7 +517,7 @@
},
{
"cell_type": "code",
- "execution_count": 72,
+ "execution_count": 15,
"id": "fce465b6-0a17-437d-ad38-7d8036c74d0e",
"metadata": {
"tags": []
@@ -1205,9 +527,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "(7678780, 5)\n",
- "(959847, 5)\n",
- "(959848, 5)\n"
+ "(7214894, 9)\n",
+ "(901862, 9)\n",
+ "(901862, 9)\n"
]
}
],
@@ -1219,87 +541,7 @@
},
{
"cell_type": "code",
- "execution_count": 73,
- "id": "c30f722b-995b-4269-bac7-38bc4ba64999",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " full_name | \n",
- "
\n",
- " \n",
- " race | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " asian | \n",
- " 278290 | \n",
- "
\n",
- " \n",
- " hispanic | \n",
- " 1690573 | \n",
- "
\n",
- " \n",
- " nh_black | \n",
- " 1492989 | \n",
- "
\n",
- " \n",
- " nh_white | \n",
- " 5734701 | \n",
- "
\n",
- " \n",
- " other | \n",
- " 390648 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " full_name\n",
- "race \n",
- "asian 278290\n",
- "hispanic 1690573\n",
- "nh_black 1492989\n",
- "nh_white 5734701\n",
- "other 390648"
- ]
- },
- "execution_count": 73,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.groupby('race').agg({'full_name':'nunique'})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 74,
+ "execution_count": 16,
"id": "66f97f4e-4fda-44bc-8d3c-65fbe6eea9b6",
"metadata": {
"tags": []
@@ -1325,7 +567,7 @@
"\n",
" \n",
" \n",
- " | \n",
+ " race | \n",
" full_name | \n",
"
\n",
" \n",
@@ -1336,39 +578,39 @@
"
\n",
" \n",
" asian | \n",
- " 222632 | \n",
+ " 206042 | \n",
"
\n",
" \n",
" hispanic | \n",
- " 1352458 | \n",
+ " 1308198 | \n",
"
\n",
" \n",
" nh_black | \n",
- " 1194391 | \n",
+ " 1067770 | \n",
"
\n",
" \n",
" nh_white | \n",
- " 4587761 | \n",
+ " 4421898 | \n",
"
\n",
" \n",
" other | \n",
- " 314216 | \n",
+ " 210986 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
- " full_name\n",
+ "race full_name\n",
"race \n",
- "asian 222632\n",
- "hispanic 1352458\n",
- "nh_black 1194391\n",
- "nh_white 4587761\n",
- "other 314216"
+ "asian 206042\n",
+ "hispanic 1308198\n",
+ "nh_black 1067770\n",
+ "nh_white 4421898\n",
+ "other 210986"
]
},
- "execution_count": 74,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -1379,7 +621,7 @@
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": 17,
"id": "7d9a625b-fd10-4374-b744-e809620c86d5",
"metadata": {
"tags": []
@@ -1405,7 +647,7 @@
"\n",
" \n",
" \n",
- " | \n",
+ " race | \n",
" full_name | \n",
"
\n",
" \n",
@@ -1416,39 +658,39 @@
"
\n",
" \n",
" asian | \n",
- " 27829 | \n",
+ " 25755 | \n",
"
\n",
" \n",
" hispanic | \n",
- " 169057 | \n",
+ " 163525 | \n",
"
\n",
" \n",
" nh_black | \n",
- " 149299 | \n",
+ " 133471 | \n",
"
\n",
" \n",
" nh_white | \n",
- " 573470 | \n",
+ " 552738 | \n",
"
\n",
" \n",
" other | \n",
- " 40061 | \n",
+ " 26373 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
- " full_name\n",
+ "race full_name\n",
"race \n",
- "asian 27829\n",
- "hispanic 169057\n",
- "nh_black 149299\n",
- "nh_white 573470\n",
- "other 40061"
+ "asian 25755\n",
+ "hispanic 163525\n",
+ "nh_black 133471\n",
+ "nh_white 552738\n",
+ "other 26373"
]
},
- "execution_count": 75,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -1459,7 +701,7 @@
},
{
"cell_type": "code",
- "execution_count": 76,
+ "execution_count": 18,
"id": "e61fc0f4-1b6c-42a0-a1d5-cf7cbff8a290",
"metadata": {
"tags": []
@@ -1485,7 +727,7 @@
"\n",
" \n",
" \n",
- " | \n",
+ " race | \n",
" full_name | \n",
"
\n",
" \n",
@@ -1496,39 +738,39 @@
"
\n",
" \n",
" asian | \n",
- " 27829 | \n",
+ " 25756 | \n",
"
\n",
" \n",
" hispanic | \n",
- " 169058 | \n",
+ " 163525 | \n",
"
\n",
" \n",
" nh_black | \n",
- " 149299 | \n",
+ " 133471 | \n",
"
\n",
" \n",
" nh_white | \n",
- " 573470 | \n",
+ " 552737 | \n",
"
\n",
" \n",
" other | \n",
- " 40068 | \n",
+ " 26373 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
- " full_name\n",
+ "race full_name\n",
"race \n",
- "asian 27829\n",
- "hispanic 169058\n",
- "nh_black 149299\n",
- "nh_white 573470\n",
- "other 40068"
+ "asian 25756\n",
+ "hispanic 163525\n",
+ "nh_black 133471\n",
+ "nh_white 552737\n",
+ "other 26373"
]
},
- "execution_count": 76,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -1547,143 +789,22 @@
},
{
"cell_type": "code",
- "execution_count": 77,
+ "execution_count": 19,
"id": "ecc12a2c-bee0-49bd-b42d-ab8cb5589a15",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
- "train_df.to_csv(\"data/fl_2022_FullName_train.csv.gz\",index=False,compression=\"gzip\")\n",
- "val_df.to_csv(\"data/fl_2022_FullName_val.csv.gz\",index=False,compression=\"gzip\")\n",
- "test_df.to_csv(\"data/fl_2022_FullName_test.csv.gz\",index=False,compression=\"gzip\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 78,
- "id": "aa9be3b5-ee0d-4935-9b21-14012c676235",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "13M\tdata/fl_2022_FullName_test.csv.gz\n",
- "101M\tdata/fl_2022_FullName_train.csv.gz\n",
- "13M\tdata/fl_2022_FullName_val.csv.gz\n"
- ]
- }
- ],
- "source": [
- "!du -sh data/fl_2022_FullName_*"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 79,
- "id": "2fb93bc9-448d-45e0-a976-9312bf94e708",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- " full_name | \n",
- " race_code | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Baxla | \n",
- " Phyllis | \n",
- " nh_white | \n",
- " Baxla Phyllis | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Ludwin | \n",
- " Ron | \n",
- " nh_white | \n",
- " Ludwin Ron | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Signer Welton | \n",
- " Jessica | \n",
- " nh_white | \n",
- " Signer Welton Jessica | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Stamps | \n",
- " Joshua | \n",
- " nh_white | \n",
- " Stamps Joshua | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Vassell | \n",
- " Lillie | \n",
- " nh_black | \n",
- " Vassell Lillie | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race full_name race_code\n",
- "0 Baxla Phyllis nh_white Baxla Phyllis 0\n",
- "1 Ludwin Ron nh_white Ludwin Ron 0\n",
- "2 Signer Welton Jessica nh_white Signer Welton Jessica 0\n",
- "3 Stamps Joshua nh_white Stamps Joshua 0\n",
- "4 Vassell Lillie nh_black Vassell Lillie 1"
- ]
- },
- "execution_count": 79,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "train_df.head()"
+ "train_df.to_csv(\"data/fl_2022_FullName_train.csv.gz\", index=False, compression=\"gzip\")\n",
+ "val_df.to_csv(\"data/fl_2022_FullName_val.csv.gz\", index=False, compression=\"gzip\")\n",
+ "test_df.to_csv(\"data/fl_2022_FullName_test.csv.gz\", index=False, compression=\"gzip\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "b0a55dbb-c317-4ab8-b4a8-7ff0eb9d645a",
+ "id": "41d34d62-afb2-479e-a21b-040f1eb962ff",
"metadata": {},
"outputs": [],
"source": []
@@ -1705,7 +826,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.8.10"
}
},
"nbformat": 4,
diff --git a/notebooks/0.2_data_preprocessing_LastName.ipynb b/notebooks/0.2_data_preprocessing_LastName.ipynb
index 320c956..e1ecb38 100644
--- a/notebooks/0.2_data_preprocessing_LastName.ipynb
+++ b/notebooks/0.2_data_preprocessing_LastName.ipynb
@@ -1,969 +1,32 @@
{
"cells": [
{
- "cell_type": "code",
- "execution_count": 34,
- "id": "71a4990d-d590-4f49-9d25-97982e6d58c0",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import pandas as pd\n",
- "\n",
- "from sklearn.model_selection import train_test_split"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "78ee21e5-efa5-4c7f-b2cc-32f8df219d08",
- "metadata": {},
- "source": [
- "# Preprocessing data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "af05aa15-8218-4086-9626-adadd2552183",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df = pd.read_csv('./data/fl_reg_name_race_2022.csv.gz')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "id": "05503af7-99a2-4cf2-bdd4-b25d9015aa92",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Hessler-Smith | \n",
- " Jason | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Rogers | \n",
- " Renee | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Bartolome | \n",
- " Crystal | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Bailey | \n",
- " Donna | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Carlson | \n",
- " Greggory | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race\n",
- "0 Hessler-Smith Jason nh_white\n",
- "1 Rogers Renee nh_white\n",
- "2 Bartolome Crystal nh_white\n",
- "3 Bailey Donna nh_white\n",
- "4 Carlson Greggory nh_white"
- ]
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "b6b123c8-70ba-4ab4-841c-b5486a1ba69a",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " count | \n",
- " 15454992 | \n",
- " 15455022 | \n",
- " 15455110 | \n",
- "
\n",
- " \n",
- " unique | \n",
- " 1341195 | \n",
- " 641103 | \n",
- " 8 | \n",
- "
\n",
- " \n",
- " top | \n",
- " Smith | \n",
- " Michael | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " freq | \n",
- " 79362 | \n",
- " 153753 | \n",
- " 9446851 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race\n",
- "count 15454992 15455022 15455110\n",
- "unique 1341195 641103 8\n",
- "top Smith Michael nh_white\n",
- "freq 79362 153753 9446851"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "33b5941c-619c-485f-8767-27cdfa71ab27",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['nh_white', 'nh_black', 'other', 'hispanic', 'asian',\n",
- " 'native_indian', 'unknown', 'multi_racial'], dtype=object)"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df['race'].unique()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "65f441b5-3c7d-48f2-b6ba-352e897a72ae",
- "metadata": {},
- "source": [
- "## Drop None Values"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "0522d1ca-b9bc-4028-b3f9-58ea8d143357",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df.dropna(subset=['name_first', 'name_last'], inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "49bd34b0-6543-48bd-b0cd-99cb15bf2569",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " count | \n",
- " 15454908 | \n",
- " 15454908 | \n",
- " 15454908 | \n",
- "
\n",
- " \n",
- " unique | \n",
- " 1341176 | \n",
- " 641095 | \n",
- " 8 | \n",
- "
\n",
- " \n",
- " top | \n",
- " Smith | \n",
- " Michael | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " freq | \n",
- " 79362 | \n",
- " 153753 | \n",
- " 9446749 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race\n",
- "count 15454908 15454908 15454908\n",
- "unique 1341176 641095 8\n",
- "top Smith Michael nh_white\n",
- "freq 79362 153753 9446749"
- ]
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.describe()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5a1c7014-99e8-4b29-b7e5-98ba16caa3b3",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Drop Last name and first name of length 1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "42f1fd6c-ef4f-4538-bb6c-286db30c250a",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df = df.drop(df[df['name_last'].str.len() < 2].index)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "1fc0d6fd-5ffc-4190-9a11-314f9c34535a",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df = df.drop(df[df['name_first'].str.len() < 2].index)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "02479d63-5cf8-43f3-a208-9119df5b5457",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " count | \n",
- " 15366690 | \n",
- " 15366690 | \n",
- " 15366690 | \n",
- "
\n",
- " \n",
- " unique | \n",
- " 1340617 | \n",
- " 641055 | \n",
- " 8 | \n",
- "
\n",
- " \n",
- " top | \n",
- " Smith | \n",
- " Michael | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " freq | \n",
- " 79297 | \n",
- " 153752 | \n",
- " 9383680 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race\n",
- "count 15366690 15366690 15366690\n",
- "unique 1340617 641055 8\n",
- "top Smith Michael nh_white\n",
- "freq 79297 153752 9383680"
- ]
- },
- "execution_count": 43,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.describe()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "22f6aa41-af9c-44a7-a9ce-b5faf2c36caf",
- "metadata": {},
- "source": [
- "## Make all names title case"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "id": "96a27d5c-b423-43d8-bb10-fc1e4d4404cb",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df['name_first'] = df['name_first'].str.title()\n",
- "df['name_last'] = df['name_last'].str.title()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6a7d0983-9883-4777-a29e-6b9c913f9271",
- "metadata": {},
- "source": [
- "## Remove Special Characters"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "id": "12dea730-e4dc-4bbd-91f6-286abf4c2fee",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "df['name_last'] = df['name_last'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "id": "7d693324-de62-4ea3-8edd-ed3892f52a2f",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Hessler-Smith | \n",
- " Jason | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Rogers | \n",
- " Renee | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Bartolome | \n",
- " Crystal | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Bailey | \n",
- " Donna | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Carlson | \n",
- " Greggory | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race\n",
- "0 Hessler-Smith Jason nh_white\n",
- "1 Rogers Renee nh_white\n",
- "2 Bartolome Crystal nh_white\n",
- "3 Bailey Donna nh_white\n",
- "4 Carlson Greggory nh_white"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7ce1807d-f228-482d-a2d8-43211e2806c1",
- "metadata": {},
- "source": [
- "## Drop duplicates"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "id": "98363907-538a-4255-9cd6-35846c58d044",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 52 | \n",
- " Gruber | \n",
- " Linda | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 122 | \n",
- " Taylor | \n",
- " Robert | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 127 | \n",
- " Bailey | \n",
- " Pamela | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 138 | \n",
- " Johnson | \n",
- " Ashley | \n",
- " nh_black | \n",
- "
\n",
- " \n",
- " 146 | \n",
- " Mobley | \n",
- " Robert | \n",
- " nh_black | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 15455105 | \n",
- " Ballew | \n",
- " Christina | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 15455106 | \n",
- " Watts | \n",
- " Mark | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 15455107 | \n",
- " Mcrae | \n",
- " Evelyn | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 15455108 | \n",
- " Ward | \n",
- " Stephanie | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 15455109 | \n",
- " Edenfield | \n",
- " Marcus | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- "
\n",
- "
13894849 rows × 3 columns
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race\n",
- "52 Gruber Linda nh_white\n",
- "122 Taylor Robert nh_white\n",
- "127 Bailey Pamela nh_white\n",
- "138 Johnson Ashley nh_black\n",
- "146 Mobley Robert nh_black\n",
- "... ... ... ...\n",
- "15455105 Ballew Christina nh_white\n",
- "15455106 Watts Mark nh_white\n",
- "15455107 Mcrae Evelyn nh_white\n",
- "15455108 Ward Stephanie nh_white\n",
- "15455109 Edenfield Marcus nh_white\n",
- "\n",
- "[13894849 rows x 3 columns]"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df[['name_last','race']].duplicated()]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "5dcbb0d6-ede7-4b20-af15-4466f04d3fcd",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 136 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 550 | \n",
- " Porter | \n",
- " Paula | \n",
- " nh_black | \n",
- "
\n",
- " \n",
- " 7329 | \n",
- " Porter | \n",
- " Wendell | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 7557 | \n",
- " Porter | \n",
- " Anthony | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 9200 | \n",
- " Porter | \n",
- " Kevin | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 15448598 | \n",
- " Porter | \n",
- " William | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 15448772 | \n",
- " Porter | \n",
- " Kyle | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 15451135 | \n",
- " Porter | \n",
- " Jean | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 15451767 | \n",
- " Porter | \n",
- " Annette | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- " 15454870 | \n",
- " Porter | \n",
- " Ashley | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- "
\n",
- "
7451 rows × 3 columns
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race\n",
- "136 Porter Paula nh_white\n",
- "550 Porter Paula nh_black\n",
- "7329 Porter Wendell nh_white\n",
- "7557 Porter Anthony nh_white\n",
- "9200 Porter Kevin nh_white\n",
- "... ... ... ...\n",
- "15448598 Porter William nh_white\n",
- "15448772 Porter Kyle nh_white\n",
- "15451135 Porter Jean nh_white\n",
- "15451767 Porter Annette nh_white\n",
- "15454870 Porter Ashley nh_white\n",
- "\n",
- "[7451 rows x 3 columns]"
- ]
- },
- "execution_count": 48,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['name_last'] == \"Porter\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "id": "68f7146a-45e7-40c6-8f2a-9e50bd914743",
+ "cell_type": "markdown",
+ "id": "6cc055eb-51ad-43c5-aacf-798983f0adfa",
"metadata": {
"tags": []
},
- "outputs": [],
"source": [
- "df = df.drop_duplicates(['name_last','race'],keep= 'last')"
+ "### Last Name Preprocessing (Train/Validation/Test)"
]
},
{
"cell_type": "code",
- "execution_count": 50,
- "id": "ed066729-7970-47ea-ad65-9842bc71366a",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- " name_first | \n",
- " race | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 14952661 | \n",
- " Porter | \n",
- " Marisyd | \n",
- " asian | \n",
- "
\n",
- " \n",
- " 15029071 | \n",
- " Porter | \n",
- " Amber | \n",
- " multi_racial | \n",
- "
\n",
- " \n",
- " 15222442 | \n",
- " Porter | \n",
- " Anna | \n",
- " other | \n",
- "
\n",
- " \n",
- " 15337979 | \n",
- " Porter | \n",
- " Dennis | \n",
- " unknown | \n",
- "
\n",
- " \n",
- " 15369699 | \n",
- " Porter | \n",
- " Lila | \n",
- " native_indian | \n",
- "
\n",
- " \n",
- " 15378779 | \n",
- " Porter | \n",
- " Cristopher | \n",
- " hispanic | \n",
- "
\n",
- " \n",
- " 15438806 | \n",
- " Porter | \n",
- " Orrick | \n",
- " nh_black | \n",
- "
\n",
- " \n",
- " 15454870 | \n",
- " Porter | \n",
- " Ashley | \n",
- " nh_white | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last name_first race\n",
- "14952661 Porter Marisyd asian\n",
- "15029071 Porter Amber multi_racial\n",
- "15222442 Porter Anna other\n",
- "15337979 Porter Dennis unknown\n",
- "15369699 Porter Lila native_indian\n",
- "15378779 Porter Cristopher hispanic\n",
- "15438806 Porter Orrick nh_black\n",
- "15454870 Porter Ashley nh_white"
- ]
- },
- "execution_count": 50,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "execution_count": 1,
+ "id": "722b0940-de98-48ad-96b1-89abf710e3f1",
+ "metadata": {},
+ "outputs": [],
"source": [
- "df[df['name_last'] == \"Porter\"]"
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
- "execution_count": 51,
- "id": "0d2504cf-d9f9-4eff-a53f-fa859f686ed1",
+ "execution_count": 2,
+ "id": "af05aa15-8218-4086-9626-adadd2552183",
"metadata": {
"tags": []
},
@@ -971,83 +34,80 @@
{
"data": {
"text/plain": [
- "(1471841, 3)"
+ "(15455110, 3)"
]
},
- "execution_count": 51,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
+ "df = pd.read_csv('data/fl_reg_name_race_2022.csv.gz')\n",
"df.shape"
]
},
{
"cell_type": "code",
- "execution_count": 52,
- "id": "6351f866-44f2-4c5d-b9ec-ccbfb7278d9c",
- "metadata": {
- "tags": []
- },
+ "execution_count": 3,
+ "id": "5c244d79-6807-4156-b318-10555d85de1e",
+ "metadata": {},
"outputs": [
{
- "data": {
- "text/plain": [
- "1471841"
- ]
- },
- "execution_count": 52,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Size after dropping missing first and last names: (15454979, 3)\n",
+ "Size after dropping unknown: (15009244, 3)\n",
+ "Size after dropping last names less than 2 chars: (14933334, 3)\n"
+ ]
}
],
"source": [
- "len(df)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9830ca67-c7a7-4587-bbc1-db7e0f552ae5",
- "metadata": {},
- "source": [
- "## Drop and merge columns"
+ "# Remove NA first/last\n",
+ "df.dropna(subset=['name_first', 'name_last'], inplace=True)\n",
+ "print(\"Size after dropping missing first and last names:\", df.shape)\n",
+ "\n",
+ "# Drop rows whose race is 'unknown'; we assume these are missing at random\n",
+ "sdf = df[df.race.isin(['unknown']) == False]\n",
+ "print(\"Size after dropping unknown:\", sdf.shape)\n",
+ "del df\n",
+ "\n",
+ "# Drop cases where last name is less than 2 chars\n",
+ "sdf = sdf.drop(sdf[sdf['name_last'].str.len() < 2].index)\n",
+ "print(\"Size after dropping last names less than 2 chars:\", sdf.shape)"
]
},
{
"cell_type": "code",
- "execution_count": 53,
- "id": "8c0468aa-dae8-4671-bbe5-88196c1b0fb1",
+ "execution_count": 4,
+ "id": "96a27d5c-b423-43d8-bb10-fc1e4d4404cb",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
- "# dropping unknown column\n",
- "df = df.drop(df[df['race'] == 'unknown'].index)"
+ "sdf['name_last'] = sdf['name_last'].str.title()\n",
+ "sdf['name_last'] = sdf['name_last'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)"
]
},
{
"cell_type": "code",
- "execution_count": 54,
- "id": "9d21c416-ea30-48f4-8794-721dd65f6259",
- "metadata": {
- "tags": []
- },
+ "execution_count": 5,
+ "id": "76cc90e1-9961-4ea4-bba0-003eb1c9965d",
+ "metadata": {},
"outputs": [],
"source": [
- "# combine multi_racial and native_indian to other\n",
+ "# recode race\n",
"mapping = {'multi_racial': 'other', 'native_indian': 'other'}\n",
- "df['race'] = df['race'].replace(mapping)"
+ "sdf['race'] = sdf['race'].replace(mapping)"
]
},
{
"cell_type": "code",
- "execution_count": 55,
- "id": "4aeb7f29-01df-4702-bbbe-156924242bb0",
- "metadata": {
- "tags": []
- },
+ "execution_count": 6,
+ "id": "bcf7903a-d84e-456f-a060-82ceec27b2a7",
+ "metadata": {},
"outputs": [
{
"data": {
@@ -1069,68 +129,212 @@
"\n",
" \n",
" \n",
- " | \n",
- " name_last | \n",
- "
\n",
- " \n",
" race | \n",
- " | \n",
+ " name_last | \n",
+ " asian | \n",
+ " hispanic | \n",
+ " nh_black | \n",
+ " nh_white | \n",
+ " other | \n",
+ " total_n | \n",
"
\n",
" \n",
" \n",
" \n",
- " asian | \n",
- " 68672 | \n",
+ " 0 | \n",
+ " A Arup | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
"
\n",
" \n",
- " hispanic | \n",
- " 389609 | \n",
+ " 1 | \n",
+ " A Bitang | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
"
\n",
" \n",
- " nh_black | \n",
- " 137271 | \n",
+ " 2 | \n",
+ " A De Feria | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
"
\n",
" \n",
- " nh_white | \n",
- " 609707 | \n",
+ " 3 | \n",
+ " A F R Stephenson | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
"
\n",
" \n",
- " other | \n",
- " 115218 | \n",
+ " 4 | \n",
+ " A Felix | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1056640 | \n",
+ " Zyzanski | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1056641 | \n",
+ " Zyzdryn | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 1056642 | \n",
+ " Zyznomyrsky | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1056643 | \n",
+ " Zzaman | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 1056644 | \n",
+ " Zzie | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
"
\n",
" \n",
"
\n",
+ "1056645 rows × 7 columns
\n",
""
],
"text/plain": [
- " name_last\n",
- "race \n",
- "asian 68672\n",
- "hispanic 389609\n",
- "nh_black 137271\n",
- "nh_white 609707\n",
- "other 115218"
+ "race name_last asian hispanic nh_black nh_white other total_n\n",
+ "0 A Arup 0.0 0.0 0.0 1.0 0.0 1.0\n",
+ "1 A Bitang 0.0 0.0 1.0 0.0 0.0 1.0\n",
+ "2 A De Feria 0.0 1.0 0.0 0.0 0.0 1.0\n",
+ "3 A F R Stephenson 0.0 0.0 0.0 1.0 0.0 1.0\n",
+ "4 A Felix 0.0 1.0 0.0 0.0 0.0 1.0\n",
+ "... ... ... ... ... ... ... ...\n",
+ "1056640 Zyzanski 0.0 0.0 0.0 1.0 0.0 1.0\n",
+ "1056641 Zyzdryn 0.0 0.0 0.0 1.0 0.0 2.0\n",
+ "1056642 Zyznomyrsky 0.0 0.0 0.0 1.0 0.0 1.0\n",
+ "1056643 Zzaman 0.0 0.0 0.0 0.0 1.0 2.0\n",
+ "1056644 Zzie 0.0 0.0 0.0 1.0 0.0 1.0\n",
+ "\n",
+ "[1056645 rows x 7 columns]"
]
},
- "execution_count": 55,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df.groupby('race').agg({'name_last':'nunique'})"
+ "# Count the number of records for each last name & race combination\n",
+ "gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])\n",
+ "# Pivot so that each last name is one row, with one column per race holding the number of people of that race with that last name\n",
+ "gdf = gdf.pivot_table(values='count', columns='race', index='name_last')\n",
+ "\n",
+ "# Convert NaN to zero: a NaN means no one with that last name was recorded with that race\n",
+ "gdf = gdf.fillna(0)\n",
+ "\n",
+ "gdf['total_n'] = gdf.sum(axis=1)\n",
+ "gdf.reset_index(inplace=True)\n",
+ "gdf.iloc[:, 1:-1] = gdf.iloc[:, 1:-1].div(gdf.total_n, axis=0)\n",
+ "gdf"
]
},
{
"cell_type": "code",
- "execution_count": 56,
- "id": "98ff85be-2f1e-4621-8aa8-11eb3965ecd4",
- "metadata": {
- "tags": []
- },
+ "execution_count": 7,
+ "id": "25ebef7d-1af3-4039-8518-5ccef5f07c7c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['asian', 'hispanic', 'nh_black', 'nh_white', 'other']\n"
+ ]
+ }
+ ],
+ "source": [
+ "races = sorted(sdf.race.unique().tolist())\n",
+ "print(races)\n",
+ "\n",
+ "def get_race_idx(val, races):\n",
+ " race_idx = races.index(val)\n",
+ " return race_idx"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "55c26fd4-55b4-4c9e-b918-06f124d4691c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# For one set of analyses, we define the 'true' race/ethnicity of a last name as its modal race (the race with the highest proportion)\n",
+ "gdf['race'] = gdf[races].idxmax(axis=1)\n",
+ "gdf['race_code'] = gdf['race'].apply(lambda c: get_race_idx(c,races))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "8704cd24-c082-453d-90cc-acd0101f6fd7",
+ "metadata": {},
"outputs": [],
"source": [
- "df['race_code'] = df.race.factorize()[0]"
+ "gdf.to_csv(\"train_validation_test/fl_2022_lastname.csv.gz\", index = False, compression=\"gzip\")"
]
},
{
@@ -1143,17 +347,17 @@
},
{
"cell_type": "code",
- "execution_count": 57,
+ "execution_count": 11,
"id": "3a11b216-fde5-48c7-b5ec-904dbde4bb29",
"metadata": {},
"outputs": [],
"source": [
- "train_df, rest_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['race_code'])"
+ "train_df, rest_df = train_test_split(gdf, test_size=0.2, random_state=42, stratify=gdf['race_code'])"
]
},
{
"cell_type": "code",
- "execution_count": 58,
+ "execution_count": 12,
"id": "f7d44e05-63d5-47cc-85db-c59fc3e169f1",
"metadata": {
"tags": []
@@ -1165,7 +369,7 @@
},
{
"cell_type": "code",
- "execution_count": 59,
+ "execution_count": 13,
"id": "e47fa000-d58f-4360-9487-bd33c149433e",
"metadata": {
"tags": []
@@ -1179,7 +383,7 @@
},
{
"cell_type": "code",
- "execution_count": 60,
+ "execution_count": 14,
"id": "fce465b6-0a17-437d-ad38-7d8036c74d0e",
"metadata": {
"tags": []
@@ -1189,9 +393,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "(1079199, 4)\n",
- "(134900, 4)\n",
- "(134900, 4)\n"
+ "(845316, 9)\n",
+ "(105664, 9)\n",
+ "(105665, 9)\n"
]
}
],
@@ -1203,87 +407,7 @@
},
{
"cell_type": "code",
- "execution_count": 61,
- "id": "c30f722b-995b-4269-bac7-38bc4ba64999",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name_last | \n",
- "
\n",
- " \n",
- " race | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " asian | \n",
- " 68672 | \n",
- "
\n",
- " \n",
- " hispanic | \n",
- " 389609 | \n",
- "
\n",
- " \n",
- " nh_black | \n",
- " 137271 | \n",
- "
\n",
- " \n",
- " nh_white | \n",
- " 609707 | \n",
- "
\n",
- " \n",
- " other | \n",
- " 115218 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name_last\n",
- "race \n",
- "asian 68672\n",
- "hispanic 389609\n",
- "nh_black 137271\n",
- "nh_white 609707\n",
- "other 115218"
- ]
- },
- "execution_count": 61,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.groupby('race').agg({'name_last':'nunique'})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
+ "execution_count": 15,
"id": "66f97f4e-4fda-44bc-8d3c-65fbe6eea9b6",
"metadata": {
"tags": []
@@ -1309,7 +433,7 @@
"\n",
" \n",
" \n",
- " | \n",
+ " race | \n",
" name_last | \n",
"
\n",
" \n",
@@ -1320,39 +444,39 @@
"
\n",
" \n",
" asian | \n",
- " 54938 | \n",
+ " 29184 | \n",
"
\n",
" \n",
" hispanic | \n",
- " 311687 | \n",
+ " 259689 | \n",
"
\n",
" \n",
" nh_black | \n",
- " 109817 | \n",
+ " 83227 | \n",
"
\n",
" \n",
" nh_white | \n",
- " 487765 | \n",
+ " 450098 | \n",
"
\n",
" \n",
" other | \n",
- " 95708 | \n",
+ " 23118 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
- " name_last\n",
+ "race name_last\n",
"race \n",
- "asian 54938\n",
- "hispanic 311687\n",
- "nh_black 109817\n",
- "nh_white 487765\n",
- "other 95708"
+ "asian 29184\n",
+ "hispanic 259689\n",
+ "nh_black 83227\n",
+ "nh_white 450098\n",
+ "other 23118"
]
},
- "execution_count": 62,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -1363,7 +487,7 @@
},
{
"cell_type": "code",
- "execution_count": 63,
+ "execution_count": 16,
"id": "7d9a625b-fd10-4374-b744-e809620c86d5",
"metadata": {
"tags": []
@@ -1389,7 +513,7 @@
"\n",
" \n",
" \n",
- " | \n",
+ " race | \n",
" name_last | \n",
"
\n",
" \n",
@@ -1400,39 +524,39 @@
"
\n",
" \n",
" asian | \n",
- " 6867 | \n",
+ " 3648 | \n",
"
\n",
" \n",
" hispanic | \n",
- " 38961 | \n",
+ " 32461 | \n",
"
\n",
" \n",
" nh_black | \n",
- " 13727 | \n",
+ " 10403 | \n",
"
\n",
" \n",
" nh_white | \n",
- " 60971 | \n",
+ " 56262 | \n",
"
\n",
" \n",
" other | \n",
- " 14068 | \n",
+ " 2890 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
- " name_last\n",
+ "race name_last\n",
"race \n",
- "asian 6867\n",
- "hispanic 38961\n",
- "nh_black 13727\n",
- "nh_white 60971\n",
- "other 14068"
+ "asian 3648\n",
+ "hispanic 32461\n",
+ "nh_black 10403\n",
+ "nh_white 56262\n",
+ "other 2890"
]
},
- "execution_count": 63,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -1443,7 +567,7 @@
},
{
"cell_type": "code",
- "execution_count": 64,
+ "execution_count": 17,
"id": "e61fc0f4-1b6c-42a0-a1d5-cf7cbff8a290",
"metadata": {
"tags": []
@@ -1469,7 +593,7 @@
"\n",
" \n",
" \n",
- " | \n",
+ " race | \n",
" name_last | \n",
"
\n",
" \n",
@@ -1480,39 +604,39 @@
"
\n",
" \n",
" asian | \n",
- " 6867 | \n",
+ " 3648 | \n",
"
\n",
" \n",
" hispanic | \n",
- " 38961 | \n",
+ " 32461 | \n",
"
\n",
" \n",
" nh_black | \n",
- " 13727 | \n",
+ " 10404 | \n",
"
\n",
" \n",
" nh_white | \n",
- " 60971 | \n",
+ " 56262 | \n",
"
\n",
" \n",
" other | \n",
- " 14060 | \n",
+ " 2890 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
- " name_last\n",
+ "race name_last\n",
"race \n",
- "asian 6867\n",
- "hispanic 38961\n",
- "nh_black 13727\n",
- "nh_white 60971\n",
- "other 14060"
+ "asian 3648\n",
+ "hispanic 32461\n",
+ "nh_black 10404\n",
+ "nh_white 56262\n",
+ "other 2890"
]
},
- "execution_count": 64,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -1531,7 +655,7 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": 18,
"id": "ecc12a2c-bee0-49bd-b42d-ab8cb5589a15",
"metadata": {
"tags": []
@@ -1543,32 +667,10 @@
"test_df.to_csv(\"data/fl_2022_LastName_test.csv.gz\",index=False,compression=\"gzip\")"
]
},
- {
- "cell_type": "code",
- "execution_count": 66,
- "id": "aa9be3b5-ee0d-4935-9b21-14012c676235",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1.3M\tdata/fl_2022_LastName_test.csv.gz\n",
- "11M\tdata/fl_2022_LastName_train.csv.gz\n",
- "1.3M\tdata/fl_2022_LastName_val.csv.gz\n"
- ]
- }
- ],
- "source": [
- "!du -sh data/fl_2022_LastName_*"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
- "id": "2fb93bc9-448d-45e0-a976-9312bf94e708",
+ "id": "6b389e66-ee18-4ee6-b3d1-87c73859189c",
"metadata": {},
"outputs": [],
"source": []
@@ -1590,7 +692,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.8.10"
}
},
"nbformat": 4,