diff --git a/.gitignore b/.gitignore index 629a99d..54172d2 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,6 @@ models/rf_vec_fullname_2M.joblib models/rf_vec_lastname.joblib models/rf_vec_lastname_1M.joblib models/rf_vec_lastname_2M.joblib +ms/icwsm/name_race.aux +ms/icwsm/name_race.bbl +ms/icwsm/name_race.blg diff --git a/notebooks/0.1_data_preprocessing_FullName.ipynb b/notebooks/0.1_data_preprocessing_FullName.ipynb index 55afab9..1b0b6bd 100644 --- a/notebooks/0.1_data_preprocessing_FullName.ipynb +++ b/notebooks/0.1_data_preprocessing_FullName.ipynb @@ -1,258 +1,116 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, - "id": "71a4990d-d590-4f49-9d25-97982e6d58c0", + "cell_type": "markdown", + "id": "ce92ae6b-1cfb-47f9-b947-55f2448b7500", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from sklearn.model_selection import train_test_split" - ] - }, - { - "cell_type": "markdown", - "id": "78ee21e5-efa5-4c7f-b2cc-32f8df219d08", - "metadata": {}, "source": [ - "# Preprocessing data" + "### Full Name Dataset (Train/Validation/Test)" ] }, { "cell_type": "code", - "execution_count": 45, - "id": "af05aa15-8218-4086-9626-adadd2552183", - "metadata": { - "tags": [] - }, + "execution_count": 1, + "id": "a34fa1c0-cf2e-464a-bc56-73a4f7a38a55", + "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv('./data/fl_reg_name_race_2022.csv.gz')" + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", - "execution_count": 46, - "id": "05503af7-99a2-4cf2-bdd4-b25d9015aa92", + "execution_count": 2, + "id": "af05aa15-8218-4086-9626-adadd2552183", "metadata": { "tags": [] }, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstrace
0Hessler-SmithJasonnh_white
1RogersReneenh_white
2BartolomeCrystalnh_white
3BaileyDonnanh_white
4CarlsonGreggorynh_white
\n", - "
" - ], "text/plain": [ - " name_last name_first race\n", - "0 Hessler-Smith Jason nh_white\n", - "1 Rogers Renee nh_white\n", - "2 Bartolome Crystal nh_white\n", - "3 Bailey Donna nh_white\n", - "4 Carlson Greggory nh_white" + "(15455110, 3)" ] }, - "execution_count": 46, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.head()" + "df = pd.read_csv('data/fl_reg_name_race_2022.csv.gz')\n", + "df.shape" ] }, { "cell_type": "code", - "execution_count": 47, - "id": "b6b123c8-70ba-4ab4-841c-b5486a1ba69a", + "execution_count": 3, + "id": "05503af7-99a2-4cf2-bdd4-b25d9015aa92", "metadata": { "tags": [] }, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstrace
count154549921545502215455110
unique13411956411038
topSmithMichaelnh_white
freq793621537539446851
\n", - "
" - ], - "text/plain": [ - " name_last name_first race\n", - "count 15454992 15455022 15455110\n", - "unique 1341195 641103 8\n", - "top Smith Michael nh_white\n", - "freq 79362 153753 9446851" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Size after dropping missing first and last names: (15454979, 3)\n", + "Size after dropping unknown: (15009244, 3)\n", + "Size after dropping last names less than 2 chars: (14933334, 3)\n" + ] } ], "source": [ - "df.describe()" + "# Remove NA first/last\n", + "df.dropna(subset=['name_first', 'name_last'], inplace=True)\n", + "print(\"Size after dropping missing first and last names:\", df.shape)\n", + "\n", + "# We assume unknown as missing at random\n", + "sdf = df[df.race.isin(['unknown']) == False]\n", + "print(\"Size after dropping unknown:\", sdf.shape)\n", + "del df\n", + "\n", + "# Drop cases where last name is less than 2 chars\n", + "sdf = sdf.drop(sdf[sdf['name_last'].str.len() < 2].index)\n", + "print(\"Size after dropping last names less than 2 chars:\", sdf.shape)" ] }, { "cell_type": "code", - "execution_count": 48, - "id": "33b5941c-619c-485f-8767-27cdfa71ab27", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['nh_white', 'nh_black', 'other', 'hispanic', 'asian',\n", - " 'native_indian', 'unknown', 'multi_racial'], dtype=object)" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['race'].unique()" - ] - }, - { - "cell_type": "markdown", - "id": "65f441b5-3c7d-48f2-b6ba-352e897a72ae", + "execution_count": 4, + "id": "98a49e48-1dfc-4d1b-ad98-874ce9559d0d", "metadata": {}, + "outputs": [], "source": [ - "## Drop None Values" + "# Full Name\n", + "sdf['name_first'] = sdf.name_first.str.strip().str.title()\n", + "sdf['name_last'] = sdf.name_last.str.strip().str.title()\n", + "sdf['full_name'] = sdf['name_last'] + ' ' + sdf['name_first']\n", + "# Remove special chars\n", + "sdf['full_name'] = sdf['full_name'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)" ] }, { "cell_type": "code", - "execution_count": 49, - "id": "0522d1ca-b9bc-4028-b3f9-58ea8d143357", - "metadata": { - "tags": [] - }, + "execution_count": 5, + "id": "9bfb3b34-bdea-4c60-bff9-ffb0a63aa265", + "metadata": {}, "outputs": [], "source": [ - "df.dropna(subset=['name_first', 'name_last'], inplace=True)" + "# recode race\n", + "mapping = {'multi_racial': 'other', 'native_indian': 'other'}\n", + "sdf['race'] = sdf['race'].replace(mapping)" ] }, { "cell_type": "code", - "execution_count": 50, - "id": "49bd34b0-6543-48bd-b0cd-99cb15bf2569", - "metadata": { - "tags": [] - }, + "execution_count": 6, + "id": "041729a5-8518-405f-9cd5-5e6e1869cc2f", + "metadata": {}, "outputs": [ { "data": { @@ -274,796 +132,223 @@ "\n", " \n", " \n", - " \n", - " \n", - " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstracefull_nameasianhispanicnh_blacknh_whiteothertotal_n
count154549081545490815454908
unique13411766410958
topSmithMichaelnh_white0A Arup Erik0.00.00.01.00.01.0
freq793621537539446749
\n", - "" - ], - "text/plain": [ - " name_last name_first race\n", - "count 15454908 15454908 15454908\n", - "unique 1341176 641095 8\n", - "top Smith Michael nh_white\n", - "freq 79362 153753 9446749" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "markdown", - "id": "5a1c7014-99e8-4b29-b7e5-98ba16caa3b3", - "metadata": { - "tags": [] - }, - "source": [ - "## Drop Last name and first name of length 1" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "42f1fd6c-ef4f-4538-bb6c-286db30c250a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = df.drop(df[df['name_last'].str.len() < 2].index)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "1fc0d6fd-5ffc-4190-9a11-314f9c34535a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = df.drop(df[df['name_first'].str.len() < 2].index)" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "02479d63-5cf8-43f3-a208-9119df5b5457", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstrace1A Bitang Ahmad0.00.01.00.00.01.0
count1536669015366690153666902A De Feria Graciela0.01.00.00.00.01.0
unique134061764105583A F R Stephenson John Alexander0.00.00.01.00.01.0
topSmithMichaelnh_white4A Felix Noehmi0.01.00.00.00.01.0
freq792971537529383680
\n", - "
" - ], - "text/plain": [ - " name_last name_first race\n", - "count 15366690 15366690 15366690\n", - "unique 1340617 641055 8\n", - "top Smith Michael nh_white\n", - "freq 79297 153752 9383680" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "markdown", - "id": "058f71b5-0798-4ceb-89d7-e241000b7e1f", - "metadata": {}, - "source": [ - "## Make all names title case" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "313b98b0-89f3-4ed8-9dd2-dad9bee428b1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df['name_first'] = df['name_first'].str.title()\n", - "df['name_last'] = df['name_last'].str.title()" - ] - }, - { - "cell_type": "markdown", - "id": "6a7d0983-9883-4777-a29e-6b9c913f9271", - "metadata": {}, - "source": [ - "## Remove Special Characters" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "8c415ea0-1763-4fc2-b025-ba3cb5f7b786", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df['full_name'] = df['name_last'] + ' ' + df['name_first']" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "12dea730-e4dc-4bbd-91f6-286abf4c2fee", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df['full_name'] = df['full_name'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "7d693324-de62-4ea3-8edd-ed3892f52a2f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstracefull_name
0Hessler-SmithJasonnh_whiteHessler-Smith Jason
1RogersReneenh_whiteRogers Renee
2BartolomeCrystalnh_whiteBartolome Crystal
3BaileyDonnanh_whiteBailey Donna
4CarlsonGreggorynh_whiteCarlson Greggory
\n", - "
" - ], - "text/plain": [ - " name_last name_first race full_name\n", - "0 Hessler-Smith Jason nh_white Hessler-Smith Jason\n", - "1 Rogers Renee nh_white Rogers Renee\n", - "2 Bartolome Crystal nh_white Bartolome Crystal\n", - "3 Bailey Donna nh_white Bailey Donna\n", - "4 Carlson Greggory nh_white Carlson Greggory" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "7ce1807d-f228-482d-a2d8-43211e2806c1", - "metadata": {}, - "source": [ - "## Drop duplicates" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "98363907-538a-4255-9cd6-35846c58d044", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstracefull_name
837MoserPatricianh_whiteMoser Patricia
928JohnsonTiffanynh_blackJohnson Tiffany
1247PerryCharlesnh_whitePerry Charles
2120JohnsonAshleynh_blackJohnson Ashley
2285JohnsonClaytonnh_whiteJohnson Clayton
...............
15455104BallentineRobertnh_whiteBallentine Robert
15455106WattsMarknh_whiteWatts Mark
15455107McraeEvelynnh_whiteMcrae Evelyn
15455108WardStephanienh_whiteWard Stephanie
15455109EdenfieldMarcusnh_whiteEdenfield Marcus
\n", - "

5364911 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " name_last name_first race full_name\n", - "837 Moser Patricia nh_white Moser Patricia\n", - "928 Johnson Tiffany nh_black Johnson Tiffany\n", - "1247 Perry Charles nh_white Perry Charles\n", - "2120 Johnson Ashley nh_black Johnson Ashley\n", - "2285 Johnson Clayton nh_white Johnson Clayton\n", - "... ... ... ... ...\n", - "15455104 Ballentine Robert nh_white Ballentine Robert\n", - "15455106 Watts Mark nh_white Watts Mark\n", - "15455107 Mcrae Evelyn nh_white Mcrae Evelyn\n", - "15455108 Ward Stephanie nh_white Ward Stephanie\n", - "15455109 Edenfield Marcus nh_white Edenfield Marcus\n", - "\n", - "[5364911 rows x 4 columns]" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[['full_name','race']].duplicated()]" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "5dcbb0d6-ede7-4b20-af15-4466f04d3fcd", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstracefull_name
136PorterPaulanh_whitePorter Paula
550PorterPaulanh_blackPorter Paula
263636PorterPaulanh_whitePorter Paula
1527456PorterPaulanh_whitePorter Paula
7563599PorterPaulanh_whitePorter Paula
7631191PorterPaulanh_whitePorter Paula
8383292PorterPaulanh_whitePorter Paula
8945658PorterPaulanh_whitePorter Paula
9402546PorterPaulanh_whitePorter Paula
10682106PorterPaulanh_whitePorter Paula
12427420PorterPaulanh_whitePorter Paula
12731429PorterPaulanh_whitePorter Paula
14637476PorterPaulanh_whitePorter Paula
\n", - "
" - ], - "text/plain": [ - " name_last name_first race full_name\n", - "136 Porter Paula nh_white Porter Paula\n", - "550 Porter Paula nh_black Porter Paula\n", - "263636 Porter Paula nh_white Porter Paula\n", - "1527456 Porter Paula nh_white Porter Paula\n", - "7563599 Porter Paula nh_white Porter Paula\n", - "7631191 Porter Paula nh_white Porter Paula\n", - "8383292 Porter Paula nh_white Porter Paula\n", - "8945658 Porter Paula nh_white Porter Paula\n", - "9402546 Porter Paula nh_white Porter Paula\n", - "10682106 Porter Paula nh_white Porter Paula\n", - "12427420 Porter Paula nh_white Porter Paula\n", - "12731429 Porter Paula nh_white Porter Paula\n", - "14637476 Porter Paula nh_white Porter Paula" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['full_name'] == \"Porter Paula\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "68f7146a-45e7-40c6-8f2a-9e50bd914743", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = df.drop_duplicates(['full_name','race'],keep= 'last')" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "ed066729-7970-47ea-ad65-9842bc71366a", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstracefull_name
550PorterPaulanh_blackPorter Paula
14637476PorterPaulanh_whitePorter Paula
\n", - "
" - ], - "text/plain": [ - " name_last name_first race full_name\n", - "550 Porter Paula nh_black Porter Paula\n", - "14637476 Porter Paula nh_white Porter Paula" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['full_name'] == \"Porter Paula\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "0d2504cf-d9f9-4eff-a53f-fa859f686ed1", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 9018613\n", + " Zyzdryn Krzysztof\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 1.0\n", + " 0.0\n", + " 1.0\n", + " \n", + " \n", + " 9018614\n", + " Zyznomyrsky John\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 1.0\n", + " 0.0\n", + " 1.0\n", + " \n", + " \n", + " 9018615\n", + " Zzaman Md\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 1.0\n", + " 1.0\n", + " \n", + " \n", + " 9018616\n", + " Zzaman Mohammad\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 1.0\n", + " 1.0\n", + " \n", + " \n", + " 9018617\n", + " Zzie Richard\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 1.0\n", + " 0.0\n", + " 1.0\n", + " \n", + " \n", + "\n", + "

9018618 rows × 7 columns

\n", + "" + ], "text/plain": [ - "(10001779, 4)" + "race full_name asian hispanic nh_black nh_white \\\n", + "0 A Arup Erik 0.0 0.0 0.0 1.0 \n", + "1 A Bitang Ahmad 0.0 0.0 1.0 0.0 \n", + "2 A De Feria Graciela 0.0 1.0 0.0 0.0 \n", + "3 A F R Stephenson John Alexander 0.0 0.0 0.0 1.0 \n", + "4 A Felix Noehmi 0.0 1.0 0.0 0.0 \n", + "... ... ... ... ... ... \n", + "9018613 Zyzdryn Krzysztof 0.0 0.0 0.0 1.0 \n", + "9018614 Zyznomyrsky John 0.0 0.0 0.0 1.0 \n", + "9018615 Zzaman Md 0.0 0.0 0.0 0.0 \n", + "9018616 Zzaman Mohammad 0.0 0.0 0.0 0.0 \n", + "9018617 Zzie Richard 0.0 0.0 0.0 1.0 \n", + "\n", + "race other total_n \n", + "0 0.0 1.0 \n", + "1 0.0 1.0 \n", + "2 0.0 1.0 \n", + "3 0.0 1.0 \n", + "4 0.0 1.0 \n", + "... ... ... \n", + "9018613 0.0 1.0 \n", + "9018614 0.0 1.0 \n", + "9018615 1.0 1.0 \n", + "9018616 1.0 1.0 \n", + "9018617 0.0 1.0 \n", + "\n", + "[9018618 rows x 7 columns]" ] }, - "execution_count": 63, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.shape" + "# Summing the count of each name & race combination\n", + "gdf = sdf.groupby(['full_name','race'], as_index=False)['race'].agg(['count'])\n", + "# creating a pivot table so that each name has a count of the # of races with that last name\n", + "gdf = gdf.pivot_table(values='count', columns='race', index='full_name')\n", + "\n", + "# Converting NaN to zeros since that means there is no one that identifies with that race with that last name\n", + "gdf = gdf.fillna(0)\n", + "\n", + "gdf['total_n'] = gdf.sum(axis=1)\n", + "gdf.reset_index(inplace=True)\n", + "gdf.iloc[:, 1:-1] = gdf.iloc[:, 1:-1].div(gdf.total_n, axis=0)\n", + "\n", + "gdf" ] }, { "cell_type": "code", - "execution_count": 64, - "id": "6351f866-44f2-4c5d-b9ec-ccbfb7278d9c", - "metadata": { - "tags": [] - }, + "execution_count": 7, + "id": "3e953fb6-2fe8-4d34-be56-10cf6698dc35", + "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "10001779" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "['asian', 'hispanic', 'nh_black', 'nh_white', 'other']\n" + ] } ], "source": [ - "len(df)" - ] - }, - { - "cell_type": "markdown", - "id": "9830ca67-c7a7-4587-bbc1-db7e0f552ae5", - "metadata": {}, - "source": [ - "## Drop and merge columns" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "8c0468aa-dae8-4671-bbe5-88196c1b0fb1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# dropping unknown column\n", - "df = df.drop(df[df['race'] == 'unknown'].index)" + "races = sorted(sdf.race.unique().tolist())\n", + "print(races)\n", + "\n", + "def get_race_idx(val, races):\n", + " race_idx = races.index(val)\n", + " return race_idx" ] }, { "cell_type": "code", - "execution_count": 66, - "id": "9d21c416-ea30-48f4-8794-721dd65f6259", - "metadata": { - "tags": [] - }, + "execution_count": 9, + "id": "db8c31ec-17d4-464a-8693-754a389c9bf5", + "metadata": {}, "outputs": [], "source": [ - "# combine multi_racial and native_indian to other\n", - "mapping = {'multi_racial': 'other', 'native_indian': 'other'}\n", - "df['race'] = df['race'].replace(mapping)" + "# for one set of analysis, we define 'true race/ethincity' = where max prob (so modal race = true race)\n", + "gdf['race'] = gdf[races].idxmax(axis=1)\n", + "gdf['race_code'] = gdf['race'].apply(lambda c: get_race_idx(c,races))" ] }, { "cell_type": "code", - "execution_count": 67, - "id": "4aeb7f29-01df-4702-bbbe-156924242bb0", - "metadata": { - "tags": [] - }, + "execution_count": 10, + "id": "734fc268-8d20-4120-8693-7da140e4c8ec", + "metadata": {}, "outputs": [ { "data": { @@ -1085,68 +370,116 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
racefull_name
asianhispanicnh_blacknh_whiteothertotal_nracerace_code
asian2782900A Arup Erik0.00.00.01.00.01.0nh_white3
hispanic16905731A Bitang Ahmad0.00.01.00.00.01.0nh_black2
nh_black14929892A De Feria Graciela0.01.00.00.00.01.0hispanic1
nh_white57347013A F R Stephenson John Alexander0.00.00.01.00.01.0nh_white3
other3906484A Felix Noehmi0.01.00.00.00.01.0hispanic1
\n", "" ], "text/plain": [ - " full_name\n", - "race \n", - "asian 278290\n", - "hispanic 1690573\n", - "nh_black 1492989\n", - "nh_white 5734701\n", - "other 390648" + "race full_name asian hispanic nh_black nh_white \\\n", + "0 A Arup Erik 0.0 0.0 0.0 1.0 \n", + "1 A Bitang Ahmad 0.0 0.0 1.0 0.0 \n", + "2 A De Feria Graciela 0.0 1.0 0.0 0.0 \n", + "3 A F R Stephenson John Alexander 0.0 0.0 0.0 1.0 \n", + "4 A Felix Noehmi 0.0 1.0 0.0 0.0 \n", + "\n", + "race other total_n race race_code \n", + "0 0.0 1.0 nh_white 3 \n", + "1 0.0 1.0 nh_black 2 \n", + "2 0.0 1.0 hispanic 1 \n", + "3 0.0 1.0 nh_white 3 \n", + "4 0.0 1.0 hispanic 1 " ] }, - "execution_count": 67, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.groupby('race').agg({'full_name':'nunique'})" + "gdf.head()" ] }, { "cell_type": "code", - "execution_count": 68, - "id": "2e4d2735-03e0-4c3c-b049-216c662a84cb", - "metadata": { - "tags": [] - }, + "execution_count": 11, + "id": "036bbb11-0d02-45db-801c-03a2873291c5", + "metadata": {}, "outputs": [], "source": [ - "df['race_code'] = df.race.factorize()[0]" + "gdf.to_csv(\"train_validation_test/gdf_fullname.csv.gz\", index = False, compression=\"gzip\")" ] }, { @@ -1159,29 +492,18 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 13, "id": "3a11b216-fde5-48c7-b5ec-904dbde4bb29", "metadata": {}, "outputs": [], "source": [ - "train_df, rest_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['race_code'])" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "f7d44e05-63d5-47cc-85db-c59fc3e169f1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "train_df, rest_df = train_test_split(gdf, test_size=0.2, random_state=42, stratify=gdf['race_code'])\n", "val_df, test_df = train_test_split(rest_df, test_size=0.5, random_state=42, stratify=rest_df['race_code'])" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 14, "id": "e47fa000-d58f-4360-9487-bd33c149433e", "metadata": { "tags": [] @@ -1195,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 15, "id": "fce465b6-0a17-437d-ad38-7d8036c74d0e", "metadata": { "tags": [] @@ -1205,9 +527,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "(7678780, 5)\n", - "(959847, 5)\n", - "(959848, 5)\n" + "(7214894, 9)\n", + "(901862, 9)\n", + "(901862, 9)\n" ] } ], @@ -1219,87 +541,7 @@ }, { "cell_type": "code", - "execution_count": 73, - "id": "c30f722b-995b-4269-bac7-38bc4ba64999", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
full_name
race
asian278290
hispanic1690573
nh_black1492989
nh_white5734701
other390648
\n", - "
" - ], - "text/plain": [ - " full_name\n", - "race \n", - "asian 278290\n", - "hispanic 1690573\n", - "nh_black 1492989\n", - "nh_white 5734701\n", - "other 390648" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby('race').agg({'full_name':'nunique'})" - ] - }, - { - "cell_type": "code", - "execution_count": 74, + "execution_count": 16, "id": "66f97f4e-4fda-44bc-8d3c-65fbe6eea9b6", "metadata": { "tags": [] @@ -1325,7 +567,7 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1336,39 +578,39 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
racefull_name
asian222632206042
hispanic13524581308198
nh_black11943911067770
nh_white45877614421898
other314216210986
\n", "" ], "text/plain": [ - " full_name\n", + "race full_name\n", "race \n", - "asian 222632\n", - "hispanic 1352458\n", - "nh_black 1194391\n", - "nh_white 4587761\n", - "other 314216" + "asian 206042\n", + "hispanic 1308198\n", + "nh_black 1067770\n", + "nh_white 4421898\n", + "other 210986" ] }, - "execution_count": 74, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1379,7 +621,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 17, "id": "7d9a625b-fd10-4374-b744-e809620c86d5", "metadata": { "tags": [] @@ -1405,7 +647,7 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1416,39 +658,39 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
racefull_name
asian2782925755
hispanic169057163525
nh_black149299133471
nh_white573470552738
other4006126373
\n", "" ], "text/plain": [ - " full_name\n", + "race full_name\n", "race \n", - "asian 27829\n", - "hispanic 169057\n", - "nh_black 149299\n", - "nh_white 573470\n", - "other 40061" + "asian 25755\n", + "hispanic 163525\n", + "nh_black 133471\n", + "nh_white 552738\n", + "other 26373" ] }, - "execution_count": 75, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1459,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 18, "id": "e61fc0f4-1b6c-42a0-a1d5-cf7cbff8a290", "metadata": { "tags": [] @@ -1485,7 +727,7 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1496,39 +738,39 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
racefull_name
asian2782925756
hispanic169058163525
nh_black149299133471
nh_white573470552737
other4006826373
\n", "" ], "text/plain": [ - " full_name\n", + "race full_name\n", "race \n", - "asian 27829\n", - "hispanic 169058\n", - "nh_black 149299\n", - "nh_white 573470\n", - "other 40068" + "asian 25756\n", + "hispanic 163525\n", + "nh_black 133471\n", + "nh_white 552737\n", + "other 26373" ] }, - "execution_count": 76, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1547,143 +789,22 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 19, "id": "ecc12a2c-bee0-49bd-b42d-ab8cb5589a15", "metadata": { "tags": [] }, "outputs": [], "source": [ - "train_df.to_csv(\"data/fl_2022_FullName_train.csv.gz\",index=False,compression=\"gzip\")\n", - "val_df.to_csv(\"data/fl_2022_FullName_val.csv.gz\",index=False,compression=\"gzip\")\n", - "test_df.to_csv(\"data/fl_2022_FullName_test.csv.gz\",index=False,compression=\"gzip\")" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "id": "aa9be3b5-ee0d-4935-9b21-14012c676235", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "13M\tdata/fl_2022_FullName_test.csv.gz\n", - "101M\tdata/fl_2022_FullName_train.csv.gz\n", - "13M\tdata/fl_2022_FullName_val.csv.gz\n" - ] - } - ], - "source": [ - "!du -sh data/fl_2022_FullName_*" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "id": "2fb93bc9-448d-45e0-a976-9312bf94e708", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstracefull_namerace_code
0BaxlaPhyllisnh_whiteBaxla Phyllis0
1LudwinRonnh_whiteLudwin Ron0
2Signer WeltonJessicanh_whiteSigner Welton Jessica0
3StampsJoshuanh_whiteStamps Joshua0
4VassellLillienh_blackVassell Lillie1
\n", - "
" - ], - "text/plain": [ - " name_last name_first race full_name race_code\n", - "0 Baxla Phyllis nh_white Baxla Phyllis 0\n", - "1 Ludwin Ron nh_white Ludwin Ron 0\n", - "2 Signer Welton Jessica nh_white Signer Welton Jessica 0\n", - "3 Stamps Joshua nh_white Stamps Joshua 0\n", - "4 Vassell Lillie nh_black Vassell Lillie 1" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_df.head()" + "train_df.to_csv(\"data/fl_2022_FullName_train.csv.gz\", index=False, compression=\"gzip\")\n", + "val_df.to_csv(\"data/fl_2022_FullName_val.csv.gz\", index=False, compression=\"gzip\")\n", + "test_df.to_csv(\"data/fl_2022_FullName_test.csv.gz\", index=False, compression=\"gzip\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "b0a55dbb-c317-4ab8-b4a8-7ff0eb9d645a", + "id": "41d34d62-afb2-479e-a21b-040f1eb962ff", "metadata": {}, "outputs": [], "source": [] @@ -1705,7 +826,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/notebooks/0.2_data_preprocessing_LastName.ipynb b/notebooks/0.2_data_preprocessing_LastName.ipynb index 320c956..e1ecb38 100644 --- a/notebooks/0.2_data_preprocessing_LastName.ipynb +++ b/notebooks/0.2_data_preprocessing_LastName.ipynb @@ -1,969 +1,32 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 34, - "id": "71a4990d-d590-4f49-9d25-97982e6d58c0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from sklearn.model_selection import train_test_split" - ] - }, - { - "cell_type": "markdown", - "id": "78ee21e5-efa5-4c7f-b2cc-32f8df219d08", - "metadata": {}, - "source": [ - "# Preprocessing data" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "af05aa15-8218-4086-9626-adadd2552183", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = pd.read_csv('./data/fl_reg_name_race_2022.csv.gz')" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "05503af7-99a2-4cf2-bdd4-b25d9015aa92", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstrace
0Hessler-SmithJasonnh_white
1RogersReneenh_white
2BartolomeCrystalnh_white
3BaileyDonnanh_white
4CarlsonGreggorynh_white
\n", - "
" - ], - "text/plain": [ - " name_last name_first race\n", - "0 Hessler-Smith Jason nh_white\n", - "1 Rogers Renee nh_white\n", - "2 Bartolome Crystal nh_white\n", - "3 Bailey Donna nh_white\n", - "4 Carlson Greggory nh_white" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "b6b123c8-70ba-4ab4-841c-b5486a1ba69a", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstrace
count154549921545502215455110
unique13411956411038
topSmithMichaelnh_white
freq793621537539446851
\n", - "
" - ], - "text/plain": [ - " name_last name_first race\n", - "count 15454992 15455022 15455110\n", - "unique 1341195 641103 8\n", - "top Smith Michael nh_white\n", - "freq 79362 153753 9446851" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "33b5941c-619c-485f-8767-27cdfa71ab27", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['nh_white', 'nh_black', 'other', 'hispanic', 'asian',\n", - " 'native_indian', 'unknown', 'multi_racial'], dtype=object)" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['race'].unique()" - ] - }, - { - "cell_type": "markdown", - "id": "65f441b5-3c7d-48f2-b6ba-352e897a72ae", - "metadata": {}, - "source": [ - "## Drop None Values" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "0522d1ca-b9bc-4028-b3f9-58ea8d143357", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df.dropna(subset=['name_first', 'name_last'], inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "49bd34b0-6543-48bd-b0cd-99cb15bf2569", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstrace
count154549081545490815454908
unique13411766410958
topSmithMichaelnh_white
freq793621537539446749
\n", - "
" - ], - "text/plain": [ - " name_last name_first race\n", - "count 15454908 15454908 15454908\n", - "unique 1341176 641095 8\n", - "top Smith Michael nh_white\n", - "freq 79362 153753 9446749" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "markdown", - "id": "5a1c7014-99e8-4b29-b7e5-98ba16caa3b3", - "metadata": { - "tags": [] - }, - "source": [ - "## Drop Last name and first name of length 1" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "42f1fd6c-ef4f-4538-bb6c-286db30c250a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = df.drop(df[df['name_last'].str.len() < 2].index)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "1fc0d6fd-5ffc-4190-9a11-314f9c34535a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = df.drop(df[df['name_first'].str.len() < 2].index)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "02479d63-5cf8-43f3-a208-9119df5b5457", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstrace
count153666901536669015366690
unique13406176410558
topSmithMichaelnh_white
freq792971537529383680
\n", - "
" - ], - "text/plain": [ - " name_last name_first race\n", - "count 15366690 15366690 15366690\n", - "unique 1340617 641055 8\n", - "top Smith Michael nh_white\n", - "freq 79297 153752 9383680" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "markdown", - "id": "22f6aa41-af9c-44a7-a9ce-b5faf2c36caf", - "metadata": {}, - "source": [ - "## Make all names title case" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "96a27d5c-b423-43d8-bb10-fc1e4d4404cb", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df['name_first'] = df['name_first'].str.title()\n", - "df['name_last'] = df['name_last'].str.title()" - ] - }, - { - "cell_type": "markdown", - "id": "6a7d0983-9883-4777-a29e-6b9c913f9271", - "metadata": {}, - "source": [ - "## Remove Special Characters" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "12dea730-e4dc-4bbd-91f6-286abf4c2fee", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df['name_last'] = df['name_last'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "7d693324-de62-4ea3-8edd-ed3892f52a2f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstrace
0Hessler-SmithJasonnh_white
1RogersReneenh_white
2BartolomeCrystalnh_white
3BaileyDonnanh_white
4CarlsonGreggorynh_white
\n", - "
" - ], - "text/plain": [ - " name_last name_first race\n", - "0 Hessler-Smith Jason nh_white\n", - "1 Rogers Renee nh_white\n", - "2 Bartolome Crystal nh_white\n", - "3 Bailey Donna nh_white\n", - "4 Carlson Greggory nh_white" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "7ce1807d-f228-482d-a2d8-43211e2806c1", - "metadata": {}, - "source": [ - "## Drop duplicates" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "98363907-538a-4255-9cd6-35846c58d044", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstrace
52GruberLindanh_white
122TaylorRobertnh_white
127BaileyPamelanh_white
138JohnsonAshleynh_black
146MobleyRobertnh_black
............
15455105BallewChristinanh_white
15455106WattsMarknh_white
15455107McraeEvelynnh_white
15455108WardStephanienh_white
15455109EdenfieldMarcusnh_white
\n", - "

13894849 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " name_last name_first race\n", - "52 Gruber Linda nh_white\n", - "122 Taylor Robert nh_white\n", - "127 Bailey Pamela nh_white\n", - "138 Johnson Ashley nh_black\n", - "146 Mobley Robert nh_black\n", - "... ... ... ...\n", - "15455105 Ballew Christina nh_white\n", - "15455106 Watts Mark nh_white\n", - "15455107 Mcrae Evelyn nh_white\n", - "15455108 Ward Stephanie nh_white\n", - "15455109 Edenfield Marcus nh_white\n", - "\n", - "[13894849 rows x 3 columns]" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[['name_last','race']].duplicated()]" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "5dcbb0d6-ede7-4b20-af15-4466f04d3fcd", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstrace
136PorterPaulanh_white
550PorterPaulanh_black
7329PorterWendellnh_white
7557PorterAnthonynh_white
9200PorterKevinnh_white
............
15448598PorterWilliamnh_white
15448772PorterKylenh_white
15451135PorterJeannh_white
15451767PorterAnnettenh_white
15454870PorterAshleynh_white
\n", - "

7451 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " name_last name_first race\n", - "136 Porter Paula nh_white\n", - "550 Porter Paula nh_black\n", - "7329 Porter Wendell nh_white\n", - "7557 Porter Anthony nh_white\n", - "9200 Porter Kevin nh_white\n", - "... ... ... ...\n", - "15448598 Porter William nh_white\n", - "15448772 Porter Kyle nh_white\n", - "15451135 Porter Jean nh_white\n", - "15451767 Porter Annette nh_white\n", - "15454870 Porter Ashley nh_white\n", - "\n", - "[7451 rows x 3 columns]" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['name_last'] == \"Porter\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "68f7146a-45e7-40c6-8f2a-9e50bd914743", + "cell_type": "markdown", + "id": "6cc055eb-51ad-43c5-aacf-798983f0adfa", "metadata": { "tags": [] }, - "outputs": [], "source": [ - "df = df.drop_duplicates(['name_last','race'],keep= 'last')" + "### Last Name Preprocessing (Train/Validation/Test)" ] }, { "cell_type": "code", - "execution_count": 50, - "id": "ed066729-7970-47ea-ad65-9842bc71366a", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_lastname_firstrace
14952661PorterMarisydasian
15029071PorterAmbermulti_racial
15222442PorterAnnaother
15337979PorterDennisunknown
15369699PorterLilanative_indian
15378779PorterCristopherhispanic
15438806PorterOrricknh_black
15454870PorterAshleynh_white
\n", - "
" - ], - "text/plain": [ - " name_last name_first race\n", - "14952661 Porter Marisyd asian\n", - "15029071 Porter Amber multi_racial\n", - "15222442 Porter Anna other\n", - "15337979 Porter Dennis unknown\n", - "15369699 Porter Lila native_indian\n", - "15378779 Porter Cristopher hispanic\n", - "15438806 Porter Orrick nh_black\n", - "15454870 Porter Ashley nh_white" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": 1, + "id": "722b0940-de98-48ad-96b1-89abf710e3f1", + "metadata": {}, + "outputs": [], "source": [ - "df[df['name_last'] == \"Porter\"]" + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", - "execution_count": 51, - "id": "0d2504cf-d9f9-4eff-a53f-fa859f686ed1", + "execution_count": 2, + "id": "af05aa15-8218-4086-9626-adadd2552183", "metadata": { "tags": [] }, @@ -971,83 +34,80 @@ { "data": { "text/plain": [ - "(1471841, 3)" + "(15455110, 3)" ] }, - "execution_count": 51, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "df = pd.read_csv('data/fl_reg_name_race_2022.csv.gz')\n", "df.shape" ] }, { "cell_type": "code", - "execution_count": 52, - "id": "6351f866-44f2-4c5d-b9ec-ccbfb7278d9c", - "metadata": { - "tags": [] - }, + "execution_count": 3, + "id": "5c244d79-6807-4156-b318-10555d85de1e", + "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "1471841" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Size after dropping missing first and last names: (15454979, 3)\n", + "Size after dropping unknown: (15009244, 3)\n", + "Size after dropping last names less than 2 chars: (14933334, 3)\n" + ] } ], "source": [ - "len(df)" - ] - }, - { - "cell_type": "markdown", - "id": "9830ca67-c7a7-4587-bbc1-db7e0f552ae5", - "metadata": {}, - "source": [ - "## Drop and merge columns" + "# Remove NA first/last\n", + "df.dropna(subset=['name_first', 'name_last'], inplace=True)\n", + "print(\"Size after dropping missing first and last names:\", df.shape)\n", + "\n", + "# We assume unknown as missing at random\n", + "sdf = df[df.race.isin(['unknown']) == False]\n", + "print(\"Size after dropping unknown:\", sdf.shape)\n", + "del df\n", + "\n", + "# Drop cases where last name is less than 2 chars\n", + "sdf = sdf.drop(sdf[sdf['name_last'].str.len() < 2].index)\n", + "print(\"Size after dropping last names less than 2 chars:\", sdf.shape)" ] }, { "cell_type": "code", - "execution_count": 53, - "id": "8c0468aa-dae8-4671-bbe5-88196c1b0fb1", + "execution_count": 4, + "id": "96a27d5c-b423-43d8-bb10-fc1e4d4404cb", "metadata": { "tags": [] }, "outputs": [], "source": [ - "# dropping unknown column\n", - "df = df.drop(df[df['race'] == 'unknown'].index)" + "sdf['name_last'] = sdf['name_last'].str.title()\n", + "sdf['name_last'] = sdf['name_last'].str.replace(\"[^a-zA-Z' -]\", '', regex=True)" ] }, { "cell_type": "code", - "execution_count": 54, - "id": "9d21c416-ea30-48f4-8794-721dd65f6259", - "metadata": { - "tags": [] - }, + "execution_count": 5, + "id": "76cc90e1-9961-4ea4-bba0-003eb1c9965d", + "metadata": {}, "outputs": [], "source": [ - "# combine multi_racial and native_indian to other\n", + "# recode race\n", "mapping = {'multi_racial': 'other', 'native_indian': 'other'}\n", - "df['race'] = df['race'].replace(mapping)" + "sdf['race'] = sdf['race'].replace(mapping)" ] }, { "cell_type": "code", - "execution_count": 55, - "id": "4aeb7f29-01df-4702-bbbe-156924242bb0", - "metadata": { - "tags": [] - }, + "execution_count": 6, + "id": "bcf7903a-d84e-456f-a060-82ceec27b2a7", + "metadata": {}, "outputs": [ { "data": { @@ -1069,68 +129,212 @@ "\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
name_last
racename_lastasianhispanicnh_blacknh_whiteothertotal_n
asian686720A Arup0.00.00.01.00.01.0
hispanic3896091A Bitang0.00.01.00.00.01.0
nh_black1372712A De Feria0.01.00.00.00.01.0
nh_white6097073A F R Stephenson0.00.00.01.00.01.0
other1152184A Felix0.01.00.00.00.01.0
........................
1056640Zyzanski0.00.00.01.00.01.0
1056641Zyzdryn0.00.00.01.00.02.0
1056642Zyznomyrsky0.00.00.01.00.01.0
1056643Zzaman0.00.00.00.01.02.0
1056644Zzie0.00.00.01.00.01.0
\n", + "

1056645 rows × 7 columns

\n", "" ], "text/plain": [ - " name_last\n", - "race \n", - "asian 68672\n", - "hispanic 389609\n", - "nh_black 137271\n", - "nh_white 609707\n", - "other 115218" + "race name_last asian hispanic nh_black nh_white other total_n\n", + "0 A Arup 0.0 0.0 0.0 1.0 0.0 1.0\n", + "1 A Bitang 0.0 0.0 1.0 0.0 0.0 1.0\n", + "2 A De Feria 0.0 1.0 0.0 0.0 0.0 1.0\n", + "3 A F R Stephenson 0.0 0.0 0.0 1.0 0.0 1.0\n", + "4 A Felix 0.0 1.0 0.0 0.0 0.0 1.0\n", + "... ... ... ... ... ... ... ...\n", + "1056640 Zyzanski 0.0 0.0 0.0 1.0 0.0 1.0\n", + "1056641 Zyzdryn 0.0 0.0 0.0 1.0 0.0 2.0\n", + "1056642 Zyznomyrsky 0.0 0.0 0.0 1.0 0.0 1.0\n", + "1056643 Zzaman 0.0 0.0 0.0 0.0 1.0 2.0\n", + "1056644 Zzie 0.0 0.0 0.0 1.0 0.0 1.0\n", + "\n", + "[1056645 rows x 7 columns]" ] }, - "execution_count": 55, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.groupby('race').agg({'name_last':'nunique'})" + "# Summing the count of each name & race combination\n", + "gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])\n", + "# creating a pivot table so that each name has a count of the # of races with that last name\n", + "gdf = gdf.pivot_table(values='count', columns='race', index='name_last')\n", + "\n", + "# Converting NaN to zeros since that means there is no one that identifies with that race with that last name\n", + "gdf = gdf.fillna(0)\n", + "\n", + "gdf['total_n'] = gdf.sum(axis=1)\n", + "gdf.reset_index(inplace=True)\n", + "gdf.iloc[:, 1:-1] = gdf.iloc[:, 1:-1].div(gdf.total_n, axis=0)\n", + "gdf" ] }, { "cell_type": "code", - "execution_count": 56, - "id": "98ff85be-2f1e-4621-8aa8-11eb3965ecd4", - "metadata": { - "tags": [] - }, + "execution_count": 7, + "id": "25ebef7d-1af3-4039-8518-5ccef5f07c7c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['asian', 'hispanic', 'nh_black', 'nh_white', 'other']\n" + ] + } + ], + "source": [ + "races = sorted(sdf.race.unique().tolist())\n", + "print(races)\n", + "\n", + "def get_race_idx(val, races):\n", + " race_idx = races.index(val)\n", + " return race_idx" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "55c26fd4-55b4-4c9e-b918-06f124d4691c", + "metadata": {}, + "outputs": [], + "source": [ + "# for one set of analysis, we define 'true race/ethincity' = where max prob (so modal race = true race)\n", + "gdf['race'] = gdf[races].idxmax(axis=1)\n", + "gdf['race_code'] = gdf['race'].apply(lambda c: get_race_idx(c,races))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8704cd24-c082-453d-90cc-acd0101f6fd7", + "metadata": {}, "outputs": [], "source": [ - "df['race_code'] = df.race.factorize()[0]" + "gdf.to_csv(\"train_validation_test/fl_2022_lastname.csv.gz\", index = False, compression=\"gzip\")" ] }, { @@ -1143,17 +347,17 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 11, "id": "3a11b216-fde5-48c7-b5ec-904dbde4bb29", "metadata": {}, "outputs": [], "source": [ - "train_df, rest_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['race_code'])" + "train_df, rest_df = train_test_split(gdf, test_size=0.2, random_state=42, stratify=gdf['race_code'])" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 12, "id": "f7d44e05-63d5-47cc-85db-c59fc3e169f1", "metadata": { "tags": [] @@ -1165,7 +369,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 13, "id": "e47fa000-d58f-4360-9487-bd33c149433e", "metadata": { "tags": [] @@ -1179,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 14, "id": "fce465b6-0a17-437d-ad38-7d8036c74d0e", "metadata": { "tags": [] @@ -1189,9 +393,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "(1079199, 4)\n", - "(134900, 4)\n", - "(134900, 4)\n" + "(845316, 9)\n", + "(105664, 9)\n", + "(105665, 9)\n" ] } ], @@ -1203,87 +407,7 @@ }, { "cell_type": "code", - "execution_count": 61, - "id": "c30f722b-995b-4269-bac7-38bc4ba64999", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name_last
race
asian68672
hispanic389609
nh_black137271
nh_white609707
other115218
\n", - "
" - ], - "text/plain": [ - " name_last\n", - "race \n", - "asian 68672\n", - "hispanic 389609\n", - "nh_black 137271\n", - "nh_white 609707\n", - "other 115218" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby('race').agg({'name_last':'nunique'})" - ] - }, - { - "cell_type": "code", - "execution_count": 62, + "execution_count": 15, "id": "66f97f4e-4fda-44bc-8d3c-65fbe6eea9b6", "metadata": { "tags": [] @@ -1309,7 +433,7 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1320,39 +444,39 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
racename_last
asian5493829184
hispanic311687259689
nh_black10981783227
nh_white487765450098
other9570823118
\n", "" ], "text/plain": [ - " name_last\n", + "race name_last\n", "race \n", - "asian 54938\n", - "hispanic 311687\n", - "nh_black 109817\n", - "nh_white 487765\n", - "other 95708" + "asian 29184\n", + "hispanic 259689\n", + "nh_black 83227\n", + "nh_white 450098\n", + "other 23118" ] }, - "execution_count": 62, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1363,7 +487,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 16, "id": "7d9a625b-fd10-4374-b744-e809620c86d5", "metadata": { "tags": [] @@ -1389,7 +513,7 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1400,39 +524,39 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
racename_last
asian68673648
hispanic3896132461
nh_black1372710403
nh_white6097156262
other140682890
\n", "" ], "text/plain": [ - " name_last\n", + "race name_last\n", "race \n", - "asian 6867\n", - "hispanic 38961\n", - "nh_black 13727\n", - "nh_white 60971\n", - "other 14068" + "asian 3648\n", + "hispanic 32461\n", + "nh_black 10403\n", + "nh_white 56262\n", + "other 2890" ] }, - "execution_count": 63, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1443,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 17, "id": "e61fc0f4-1b6c-42a0-a1d5-cf7cbff8a290", "metadata": { "tags": [] @@ -1469,7 +593,7 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1480,39 +604,39 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
racename_last
asian68673648
hispanic3896132461
nh_black1372710404
nh_white6097156262
other140602890
\n", "" ], "text/plain": [ - " name_last\n", + "race name_last\n", "race \n", - "asian 6867\n", - "hispanic 38961\n", - "nh_black 13727\n", - "nh_white 60971\n", - "other 14060" + "asian 3648\n", + "hispanic 32461\n", + "nh_black 10404\n", + "nh_white 56262\n", + "other 2890" ] }, - "execution_count": 64, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1531,7 +655,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 18, "id": "ecc12a2c-bee0-49bd-b42d-ab8cb5589a15", "metadata": { "tags": [] @@ -1543,32 +667,10 @@ "test_df.to_csv(\"data/fl_2022_LastName_test.csv.gz\",index=False,compression=\"gzip\")" ] }, - { - "cell_type": "code", - "execution_count": 66, - "id": "aa9be3b5-ee0d-4935-9b21-14012c676235", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.3M\tdata/fl_2022_LastName_test.csv.gz\n", - "11M\tdata/fl_2022_LastName_train.csv.gz\n", - "1.3M\tdata/fl_2022_LastName_val.csv.gz\n" - ] - } - ], - "source": [ - "!du -sh data/fl_2022_LastName_*" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "2fb93bc9-448d-45e0-a976-9312bf94e708", + "id": "6b389e66-ee18-4ee6-b3d1-87c73859189c", "metadata": {}, "outputs": [], "source": [] @@ -1590,7 +692,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.8.10" } }, "nbformat": 4,