-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathorganize_cdc_data.py
82 lines (66 loc) · 4.57 KB
/
organize_cdc_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
'''This code performs all the operations to format the cdc data downloads to a standard data format. '''
import pandas as pd
import numpy as np
def organize(county_data, state_data, national_data):
# Import all datasets
df_county = pd.read_csv(county_data, sep='\t',
engine='python')
df_state = pd.read_csv(state_data, sep='\t', engine='python')
df_national = pd.read_csv(national_data, sep='\t',
engine='python')
# Organize the data
# drop unnecessary columns
df_county = df_county.drop(['Notes', 'Age Group', 'Crude Rate'], axis=1)
df_state = df_state.drop(['Notes', 'Age Group', 'Crude Rate'], axis=1)
df_national = df_national.drop(['Notes', 'Age Group', 'Crude Rate'], axis=1)
# drop rows full of nans
df_county = df_county.dropna(how='all')
df_state = df_state.dropna(how='all')
df_national = df_national.dropna(how='all')
df_county['State ANSI'] = (df_county['County Code'] / 1000).apply(np.floor) # extract state code from fpis code
df_county['County ANSI'] = df_county['County Code'] - (df_county['County Code'] / 1000).apply(
np.floor) * 1000 # extract county code from fpis code
df_county = df_county.drop(['County', 'County Code'], axis=1) # drop county and dounty code rows
df_state = df_state.drop(['State'], axis=1)
# Group ages into specified ranges
df_county = df_county.replace({'Age Group Code': {'1': '0-5', '1-4': '0-5', '5-9': '5-25', '10-14': '5-25',
'15-19': '5-25', '20-24': '5-25', '25-34': '25+', '35-44': '25+',
'45-54': '25+', '55-64': '25+', '65-74': '25+', '75-84': '25+',
'85+': '25+'}})
df_state = df_state.replace({'Age Group Code': {'1': '0-5', '1-4': '0-5', '5-9': '5-25', '10-14': '5-25',
'15-19': '5-25', '20-24': '5-25', '25-34': '25+', '35-44': '25+',
'45-54': '25+', '55-64': '25+', '65-74': '25+', '75-84': '25+',
'85+': '25+'}})
df_national = df_national.replace({'Age Group Code': {'1': '0-5', '1-4': '0-5', '5-9': '5-25', '10-14': '5-25',
'15-19': '5-25', '20-24': '5-25', '25-34': '25+',
'35-44': '25+', '45-54': '25+', '55-64': '25+',
'65-74': '25+', '75-84': '25+', '85+': '25+'}})
# Clean up data and reorder
# replace 'supprssed' and 'missing' with nans to allow for math operations
df_county = df_county.replace(
{'Deaths': {'Suppressed': np.nan, 'Missing': np.nan}, 'Population': {'Suppressed': np.nan, 'Missing': np.nan}})
df_state = df_state.replace(
{'Deaths': {'Suppressed': np.nan, 'Missing': np.nan}, 'Population': {'Suppressed': np.nan, 'Missing': np.nan}})
# set class of deaths and population to float
df_county['Deaths'] = df_county['Deaths'].astype(float)
df_county['Population'] = df_county['Population'].astype(float)
df_state['Deaths'] = df_state['Deaths'].astype(float)
# sum column values by state, then county, then age group code. needs to be done because multiple values for each combination due to higher granualrity in orginial data
df_county = df_county.groupby(['State ANSI', 'County ANSI', 'Age Group Code']).sum()
df_state = df_state.groupby(['State Code', 'Age Group Code']).sum()
df_national = df_national.groupby(['Age Group Code']).sum()
# unstack from multiindexing
df_county = df_county.unstack(level=2)
df_county.columns = ['_'.join(col) for col in df_county.columns.values]
df_county = df_county.reset_index()
df_county = df_county.sort_values(by=['State ANSI', 'County ANSI']) # resort rows
df_state = df_state.unstack(level=1)
df_state.columns = ['_'.join(col) for col in df_state.columns.values]
df_state = df_state.reset_index()
# calculate percent deaths for each age group
df_county['Percent Deaths 0-5'] = df_county['Deaths_0-5'] / df_county['Population_0-5']
df_county['Percent Deaths 25+'] = df_county['Deaths_25+'] / df_county['Population_25+']
df_county['Percent Deaths 5-25'] = df_county['Deaths_5-25'] / df_county['Population_5-25']
# Calculate the national death rate
df_national['Percent Deaths'] = df_national['Deaths'] / df_national['Population']
return [df_county, df_state, df_national]