-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse_all_cause_mortality2.py
159 lines (124 loc) · 8.55 KB
/
parse_all_cause_mortality2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import fix_differences as diff
import pandas as pd
import numpy as np
import cdc_estimation_functions as rg
# --- Load CDC WONDER all-cause mortality extracts (2016) at three geographic levels ---
# The exports are tab-separated text; engine='python' is presumably used to tolerate the
# footer/metadata rows CDC WONDER appends to downloads — TODO confirm against the files.
df_county = pd.read_csv(r'CDC Wonder Data/All-cause mortality at county level 2016.txt', sep='\t',
                        engine='python')
df_state = pd.read_csv(r'CDC Wonder Data/All-cause mortality at state level 2016.txt', sep='\t', engine='python')
df_national = pd.read_csv(r'CDC Wonder Data/All-cause mortality at national level 2016.txt', sep='\t',
                          engine='python')
# --- Organize the data ---
# Drop columns not needed for the age-bucketed mortality rates.
df_county = df_county.drop(['Notes', 'Age Group', 'Crude Rate'], axis=1)
df_state = df_state.drop(['Notes', 'Age Group', 'Crude Rate'], axis=1)
df_national = df_national.drop(['Notes', 'Age Group', 'Crude Rate'], axis=1)
# Drop rows that are entirely NaN (footer/metadata rows in the WONDER export).
df_county = df_county.dropna(how='all')
df_state = df_state.dropna(how='all')
df_national = df_national.dropna(how='all')
# A county FIPS code is SSCCC: the state code is the first two digits,
# the county code the last three.
df_county['State ANSI'] = (df_county['County Code'] / 1000).apply(np.floor)  # extract state code from FIPS code
df_county['County ANSI'] = df_county['County Code'] - (df_county['County Code'] / 1000).apply(
    np.floor) * 1000  # extract county code from FIPS code
df_county = df_county.drop(['County', 'County Code'], axis=1)  # drop county name and county-code columns
df_state = df_state.drop(['State'], axis=1)
# --- Clean up data and reorder ---
# Replace 'Suppressed' and 'Missing' markers with NaN to allow for math operations.
df_county = df_county.replace(
    {'Deaths': {'Suppressed': np.nan, 'Missing': np.nan}, 'Population': {'Suppressed': np.nan, 'Missing': np.nan}})
df_state = df_state.replace(
    {'Deaths': {'Suppressed': np.nan, 'Missing': np.nan}, 'Population': {'Suppressed': np.nan, 'Missing': np.nan}})
# Cast counts to float so the groupby sums below behave numerically.
df_county['Deaths'] = df_county['Deaths'].astype(float)
df_county['Population'] = df_county['Population'].astype(float)
df_state['Deaths'] = df_state['Deaths'].astype(float)
# FIX: the original never converted df_state['Population'], leaving it an object column
# after the string replacement above, so groupby().sum() could silently drop it or
# concatenate strings. errors='coerce' turns any remaining non-numeric marker into NaN
# instead of raising, matching the county-table treatment.
df_state['Population'] = pd.to_numeric(df_state['Population'], errors='coerce')
# Sum by state, then county, then age-group code: the source data is more granular,
# so several rows can exist per (geography, age-group) combination.
df_county = df_county.groupby(['State ANSI', 'County ANSI', 'Age Group Code']).sum()
df_state = df_state.groupby(['State Code', 'Age Group Code']).sum()
df_national = df_national.groupby(['Age Group Code']).sum()
# --- Unstack the (geography, age-group) MultiIndex into one column per age group ---
df_county = df_county.unstack(level=2)
categories = df_county.columns.values
ages = set([col[1] for col in categories])  # distinct age-group codes; drives every per-age loop below
df_county.columns = ['_'.join(col) for col in df_county.columns.values]  # e.g. ('Deaths', '1-4') -> 'Deaths_1-4'
df_county = df_county.reset_index()
df_county = df_county.sort_values(by=['State ANSI', 'County ANSI'])  # resort rows
df_state = df_state.unstack(level=1)
df_state.columns = ['_'.join(col) for col in df_state.columns.values]
df_state = df_state.reset_index()
# National death rate per age group (deaths / population); fallback used in round-1 estimation.
df_national['Percent Deaths'] = df_national['Deaths']/df_national['Population']
# --- First output: aggregated bucket rates WITHOUT any estimation of suppressed values ---
# FIX: the original wrote `df_temp = df_county`, aliasing the frame, so the three
# 'Percent Deaths_*' columns added on the next line were silently also added to
# df_county. The rest of the script does not depend on that side effect (the same
# three columns are re-added at the same trailing positions before the final output),
# so taking an explicit copy is safe and removes the hidden coupling.
df_temp = df_county.copy()
df_temp[['Percent Deaths_0-5', 'Percent Deaths_5-25', 'Percent Deaths_25+']] = [np.nan, np.nan, np.nan]
# Positional reorder so the deaths/population columns of each coarse bucket are contiguous;
# the indices assume the exact column layout produced by the unstack above.
df_temp = pd.concat([df_temp.iloc[:, 0:4], df_temp.iloc[:, 10], df_temp.iloc[:, 4:10], df_temp.iloc[:, 11:17], df_temp.iloc[:, 23], df_temp.iloc[:, 17:23], df_temp.iloc[:, 24:]], axis=1)
df_temp[df_temp==0] = np.nan  # zeros are suppressed counts after the groupby sum; treat as missing
# skipna=False: if any component age group is missing, the whole bucket rate is NaN.
df_temp['Percent Deaths_0-5'] = df_temp.iloc[:, 2:4].sum(axis=1, skipna=False)/df_temp.iloc[:, 15:17].sum(axis=1, skipna=False)
df_temp['Percent Deaths_5-25'] = df_temp.iloc[:, 4:8].sum(axis=1, skipna=False)/df_temp.iloc[:, 17:21].sum(axis=1, skipna=False)
df_temp['Percent Deaths_25+'] = df_temp.iloc[:, 8:15].sum(axis=1, skipna=False)/df_temp.iloc[:, 21:28].sum(axis=1, skipna=False)
df_temp = pd.concat([df_temp.iloc[:,0:2], df_temp.iloc[:, 28:]], axis=1)  # keep the ANSI ids + the three bucket rates
df_temp = diff.fix(df_temp, 1, 1, 1)  # fill in missing counties
df_temp = df_temp.round(5)  # round data
df_temp.to_csv(r'Parsed data/All Cause Mortality without Estimates.csv', index = False)
# --- Round-1 estimation: fill suppressed county rates from state-level residuals ---
# Raw per-age death rate for every county (NaN where deaths or population were suppressed).
for age in ages:
    df_county['Percent Deaths ' + age] = df_county['Deaths_' + age]/df_county['Population_' + age]
    # Per-state scratch columns: CalDt = deaths accounted for by this state's counties,
    # CalPop = population of its suppressed counties, CalPer = residual rate (filled below).
    df_state[['CalDt' + age, 'CalPop' + age, 'CalPer' + age]] = [np.nan, np.nan, np.nan]
i = 0
for i in range(df_state.shape[0]):
    lookat = df_county[df_county['State ANSI'] == df_state.loc[i, 'State Code']]  # extract a subtable for each state
    for age in ages:
        # Deaths reported at county level (suppressed counties contribute 0 after the groupby sum).
        df_state.loc[i, 'CalDt' + age] = lookat.loc[:, 'Deaths_' + age].sum()
        # Population of the counties whose death count is suppressed (== 0 after groupby).
        df_state.loc[i, 'CalPop' + age] = lookat[lookat['Deaths_' + age] == 0]['Population_' + age].sum()
for age in ages:
    #df_state['CalPer' + age] = df_state['CalDt' + age]/df_state['CalPop' + age][~(df_state['CalPop' + age] == 0)]
    # Residual rate: state deaths not accounted for by counties, spread over suppressed-county population.
    df_state['CalPer' + age] = (df_state['Deaths_' + age] - df_state['CalDt' + age])/df_state['CalPop' + age]
    i = 0
    # loop through all counties in county table
    for i in range(df_county.shape[0]):
        if df_county.loc[i, 'Deaths_' + age] == 0:  # suppressed county: estimate its rate
            state = df_state[df_state['State Code'] == df_county.loc[i, 'State ANSI']].reset_index()
            if state.loc[0, 'CalPer' + age] != 0:
                df_county.loc[i, 'Percent Deaths ' + age] = state.loc[0, 'CalPer' + age]
            else:
                # No state residual available; fall back to the national rate for this age group.
                df_county.loc[i, 'Percent Deaths ' + age] = df_national.loc[age, 'Percent Deaths']
df_temp2 = df_county.round(5)  # round data
df_temp2.to_csv(r'Parsed data/All Cause Mortality with r1 Estimates.csv', index = False)
# --- Round-2 estimation: pool unreliable (<20 deaths) counties at state or regional level ---
# Presumably maps a region name -> collection of state ANSI codes — verify in
# cdc_estimation_functions.
regional_groupings = rg.create_regional_hashmap()
for age in ages:
    # Counties with a positive death count below the reliability threshold of 20.
    df_county_sub_20 = df_county[(df_county['Deaths_' + age] < 20) & (df_county['Deaths_' + age] > 0)]
    df_county_sub_20_regional = df_county_sub_20
    i = 0
    for i in range(df_state.shape[0]):
        state = df_county_sub_20_regional[df_county_sub_20_regional['State ANSI'] == df_state.iloc[i, 0]]
        if state.loc[:, 'Deaths_' + age].sum() >= 20:
            # This state's pooled sub-20 counties are reliable on their own, so they will be
            # handled at state level: drop all counties with the current state code from the
            # regional pool.
            df_county_sub_20_regional = df_county_sub_20_regional[df_county_sub_20_regional['State ANSI'] != df_state.iloc[i, 0]]
    # Counties pooled at state level = sub-20 counties not left in the regional pool.
    df_county_sub_20_state = df_county_sub_20.drop(df_county_sub_20_regional.index)
    i = 0
    for i in range(df_county.shape[0]):  # loop through all counties
        if (df_county.loc[i, 'Deaths_' + age] < 20) & (df_county.loc[i, 'Deaths_' + age] > 0):  # county death count is unreliable
            if df_county.loc[i, 'State ANSI'] in list(df_county_sub_20_state['State ANSI']):
                # Pool within the state: rate = summed deaths / summed population over the
                # state's sub-20 counties.
                current = df_county_sub_20_state[df_county_sub_20_state['State ANSI'] == df_county.loc[i, 'State ANSI']]
                df_county.loc[i, 'Percent Deaths ' + age] = current.loc[:, 'Deaths_' + age].sum()/current.loc[:, 'Population_' + age].sum()
            else:
                # Pool across the county's region instead.
                region = rg.get_region_from_state(df_county.loc[i, 'State ANSI'], regional_groupings)
                current = df_county_sub_20_regional[df_county_sub_20_regional['State ANSI'].isin(regional_groupings[region])]
                # FIX: removed a leftover debug print of the pooled table that fired only for
                # region == 'northeast' and age == '25-34'.
                df_county.loc[i, 'Percent Deaths ' + age] = current.loc[:, 'Deaths_' + age].sum() / current.loc[:, 'Population_' + age].sum()
df_temp3 = df_county.round(5)  # round data
df_temp3.to_csv(r'Parsed data/All Cause Mortality with r2 Estimates.csv', index = False)
# --- Final pass: back-fill estimated death counts, then rebuild the bucketed rates ---
# For suppressed counties (Deaths == 0), reconstruct an estimated death count from the
# estimated per-age rate produced by the passes above.
for age in ages:
    i = 0
    for i in range(df_county.shape[0]):
        if df_county.loc[i, 'Deaths_' + age] == 0:
            df_county.loc[i, 'Deaths_' + age] = df_county.loc[i, 'Population_' + age] * df_county.loc[i, 'Percent Deaths ' + age]
# The per-age rate columns were only scratch space; drop them before the final reorder.
for age in ages:
    df_county = df_county.drop(['Percent Deaths ' + age], axis=1)
df_county[['Percent Deaths_0-5', 'Percent Deaths_5-25', 'Percent Deaths_25+']] = [np.nan, np.nan, np.nan]
# Same positional reorder and bucket-rate computation as the "without Estimates" output
# earlier in this script; the iloc indices assume the identical column layout.
df_county = pd.concat([df_county.iloc[:, 0:4], df_county.iloc[:, 10], df_county.iloc[:, 4:10], df_county.iloc[:, 11:17], df_county.iloc[:, 23], df_county.iloc[:, 17:23], df_county.iloc[:, 24:]], axis=1)
df_county[df_county==0] = np.nan  # any remaining zeros are treated as missing
# skipna=False: a bucket rate is NaN if any component age group is still missing.
df_county['Percent Deaths_0-5'] = df_county.iloc[:, 2:4].sum(axis=1, skipna=False)/df_county.iloc[:, 15:17].sum(axis=1, skipna=False)
df_county['Percent Deaths_5-25'] = df_county.iloc[:, 4:8].sum(axis=1, skipna=False)/df_county.iloc[:, 17:21].sum(axis=1, skipna=False)
df_county['Percent Deaths_25+'] = df_county.iloc[:, 8:15].sum(axis=1, skipna=False)/df_county.iloc[:, 21:28].sum(axis=1, skipna=False)
df_county = pd.concat([df_county.iloc[:,0:2], df_county.iloc[:, 28:]], axis=1)  # keep ANSI ids + bucket rates
df_county = diff.fix(df_county, 1, 1, 1)  # fill in missing counties
df_county = df_county.round(5)  # round data
df_county.to_csv(r'Parsed data/All Cause Mortality with Estimates.csv', index = False)