-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
117 lines (87 loc) · 4.94 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
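# Build a per-patient table of indicator columns from the CSV files in data/:
# sample admissions.csv for subject_ids, then flag diagnoses (diagnoses_icd.csv)
# and gender / age group (patients.csv).
# The table is saved as a pickle file (data/master_df.pickle) after every step.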
import pandas as pd
import numpy as np
import pickle
def preprocessAdmissions(csv_path='data/admissions.csv',
                         out_path='data/master_df.pickle',
                         n=10000, random_state=42):
    """Sample admission rows and pickle the distinct subject_ids they contain.

    Args:
        csv_path: Admissions CSV to read; must have a 'subject_id' column.
        out_path: Where the one-column 'subject_id' dataframe is pickled.
        n: Number of admission *rows* to sample. Because one patient can have
            several admissions, the number of distinct patients written out
            may be smaller than n.
        random_state: Seed passed to DataFrame.sample for reproducibility.
    """
    # Read in the csv file
    df = pd.read_csv(csv_path)

    # DataFrame.sample (without replacement) raises ValueError when asked for
    # more rows than exist, so clamp the request to the size of the file.
    sample_size = min(n, len(df))
    df = df.sample(n=sample_size, random_state=random_state)

    # Keep one row per patient, with subject_id as the only column
    master_df = df[['subject_id']].drop_duplicates()

    # Save the subject_id dataframe as a pickle file
    with open(out_path, 'wb') as f:
        pickle.dump(master_df, f)
def preprocessDiagnoses(csv_path='data/diagnoses_icd.csv',
                        master_path='data/master_df.pickle'):
    """Add 0/1 diagnosis indicator columns to the pickled master dataframe.

    Loads the master dataframe from master_path, flags every subject_id that
    has at least one matching ICD code in csv_path, writes the dataframe back
    to master_path and prints the number of flagged patients per diagnosis.

    Args:
        csv_path: Diagnoses CSV with 'subject_id' and 'icd_code' columns.
        master_path: Pickle holding the master dataframe (read, then
            overwritten with the added columns).
    """
    # Get the master_df dataframe
    with open(master_path, 'rb') as f:
        master_df = pickle.load(f)

    # Read icd_code as text. Left to type inference, numeric-looking codes
    # such as 4019 can be parsed as integers (and lose leading zeros), so they
    # would never equal the string codes compared against below.
    df = pd.read_csv(csv_path, dtype={'icd_code': str})

    # Filter the dataframe to only include the subject_id in the master_df
    df = df[df['subject_id'].isin(master_df['subject_id'])]

    # Strip defensively in case the codes carry padding whitespace.
    codes = df['icd_code'].str.strip()

    # Indicator column -> ICD codes that set it to 1
    diagnosis_codes = {
        'Hypertension': ['4019', '4011', 'I10'],
        'Hypercholesterolemia': ['2720'],
        'Atherosclerosis': ['41401'],
    }
    for column, icd_codes in diagnosis_codes.items():
        diagnosed_ids = df.loc[codes.isin(icd_codes), 'subject_id']
        master_df[column] = (
            master_df['subject_id'].isin(diagnosed_ids).astype('int64')
        )

    # Dump the master_df as a pickle file
    with open(master_path, 'wb') as f:
        pickle.dump(master_df, f)

    # Count and print number of patients with atherosclerosis, hypertension, and hypercholesterolemia
    print('Number of patients with atherosclerosis: {}'.format(master_df['Atherosclerosis'].sum()))
    print('Number of patients with hypertension: {}'.format(master_df['Hypertension'].sum()))
    print('Number of patients with hypercholesterolemia: {}'.format(master_df['Hypercholesterolemia'].sum()))
def preprocessPatients(csv_path='data/patients.csv',
                       master_path='data/master_df.pickle'):
    """Add 0/1 gender and age-group indicator columns to the master dataframe.

    Loads the master dataframe from master_path, adds 'Male', 'Female',
    'Age <40', 'Age 40-59', 'Age 60-79' and 'Age 80+' columns based on the
    'gender' and 'anchor_age' columns of csv_path, writes the dataframe back
    to master_path and prints its shape and the size of each age group.
    Patients with no row in csv_path keep 0 in every added column.

    Args:
        csv_path: Patients CSV with 'subject_id', 'gender' and 'anchor_age'.
        master_path: Pickle holding the master dataframe (read, then
            overwritten with the added columns).
    """
    # Get the master_df dataframe
    with open(master_path, 'rb') as f:
        master_df = pickle.load(f)

    # Read in the csv file
    df = pd.read_csv(csv_path)

    # Filter the dataframe to only include the subject_id in the master_df
    df = df[df['subject_id'].isin(master_df['subject_id'])]

    age = df['anchor_age']
    # Indicator column -> boolean mask over the patient rows that set it to 1.
    # Series.between includes both end points by default; the boolean
    # inclusive=True form is rejected by pandas 2.x, so it is not passed.
    indicator_masks = {
        'Male': df['gender'].isin(['M']),
        'Female': df['gender'].isin(['F']),
        'Age <40': age < 40,
        'Age 40-59': age.between(40, 59),
        'Age 60-79': age.between(60, 79),
        'Age 80+': age >= 80,
    }
    for column, mask in indicator_masks.items():
        matching_ids = df.loc[mask, 'subject_id']
        master_df[column] = (
            master_df['subject_id'].isin(matching_ids).astype('int64')
        )

    # Dump the master_df as a pickle file
    with open(master_path, 'wb') as f:
        pickle.dump(master_df, f)

    # Print size of dataframe
    print('Size of dataframe: {}'.format(master_df.shape))
    # Print number of patients in each age group
    print('Number of patients in age group <40: {}'.format(master_df['Age <40'].sum()))
    print('Number of patients in age group 40-59: {}'.format(master_df['Age 40-59'].sum()))
    print('Number of patients in age group 60-79: {}'.format(master_df['Age 60-79'].sum()))
    print('Number of patients in age group 80+: {}'.format(master_df['Age 80+'].sum()))
# Run the pipeline stages in order: the diagnoses and patients stages each
# reload the pickle written by the stage before them and extend it.
for _stage in (preprocessAdmissions, preprocessDiagnoses, preprocessPatients):
    _stage()