data_preprocessing.py
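"""Preprocess FreeSurfer stats tables into a single ML-ready dataframe.

Loads per-hemisphere cortical thickness/area tables and subcortical volume
tables, merges them on subject id, normalizes features by eTIV, joins the
result with the MS studies table, and saves it to stats/ml_dataframe.csv.
"""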
import pandas as pd
# load the lh/rh cortical GM thickness/area tables and the MS studies table into dataframes
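# Assumption: each stats CSV has one row per subject with the subject id in its
# first column (as in aparcstats2table output); that column is renamed to 'id' below.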
rh_thickness = pd.read_csv('stats/rh_thickness.csv')
rh_thickness = rh_thickness.rename(columns={'rh.aparc.a2009s.thickness': 'id'})
lh_thickness = pd.read_csv('stats/lh_thickness.csv')
lh_thickness = lh_thickness.rename(columns={'lh.aparc.a2009s.thickness': 'id'})
rh_area = pd.read_csv('stats/rh_area.csv')
rh_area = rh_area.rename(columns={'rh.aparc.a2009s.area': 'id'})
lh_area = pd.read_csv('stats/lh_area.csv')
lh_area = lh_area.rename(columns={'lh.aparc.a2009s.area': 'id'})
ms_studies = pd.read_csv('stats/MS_studies.csv')
# load the subcortical volume tables (lh/rh amygdala, hippocampus, thalamus)
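# (These appear to be FreeSurfer subfield/nuclei volume tables: the id column is
# named 'Measure:volume' and the features include subfields such as 'fimbria'.)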
lh_amy = pd.read_csv('stats/lh_amy.csv')
lh_amy = lh_amy.add_prefix('lh_')
lh_amy = lh_amy.rename(columns={'lh_Measure:volume': 'id'})
rh_amy = pd.read_csv('stats/rh_amy.csv')
rh_amy = rh_amy.add_prefix('rh_')
rh_amy = rh_amy.rename(columns={'rh_Measure:volume': 'id'})
lh_hipp = pd.read_csv('stats/lh_hipp.csv')
lh_hipp = lh_hipp.add_prefix('lh_')
lh_hipp = lh_hipp.rename(columns={'lh_Measure:volume': 'id'})
rh_hipp = pd.read_csv('stats/rh_hipp.csv')
rh_hipp = rh_hipp.add_prefix('rh_')
rh_hipp = rh_hipp.rename(columns={'rh_Measure:volume': 'id'})
lh_thal = pd.read_csv('stats/lh_thal.csv')
lh_thal = lh_thal.add_prefix('lh_')
lh_thal = lh_thal.rename(columns={'lh_Measure:volume': 'id'})
rh_thal = pd.read_csv('stats/rh_thal.csv')
rh_thal = rh_thal.add_prefix('rh_')
rh_thal = rh_thal.rename(columns={'rh_Measure:volume': 'id'})
# sanity check: the first column should now be 'id'
print(rh_area.columns[0])
# drop duplicate whole-brain columns; eTIV is kept in rh_thickness only, so a single
# eTIV column survives the merges into gm_data and can be used for normalization
rh_area = rh_area.drop(columns=['BrainSegVolNotVent', 'eTIV'])
lh_area = lh_area.drop(columns=['BrainSegVolNotVent', 'eTIV'])
lh_thickness = lh_thickness.drop(columns=['BrainSegVolNotVent', 'eTIV'])
rh_thickness = rh_thickness.drop(columns=['BrainSegVolNotVent'])
# merge hemispheres and measures on subject id (pd.merge defaults to an inner join)
thickness = pd.merge(rh_thickness, lh_thickness, on='id')
area = pd.merge(rh_area, lh_area, on='id')
gm_data = pd.merge(thickness, area, on='id')
amygdala = pd.merge(rh_amy, lh_amy, on='id')
hippocampus = pd.merge(rh_hipp, lh_hipp, on='id')
thalamus = pd.merge(rh_thal, lh_thal, on='id')
subcortical_data = pd.merge(amygdala, hippocampus, on='id')
subcortical_data = pd.merge(subcortical_data, thalamus, on='id')
# map each subject id to its eTIV (estimated total intracranial volume)
eTIV_dict = dict(zip(gm_data['id'], gm_data['eTIV']))
# build feature lists, excluding the non-feature columns ('id', 'eTIV')
gm_features = gm_data.columns.drop(['id', 'eTIV'])
subcortical_features = subcortical_data.columns.drop('id')
# normalize each feature by the subject's eTIV (scaled by 1000) to correct for head size
for feature in gm_features:
gm_data[feature] = (gm_data[feature])*1000 / gm_data['eTIV']
for feature in subcortical_features:
subcortical_data[feature] = (subcortical_data[feature])*1000 / subcortical_data['id'].map(eTIV_dict)
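# Note: subjects present in the subcortical tables but missing from gm_data have
# no eTIV entry, so .map() returns NaN and their normalized values become NaN.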
# unify region names: some tables write '_and_' where others use '&'
gm_data = gm_data.rename(columns=lambda c: c.replace('_and_', '&'))
# collapse the resulting duplicate column names by summing them, then drop columns with missing values
gm_data = gm_data.groupby(level=0, axis=1).sum()
gm_data = gm_data.dropna(axis=1)
# create and save final dataframe
ml_dataframe = pd.merge(ms_studies, gm_data, on='id', how='outer')
print(ml_dataframe.shape)
ml_dataframe = pd.merge(ml_dataframe, subcortical_data, on='id', how='outer')
print(ml_dataframe.shape)
# drop the whole-hemisphere surface-area totals and the fimbria subfield columns
ml_dataframe = ml_dataframe.drop(columns=["rh_WhiteSurfArea_area", "lh_WhiteSurfArea_area", "lh_fimbria", "rh_fimbria"])
ml_dataframe.to_csv('stats/ml_dataframe.csv', index=False)
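# Optional sanity check (a minimal sketch, not part of the original pipeline):
# reload the saved file and report how many missing values the outer merges left.
check = pd.read_csv('stats/ml_dataframe.csv')
print('missing values in ml_dataframe:', int(check.isna().sum().sum()))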