-
Notifications
You must be signed in to change notification settings - Fork 3
/
split.py
80 lines (61 loc) · 3.17 KB
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#%%
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import numpy as np
def one_hot_encoding(pathology):
    """
    Build a row-wise indicator function for a single pathology.

    The returned function is meant to be used with ``DataFrame.apply(..., axis=1)``.
    When ``row['Finding Labels']`` is a string it is split on '|' and the pathology has
    to be equal to one of the resulting labels; a pathology name that is only a fragment
    of a longer label is therefore not reported as present. Any other value (e.g. a list
    of labels) is tested with a plain ``in``.

    NOTE(review): the '|' separator is assumed from the Data_Entry_2017.csv format this
    script reads -- confirm before reusing the helper on differently formatted labels.

    :param pathology: String, name of the pathology
    :return: function one_hot(row) that returns 1 if the row is labeled with pathology,
             0 otherwise
    """
    def one_hot(row):
        labels = row['Finding Labels']
        if isinstance(labels, str):
            # Exact label comparison instead of a substring search on the raw string.
            labels = [label.strip() for label in labels.split('|')]
        return int(pathology in labels)
    return one_hot
#%%
if __name__ == "__main__":
    ####################################################################################################################
    # %% Build Train Test Val Datasets
    ####################################################################################################################
    """
    # Local
    testListPath = "/home/user1/Documents/Data/ChestXray/Utils/test_list.txt"
    csvpath = "/home/user1/Documents/Data/ChestXray/Data_Entry_2017.csv"
    savepath = "/home/user1/Documents/Data/ChestXray"
    """
    # Server: test image list, metadata table, and folder receiving the generated files
    testListPath = "/network/data1/ChestXray-NIHCC-2/test_list.txt"
    csvpath = "/network/data1/ChestXray-NIHCC-2/Data_Entry_2017.csv"
    savepath = "/network/home/bertinpa/Documents/ChestXrays/Data"

    # Load the metadata table and the header-less test list (image names in its first column)
    Data = pd.read_csv(csvpath)
    testList = pd.read_csv(testListPath, header=None)[0].tolist()

    # Add one 0/1 column per pathology
    pathologies = [
        "Atelectasis", "Consolidation", "Infiltration",
        "Pneumothorax", "Edema", "Emphysema", "Fibrosis", "Effusion", "Pneumonia",
        "Pleural_Thickening", "Cardiomegaly", "Nodule", "Mass", "Hernia",
    ]
    for pathology in pathologies:
        is_labeled = one_hot_encoding(pathology)
        Data[pathology] = Data.apply(is_labeled, axis=1)

    # Test vs TrainVal: follow the test list so the split matches the one of the dataset authors
    in_test = Data["Image Index"].isin(testList)
    DataTest = Data.loc[in_test]
    DataTrainVal = Data.loc[~in_test]
    # Renumber the TrainVal rows from 0; reset_index is called without drop, so the previous
    # index is kept as an extra column.
    DataTrainVal.reset_index(inplace=True)

    # Train vs Validation: split the unique patient IDs, not the rows, so that every image of
    # a given patient ends up on the same side.
    patients = DataTrainVal['Patient ID'].unique().tolist()
    patients_train, patients_val = train_test_split(patients, test_size=0.11, random_state=14)
    patient_ids = DataTrainVal['Patient ID']
    DataTrain = DataTrainVal.loc[patient_ids.isin(patients_train)]
    DataVal = DataTrainVal.loc[patient_ids.isin(patients_val)]

    # Row labels of each subset inside the re-indexed DataTrainVal
    Train_idx_list = DataTrain.index.tolist()
    Val_idx_list = DataVal.index.tolist()

    print("validation proportion :", len(DataVal) / len(DataTrainVal))

    ####################################################################################################################
    # %% Save as csv
    ####################################################################################################################
    # Train/validation tables first, then the index lists, then the test table
    for frame, filename in (
        (DataTrainVal, "DataTrainVal.csv"),
        (DataTrain, "DataTrain.csv"),
        (DataVal, "DataVal.csv"),
    ):
        frame.to_csv(os.path.join(savepath, filename), index=False)

    np.save(os.path.join(savepath, "Train_Idx_List"), Train_idx_list)
    np.save(os.path.join(savepath, "Val_Idx_List"), Val_idx_list)

    DataTest.to_csv(os.path.join(savepath, "DataTest.csv"), index=False)