-
Notifications
You must be signed in to change notification settings - Fork 0
/
MIDL_calculate_f1_score.py
248 lines (210 loc) · 12 KB
/
MIDL_calculate_f1_score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import argparse
import json
import os
from collections import Counter

import numpy as np
from shapely.geometry import Polygon, Point
def _test_set_basenames():
    """Return the basenames (without extension) of the test-set GeoJSON files.

    The test set is metastasis images 181-200 and primary images 081-100.
    """
    metastasis = {f"metastasis_image_{idx}_cell" for idx in range(181, 201)}
    primary = {f"primary_image_{idx:03d}_cell" for idx in range(81, 101)}
    return metastasis | primary


def _mapped_category(feature, dict_classes):
    """Map a feature's classification name onto the shared label set.

    Features without a classification, or with a name missing from
    ``dict_classes``, fall back to ``'cell_other'``.
    """
    properties = feature.get('properties') or {}
    classification = properties.get('classification') or {}
    name = classification.get('name', 'cell_other')
    return dict_classes.get(name, 'cell_other')


def _nucleus_records(features, geojson_name, dict_classes):
    """Build one record per 'cell' feature from its ``nucleusGeometry`` polygon."""
    records = []
    for feature in features:
        properties = feature.get('properties') or {}
        # Non-cell objects (e.g. the unclassified rectangle NN192 emits in
        # some images) must not be counted as detections.
        if properties.get('objectType') != 'cell':
            continue
        rings = (feature.get('nucleusGeometry') or {}).get('coordinates') or []
        if not rings:
            # A cell without a nucleus outline has no usable centroid.
            continue
        records.append({
            'filename': geojson_name,
            'category': _mapped_category(feature, dict_classes),
            'centroid': Polygon(rings[0]).centroid,
            'score': 1.0,
        })
    return records


def _geometry_records(features, geojson_name, dict_classes):
    """Build one record per polygon found in each feature's ``geometry``."""
    records = []
    for feature in features:
        geometry = feature.get('geometry') or {}
        geometry_type = geometry.get('type')
        if geometry_type == "Polygon":
            polygons = [geometry["coordinates"]]
        elif geometry_type == "MultiPolygon":
            polygons = geometry["coordinates"]
        else:
            # Points, lines, null geometries etc. carry no cell outline.
            continue

        properties = feature.get('properties') or {}
        classification = properties.get('classification') or {}
        if 'score' in classification:
            score = classification['score']
        elif 'type_prob' in properties:
            # HoverNet stores its class probability under 'type_prob'.
            score = properties['type_prob']
        else:
            score = 1.0

        category = _mapped_category(feature, dict_classes)
        for polygon_coords in polygons:
            exterior_ring = [tuple(coord) for coord in polygon_coords[0]]
            interior_rings = [
                [tuple(coord) for coord in interior]
                for interior in polygon_coords[1:]
            ]
            records.append({
                'filename': geojson_name,
                'category': category,
                'centroid': Polygon(exterior_ring, interior_rings).centroid,
                'score': score,
            })
    return records


def process_geojsons(folder, dict_classes, NN192 = False):
    """Load the test-set GeoJSON files in ``folder`` and extract cell centroids.

    Only files whose basename is in the fixed test set and whose name ends
    with ``_cell.geojson`` are read.

    Args:
        folder: Directory containing the GeoJSON files.
        dict_classes: Mapping from source class names to the shared labels;
            unknown or missing names become ``'cell_other'``.
        NN192: Kept for backward compatibility and not consulted. The file
            format is detected per file: if any feature carries a
            ``nucleusGeometry`` key, centroids come from the nucleus outline
            of features whose ``objectType`` is ``'cell'``; otherwise they
            come from every Polygon/MultiPolygon ``geometry``.

    Returns:
        dict mapping file basename -> list of records, each a dict with the
        keys ``'filename'``, ``'category'``, ``'centroid'`` and ``'score'``.
        A file with no usable features maps to an empty list.
    """
    allowed = _test_set_basenames()
    centroid_dictionary = {}
    # Sorted so the processing order does not depend on the filesystem.
    for geojson in sorted(os.listdir(folder)):
        geojson_basename = os.path.basename(geojson).split('.')[0]
        if geojson_basename not in allowed:
            continue
        if not geojson.endswith('_cell.geojson'):
            continue

        with open(os.path.join(folder, geojson), encoding='utf-8') as f:
            data = json.load(f)
        features = data.get('features') or []

        # Look at every feature, not just the first one: the first object in
        # a file may be a non-cell annotation without a nucleus outline.
        if any('nucleusGeometry' in feature for feature in features):
            records = _nucleus_records(features, geojson_basename, dict_classes)
        else:
            records = _geometry_records(features, geojson_basename, dict_classes)
        centroid_dictionary[geojson_basename] = records
    return centroid_dictionary
def calculate_centroid_distance(dict_ground_truth, dict_pred, max_distance=15):
    """Match each ground-truth cell to at most one prediction of the same class.

    Predictions are candidates for a ground-truth feature only if they share
    its ``'filename'`` and ``'category'`` and their centroid lies strictly
    closer than ``max_distance``. Among the candidates the one with the
    highest ``'score'`` wins, ties broken by the smallest distance. A matched
    prediction is consumed and cannot be matched again.

    Matching is greedy in ground-truth order; it is not a globally optimal
    assignment.

    Args:
        dict_ground_truth: Mapping file key -> list of ground-truth records
            with ``'filename'``, ``'category'`` and ``'centroid'``.
        dict_pred: Mapping file key -> list of prediction records with
            ``'filename'``, ``'category'``, ``'centroid'`` and ``'score'``.
        max_distance: Exclusive upper bound on the centroid distance, in the
            coordinate units of the centroids. Defaults to 15.

    Returns:
        dict mapping each ground-truth file key to the list of its matches.
        Every match is a dict with the keys ``'pred_geojson'``,
        ``'gt_category'``, ``'pred_category'``, ``'distance'``,
        ``'pred_score'`` and ``'pred_feature'``.
    """
    # Group predictions by (filename, category) so each ground-truth feature
    # only scans predictions it could possibly match. The grouped lists are
    # fresh objects, so consuming matches never mutates ``dict_pred``.
    pred_structure = {}
    for pred_features in dict_pred.values():
        for feature in pred_features:
            key = (feature['filename'], feature['category'])
            pred_structure.setdefault(key, []).append(feature)

    results_dict = {}
    for gt_geojson, gt_features in dict_ground_truth.items():
        print('Processing', gt_geojson)
        matches = results_dict.setdefault(gt_geojson, [])
        for gt_feature in gt_features:
            match_key = (gt_feature['filename'], gt_feature['category'])
            candidates = pred_structure.get(match_key)
            if not candidates:
                continue

            gt_centroid = gt_feature['centroid']
            best_index = None
            best_rank = None
            best_distance = None
            for index, pred_feature in enumerate(candidates):
                distance = gt_centroid.distance(pred_feature['centroid'])
                # Written as "not <" so that a NaN distance is rejected too.
                if not distance < max_distance:
                    continue
                # Highest score first, then nearest; strict "<" keeps the
                # earliest candidate on a full tie.
                rank = (-pred_feature['score'], distance)
                if best_rank is None or rank < best_rank:
                    best_index, best_rank, best_distance = index, rank, distance

            if best_index is None:
                continue

            # Pop by position so the winning prediction cannot be reused.
            matched = candidates.pop(best_index)
            matches.append({
                'pred_geojson': matched['filename'],
                'gt_category': gt_feature['category'],
                'pred_category': matched['category'],
                'distance': best_distance,
                'pred_score': matched['score'],
                'pred_feature': matched,
            })
    return results_dict
def _precision_recall_f1(tp, fp, fn):
    """Return ``(precision, recall, f1)`` for the given counts, 0.0 when undefined."""
    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    if precision + recall > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0.0
    return precision, recall, f1_score


def calculate_classification_metrics(results_mask_rcnn, dict_ground_truth, dict_mask_rcnn):
    """Compute per-class, micro and macro precision/recall/F1.

    Args:
        results_mask_rcnn: Mapping file key -> list of matches; each match's
            ``'pred_category'`` counts as one true positive for that class.
        dict_ground_truth: Mapping file key -> list of ground-truth records;
            each record's ``'category'`` counts towards that class's total.
        dict_mask_rcnn: Mapping file key -> list of prediction records; each
            record's ``'category'`` counts towards that class's predictions.

    Returns:
        dict with one entry per category seen in the ground truth or the
        predictions (``'TP'``, ``'FP'``, ``'FN'``, ``'precision'``,
        ``'recall'``, ``'f1_score'``), plus ``'micro'`` (precision, recall
        and F1 over the summed counts) and ``'macro'`` (unweighted mean of
        the per-class F1 scores). A category literally named ``'micro'`` or
        ``'macro'`` would be overwritten by those summary entries. With no
        categories at all, every summary value is 0.0.
    """
    tp_counts = Counter(
        match['pred_category']
        for matches in results_mask_rcnn.values()
        for match in matches
    )
    gt_counts = Counter(
        record['category']
        for records in dict_ground_truth.values()
        for record in records
    )
    pred_counts = Counter(
        record['category']
        for records in dict_mask_rcnn.values()
        for record in records
    )

    results = {}
    micro_tp = micro_fp = micro_fn = 0
    for category in sorted(set(gt_counts) | set(pred_counts)):
        # Counter returns 0 for categories it has never seen.
        tp = tp_counts[category]
        fp = pred_counts[category] - tp
        fn = gt_counts[category] - tp
        micro_tp += tp
        micro_fp += fp
        micro_fn += fn
        precision, recall, f1_score = _precision_recall_f1(tp, fp, fn)
        results[category] = {
            'TP': tp, 'FP': fp, 'FN': fn,
            'precision': precision, 'recall': recall, 'f1_score': f1_score,
        }

    # Macro F1 must be taken before the summary entries are added below.
    per_class_f1 = [metrics['f1_score'] for metrics in results.values()]
    macro_f1_score = sum(per_class_f1) / len(per_class_f1) if per_class_f1 else 0.0

    micro_precision, micro_recall, micro_f1_score = _precision_recall_f1(
        micro_tp, micro_fp, micro_fn
    )
    results['micro'] = {
        'precision': micro_precision, 'recall': micro_recall, 'f1_score': micro_f1_score,
    }
    results['macro'] = {
        'f1_score': macro_f1_score,
    }
    return results
if __name__ == "__main__":
    # Command-line interface: two positional folder paths.
    cli = argparse.ArgumentParser(description="Process GeoJSON files and calculate metrics.")
    cli.add_argument("ground_truth_folder", type=str, help="Folder containing ground truth GeoJSON files.")
    cli.add_argument("prediction_folder", type=str, help="Folder containing prediction GeoJSON files.")
    args = cli.parse_args()

    # Class names emitted by the NN192 model.
    nn192_labels = {
        'Tumor': 'cell_tumor',
        'Stroma': 'cell_stroma',
        'Immune cells': 'cell_lymphocyte',
        'Other': 'cell_other',
    }
    # HoverNet PanNuke GeoJSONs already use the target labels; map them to themselves.
    passthrough_labels = {
        label: label
        for label in ('cell_tumor', 'cell_stroma', 'cell_lymphocyte', 'cell_other')
    }
    # Extra classes of the melanoma dataset, folded into the target labels.
    melanoma_labels = {'cell_plasma_cell': 'cell_lymphocyte'}
    melanoma_labels.update({
        label: 'cell_other'
        for label in (
            'cell_histiocyte',
            'cell_melanophage',
            'cell_endothelium',
            'cell_epithelium',
            'cell_neutrophil',
            'cell_apoptosis',
        )
    })
    dict_classes = {**nn192_labels, **passthrough_labels, **melanoma_labels}

    # Load centroids for both folders; the file format is detected per file.
    ground_truth = process_geojsons(args.ground_truth_folder, dict_classes)
    predictions = process_geojsons(args.prediction_folder, dict_classes)

    # Match predictions to ground truth, then score the matches.
    matches = calculate_centroid_distance(ground_truth, predictions)
    metrics = calculate_classification_metrics(matches, ground_truth, predictions)
    print(metrics)

# Usage:
#   python MIDL_calculate_f1_score.py /path/to/ground_truth_folder /path/to/prediction_folder