-
Notifications
You must be signed in to change notification settings - Fork 1
/
Dataset_stats.py
243 lines (176 loc) · 11.6 KB
/
Dataset_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import json
import pandas as pd
import pathlib
from typing import Dict, List, Callable
from Import import get_file_paths_pathlib, get_text_from_file, get_file_paths
from Models import *
def list_question_types_with_categories(input_path: str) -> Dict[str, List[str]]:
    ''' Returns a dictionary mapping each question type to the distinct descriptions seen for it.

    Every path returned by get_file_paths(input_path) is read and parsed as JSON;
    the parsed value is iterated as a sequence of question records. Records whose
    'question-type' is missing or empty are skipped, and a missing 'description'
    is recorded as the empty string. The result is ordered by question type and
    each description list keeps first-seen order.
    '''
    categories_by_type = {}
    for path in get_file_paths(input_path):
        records = json.loads(get_text_from_file(path))
        for record in records:
            kind = record.get('question-type', '')
            if kind == '':
                continue
            description = record.get('description', '')
            # Create the bucket on first sight, then append unseen descriptions only
            known_descriptions = categories_by_type.setdefault(kind, [])
            if description not in known_descriptions:
                known_descriptions.append(description)
    return dict(sorted(categories_by_type.items(), key=lambda pair: pair[0]))
def list_question_types(input_path: str) -> List[str]:
    ''' Returns the distinct, non-empty question types in order of first appearance.

    Every path returned by get_file_paths(input_path) is read and parsed as JSON;
    the 'question-type' value of each record is collected once.
    '''
    seen_types = []
    for path in get_file_paths(input_path):
        for record in json.loads(get_text_from_file(path)):
            kind = record.get('question-type', '')
            # Skip records without a type as well as types already collected
            if kind == '' or kind in seen_types:
                continue
            seen_types.append(kind)
    return seen_types
def list_question_type_ids(input_path: str) -> List[str]:
    ''' Returns the distinct, non-empty question type ids in order of first appearance.

    Every path returned by get_file_paths(input_path) is read and parsed as JSON;
    the 'ques_type_id' value of each record is collected once.
    '''
    seen_ids = []
    for path in get_file_paths(input_path):
        for record in json.loads(get_text_from_file(path)):
            type_id = record.get('ques_type_id', '')
            # Skip records without an id as well as ids already collected
            if type_id == '' or type_id in seen_ids:
                continue
            seen_ids.append(type_id)
    return seen_ids
def count_instances_of_each_question_type(input_path: str) -> Dict[str, int]:
    ''' Returns a dictionary mapping each question type to the number of records carrying it.

    Every path returned by get_file_paths(input_path) is read and parsed as JSON.
    The 'question-type' value of each record is converted to its string form
    before counting; records where that string is empty are skipped. The result
    is ordered by question type.
    '''
    tallies = {}
    for path in get_file_paths(input_path):
        for record in json.loads(get_text_from_file(path)):
            kind = str(record.get('question-type', ''))
            if kind == '':
                continue
            tallies[kind] = tallies.get(kind, 0) + 1
    return dict(sorted(tallies.items(), key=lambda pair: pair[0]))
def count_instances_of_question_subtypes_for_files(file_paths: List[pathlib.Path]) -> Dict[str, int]:
    ''' Returns a dictionary mapping each question subtype to its number of occurrences.

    Each path is converted to a string, read via get_text_from_file and parsed
    as JSON. A subtype label is the record's 'question-type', followed by
    " [<description>]" when the record has a non-empty 'description'. Records
    with a missing or empty 'question-type' are skipped. The result is ordered
    from most to least frequent.
    '''
    tallies = {}
    for path in file_paths:
        for record in json.loads(get_text_from_file(str(path))):
            label = record.get('question-type', '')
            if label == '':
                continue
            description = record.get('description', '')
            if description != '':
                label = label + f" [{description}]"
            tallies[label] = tallies.get(label, 0) + 1
    return dict(sorted(tallies.items(), key=lambda pair: pair[1], reverse=True))
def count_instances_of_question_subtypes(input_path: pathlib.Path) -> Dict[str, int]:
    ''' Returns the question subtype counts for all files found under input_path.

    The files are looked up with get_file_paths_pathlib and counted by
    count_instances_of_question_subtypes_for_files.
    '''
    return count_instances_of_question_subtypes_for_files(get_file_paths_pathlib(input_path))
def calculate_percentage_for_each_question_type(question_type_instances: Dict[str, int], sort_key: Callable, sort_reverse: bool = False) -> Dict[str, float]:
    ''' Returns a dictionary mapping each question type to its share of all instances in percent.

    Each percentage is the count divided by the sum of all counts, multiplied by
    100 and rounded to 4 decimal places. The resulting (question type, percentage)
    items are ordered with sort_key, descending when sort_reverse is True.
    '''
    grand_total = sum(question_type_instances.values())
    shares = {
        name: round(count / grand_total * 100, 4)
        for name, count in question_type_instances.items()
    }
    return dict(sorted(shares.items(), key=sort_key, reverse=sort_reverse))
def get_required_samples_for_each_question_type(sample_size: int, question_type_percentages: Dict[str, float], sort_key: Callable, reverse: bool = False) -> Dict[str, int]:
    ''' Returns a dictionary mapping each question type to the number of samples it should receive.

    The exact share of a question type is sample_size / 100 * percentage. Shares
    are converted to whole numbers with the largest-remainder method: every share
    is rounded down first, and the samples still missing to reach sample_size are
    handed out one each to the question types with the largest fractional parts.
    When the percentages add up to 100, the returned counts add up to sample_size.

    sample_size: total number of samples to distribute.
    question_type_percentages: percentage (0-100) per question type.
    sort_key: key function applied to the (question type, count) items of the result.
    reverse: sort the result in descending order when True.
    '''
    allocated = {}
    remainders = {}
    for question_type, percentage in question_type_percentages.items():
        whole, fraction = divmod(sample_size / 100 * percentage, 1)
        allocated[question_type] = int(whole)
        remainders[question_type] = fraction
    # Never negative: a negative slice bound would select the wrong question types
    leftover = max(0, sample_size - sum(allocated.values()))
    # sorted() is stable, so question types with equal remainders keep input order
    by_largest_remainder = sorted(remainders, key=remainders.get, reverse=True)
    for question_type in by_largest_remainder[:leftover]:
        allocated[question_type] += 1
    return dict(sorted(allocated.items(), key=sort_key, reverse=reverse))
def count_instances_of_question_subtypes_csv_files(file_paths: List[str], sort_key: Callable, reverse: bool = False) -> Dict[str, int]:
    ''' Returns a dictionary mapping each question subtype found in the CSV files to its number of occurrences.

    Each file is loaded with pandas.read_csv and must provide the columns
    'question_type' and 'description'. A subtype label is the question type,
    followed by " [<description>]" when the description is a non-empty string.
    Rows whose question type is not a non-empty string are skipped; a message is
    printed for such rows only when their index is even. The resulting
    (subtype, count) items are ordered with sort_key, descending when reverse is True.
    '''
    tallies = {}
    for csv_path in file_paths:
        frame = pd.read_csv(csv_path)
        for row_label, record in frame.iterrows():
            label = record['question_type']
            description = record['description']
            if not isinstance(label, str) or label == '':
                # Only rows with an even index are reported; odd ones are skipped silently
                if row_label % 2 == 0:
                    print(f"Question type is not string or empty string in file: {csv_path}, at index: {row_label}")
                continue
            if isinstance(description, str) and description != '':
                label = label + f" [{description}]"
            tallies[label] = tallies.get(label, 0) + 1
    return dict(sorted(tallies.items(), key=sort_key, reverse=reverse))
def count_instances_of_question_subtypes_csv(input_path: str, sort_key: Callable, reverse: bool = False) -> Dict[str, int]:
    ''' Returns the question subtype counts for all CSV files found under input_path.

    The files are looked up with get_file_paths and counted by
    count_instances_of_question_subtypes_csv_files, which also applies
    sort_key and reverse to the result.
    '''
    return count_instances_of_question_subtypes_csv_files(get_file_paths(input_path), sort_key, reverse)
def count_difference_of_required_samples_and_available_samples(required_samples: Dict[str, int], available_samples: Dict[str, int], sort_key: Callable, reverse: bool = False, include_sufficient_samples: bool = True) -> Dict[str, int]:
    ''' Returns the number of missing samples for each question (sub)type in required_samples.

    The value is required minus available, where a (sub)type absent from
    available_samples counts as 0 available; it is zero or negative when enough
    samples exist. With include_sufficient_samples=False only entries with a
    positive shortfall are kept. The resulting items are ordered with sort_key,
    descending when reverse is True.
    '''
    shortfalls = {
        name: needed - available_samples.get(name, 0)
        for name, needed in required_samples.items()
    }
    if not include_sufficient_samples:
        shortfalls = {name: gap for name, gap in shortfalls.items() if gap > 0}
    return dict(sorted(shortfalls.items(), key=sort_key, reverse=reverse))
def count_missing_predictions_per_subcategory(sample_size: int, distribution_path: str, output_path: str, model: ModelType, prompt_type: str):
    ''' Returns, per question subcategory, how many predictions are still missing for a model and prompt type.

    Prediction files are the paths under output_path that contain both
    model.value and "<prompt_type>.csv". The required number of predictions per
    subcategory is derived from the files under distribution_path, scaled to
    sample_size. Subcategories that already have enough predictions are included
    with a zero or negative value; the result is ordered by ascending difference.
    '''
    csv_suffix = f"{prompt_type}.csv"
    # Keep only the files that belong to the given model and prompt type
    prediction_files = [
        path for path in get_file_paths(output_path)
        if model.value in path and csv_suffix in path
    ]
    existing_predictions = count_instances_of_question_subtypes_csv_files(prediction_files, lambda item: item[1], False)
    # Required number of predictions for each question subcategory
    required_samples = count_required_predictions_per_subcategory(sample_size, distribution_path)
    return count_difference_of_required_samples_and_available_samples(required_samples, existing_predictions, sort_key=lambda item: item[1], include_sufficient_samples=True)
def count_required_predictions_per_subcategory(sample_size: int, distribution_path: pathlib.Path):
    ''' Returns the number of predictions required for each question subcategory.

    The subtype counts of the files under distribution_path are turned into
    percentages, and sample_size is then split according to those percentages.
    The result is ordered by ascending number of required predictions.
    '''
    def _by_value(item):
        # Order (subcategory, number) pairs by the number
        return item[1]

    subtype_counts = count_instances_of_question_subtypes(distribution_path)
    subtype_shares = calculate_percentage_for_each_question_type(subtype_counts, sort_key=_by_value)
    return get_required_samples_for_each_question_type(sample_size, subtype_shares, sort_key=_by_value, reverse=False)