run_mock.py
# Standard libraries
import json
import multiprocessing
import random
import time
from datetime import datetime
from functools import partial
from itertools import product
from pathlib import Path

# Third-party libraries
import numpy as np
import pandas as pd

# Own modules
from classes import Datapoint, MOCKGenotype, PartialClust
import precompute
import evaluation
import objectives
import delta_mock
import utils
import tests
def load_data(exp_name, data_folder, validate, data_subset=None):
# Generate experiment name if not given
if exp_name is None:
exp_name = f"experiment_{datetime.today().strftime('%Y%m%d')}"
# Create the folder to store the results
experiment_folder = Path.cwd() / "experiments" / exp_name
# Warn if already made
if experiment_folder.is_dir():
print(f"{experiment_folder} already exists, results may be overwritten")
    # Only create the folder for a normal run (validation does not save results)
    if validate is None:
        experiment_folder.mkdir(parents=True, exist_ok=True)
    # Turn the data folder into a Path
    # Splitting a relative path like this keeps it cross-platform
    data_folder = Path.cwd().joinpath(*data_folder.split("/"))
# Check if the data_folder exists
if not data_folder.is_dir():
raise NotADirectoryError(f"{data_folder} cannot be found")
    # Filter the datasets in the folder if a subset string is given
    if not data_subset:
        data_file_paths = data_folder.glob("*")
    else:
        data_file_paths = data_folder.glob(f"*{data_subset}*")
# Return the data and the base experiment folder
return list(data_file_paths), experiment_folder
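# A minimal usage sketch for load_data (the values shown are illustrative,
# not taken from a real config):
#   paths, exp_folder = load_data(
#       exp_name=None,             # None date-stamps the experiment name
#       data_folder="data/synthetic",
#       validate=None,             # None = normal run, so the folder is created
#       data_subset="UKC",         # keeps only files matching *UKC*
#   )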
def prepare_data(file_path):
    # Use pathlib so the filename handling works across platforms
    file_path = Path(file_path)
    # Get prettier names for the data
    if "tevc" in str(file_path):
        Datapoint.data_name = file_path.stem[:-15]
    elif "UKC" in str(file_path):
        Datapoint.data_name = file_path.stem
    # The current data format has a four-line integer metadata header
    with open(file_path) as file:
        head = [int(next(file).strip()) for _ in range(4)]
# Read the data in as an array
# The skip_header is for the header info in this data specifically
data = np.genfromtxt(file_path, delimiter="\t", skip_header=4)
# Set the values for the data
Datapoint.num_examples = head[0] # Num examples
Datapoint.num_features = head[1] # Num features/dimensions
Datapoint.k_user = head[3] # Num real clusters
print("Num examples:", Datapoint.num_examples)
print("Num features:", Datapoint.num_features)
print("Num (actual) clusters:", Datapoint.k_user)
    # Do we have labels? (head[2] is the labels flag)
    Datapoint.labels = head[2] == 1
# Remove labels if present and create data_dict
data, data_dict = Datapoint.create_dataset_garza(data)
return data, data_dict
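# For reference, prepare_data expects a tab-delimited file with a four-line
# integer header. A hypothetical example (values illustrative only):
#   1000    <- number of examples
#   2       <- number of features/dimensions
#   1       <- 1 if a label column is present, 0 otherwise
#   5       <- number of real clusters (k)
# followed by one tab-separated row per datapoint.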
def setup_mock(data, data_dict):
# Go through the precomputation specific to the dataset
# Calculate distance array
distarray = precompute.compute_dists(data, data)
distarray = precompute.normalize_dists(distarray)
argsortdists = np.argsort(distarray, kind='mergesort')
# Calculate nearest neighbour rankings
nn_rankings = precompute.nn_rankings(distarray, Datapoint.num_examples)
# Calculate MST
MOCKGenotype.mst_genotype = precompute.create_mst(distarray)
# Calculate DI values
MOCKGenotype.degree_int = precompute.degree_interest(
MOCKGenotype.mst_genotype, nn_rankings, distarray
)
# Sort to get the indices of most to least interesting links
MOCKGenotype.interest_links_indices()
return argsortdists, nn_rankings
def prepare_mock_args(data, data_dict, argsortdists, nn_rankings, config):
    # Bundle all of the arguments together in a dict to pass to the function
    # Keys follow the parameter order of runMOCK() so the dict can easily be
    # turned into a partial function for multiprocessing
mock_args = {
"data_dict": data_dict,
"hv_ref": None,
"argsortdists": argsortdists,
"nn_rankings": nn_rankings,
"L": None,
"num_indivs": config["num_indivs"],
"num_gens": config["num_gens"],
"mut_meth_params": None
}
return mock_args
def create_seeds(config):
    # Specify seed folder
    seed_folder = Path.cwd() / "seeds"
    # Load seeds if a seed file is given
    if config["seed_file"] is not None:
        seed_list = utils.load_json(seed_folder / config["seed_file"])["seed_list"]
    # Otherwise generate some seeds
    else:
        # Regenerate until there are no seed collisions
        while True:
            seed_list = [
                random.randint(0, 1000000) for _ in range(config["num_runs"])
            ]
            if len(seed_list) == len(set(seed_list)):
                break
        # Save the seed_list in the seeds folder
        seed_fname = seed_folder / f"seed_list_{config['exp_name']}.json"
        # Wrap the list in a dict to be explicit in the JSON
        seeds = {"seed_list": seed_list}
        with open(seed_fname, "w") as out_file:
            json.dump(seeds, out_file, indent=4)
        # Record the seed file in the config so the saved config is complete
        config["seed_file"] = f"seed_list_{config['exp_name']}.json"
    # Ensure we have enough seeds for the requested number of runs
    if len(seed_list) < config["num_runs"]:
        raise ValueError("Not enough seeds for the number of runs")
    return seed_list, config
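# The saved seed file is a small JSON document, e.g. (illustrative values):
#   {
#       "seed_list": [42, 1337, 271828]
#   }
# Passing its name via config["seed_file"] reproduces a previous experiment.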
def calc_hv_ref(mock_args):
    """Calculate a hypervolume reference/nadir point for use on all runs of a dataset.

    Arguments:
        mock_args {dict} -- Arguments for MOCK

    Returns:
        list -- The reference/nadir point
    """
# Reduce the MST genotype
mst_reduced_genotype = [MOCKGenotype.mst_genotype[i] for i in MOCKGenotype.reduced_genotype_indices]
# Calculate chains
chains, superclusts = objectives.cluster_chains(
mst_reduced_genotype, mock_args['data_dict'], PartialClust.comp_dict, MOCKGenotype.reduced_cluster_nums
)
# Calculate the maximum possible intracluster variance
PartialClust.max_var = objectives.objVAR(
chains, PartialClust.comp_dict, PartialClust.base_members,
PartialClust.base_centres, superclusts
)
    # Divide by N here, as that is done in eval_mock(), not objVAR()
    PartialClust.max_var /= Datapoint.num_examples
    # Set the reference point just outside the max values to ensure no overlap
    hv_ref = [
        PartialClust.max_var * 1.01,
        PartialClust.max_cnn * 1.01
    ]
return hv_ref
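# For example, a normal run derives the reference point from the data as
# [max_var * 1.01, max_cnn * 1.01], whereas the validation path below pins
# it to the fixed point [3.0, 1469.0] for reproducible comparisons.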
def run_mock(**cl_args):
    # Pick the config; validation runs use a fixed config and don't save results
    if cl_args['validate']:
        config_path = Path.cwd() / "configs" / "validate.json"
        save_results = False
    else:
        config_path = Path.cwd() / "configs" / cl_args["config"]
        save_results = True
    config = utils.load_json(config_path)
    # Load the data file paths
    data_file_paths, experiment_folder = load_data(
        config["exp_name"],
        config["data_folder"],
        cl_args['validate'],
        config["data_subset"]
    )
# Check the config and amend if needed
config = utils.check_config(config)
    # Create the seed list
    seed_list, config = create_seeds(config)
    # Truncate the seed list to the number of runs, so that run numbers and
    # the order of seeds always correspond
    seed_list = seed_list[:config["num_runs"]]
    seed_list = [(i,) for i in seed_list]  # starmap expects argument tuples
    # Count the delta values (only needed here for the printout)
    num_sr = 0 if config["delta_sr_vals"] is None else len(config["delta_sr_vals"])
    num_raw = 0 if config["delta_raw_vals"] is None else len(config["delta_raw_vals"])
    num_delta = num_sr + num_raw
    # Print the run breakdown to give a rough idea of total runtime
print("---------------------------")
print("Number of MOCK Runs:")
print(f"{config['num_runs']} run(s)")
print(f"{len(config['strategies'])} strategy/-ies")
print(f"{num_delta} delta value(s)")
print(f"{len(data_file_paths)} dataset(s)")
print(f"= {config['num_runs']*len(config['strategies'])*num_delta*len(data_file_paths)} run(s)")
print("---------------------------")
# Save the config
utils.save_config(config, experiment_folder, cl_args["validate"])
# Loop through the data to test
for file_path in data_file_paths:
print(f"Beginning precomputation for {file_path.name}...")
# Prepare the data
data, data_dict = prepare_data(file_path)
# Do the precomputation for MOCK
argsortdists, nn_rankings = setup_mock(data, data_dict)
# Wrap the arguments up for the main MOCK function
mock_args = prepare_mock_args(data, data_dict, argsortdists, nn_rankings, config)
print("Precomputation complete!")
if save_results or cl_args["validate"]:
# Create a dataframe for the results
results_df = pd.DataFrame()
        # A longer genotype means a higher possible maximum for the CNN objective
        # By running the largest delta value first, the HV reference point is
        # fixed once and remains appropriate for all subsequent runs
        # Combine the raw delta values with those converted from sr values
        delta_vals = []
        if config["delta_raw_vals"] is not None:
            delta_vals += config["delta_raw_vals"]
        if config["delta_sr_vals"] is not None:
            delta_vals += [
                MOCKGenotype.calc_delta(sr_val) for sr_val in config["delta_sr_vals"]
            ]
        delta_vals = sorted(delta_vals, reverse=True)
        # Loop through each combination of delta value and L (neighbourhood size)
        for delta_val, L in product(delta_vals, config["L"]):
            # Set the delta value on the genotype class
            MOCKGenotype.delta_val = delta_val
            print(f"Delta: {MOCKGenotype.delta_val}")
            # Set the L value in the mock_args for this configuration
            mock_args["L"] = L
# Setup some of the variables for the genotype
MOCKGenotype.setup_genotype_vars()
# Setup the components class
PartialClust.partial_clusts(
data, mock_args["data_dict"], mock_args["argsortdists"], mock_args["L"]
)
# Identify the component IDs of the link origins
MOCKGenotype.calc_reduced_clusts(mock_args["data_dict"])
# Set the nadir point if first run
if mock_args['hv_ref'] is None:
# To ensure compatible results for validation
if cl_args['validate']:
mock_args['hv_ref'] = [3.0, 1469.0]
else:
mock_args['hv_ref'] = calc_hv_ref(mock_args)
print(f"HV ref point: {mock_args['hv_ref']}")
            # The strategy is not used here, but is kept for result consistency
            # with the adaptive version; product() avoids deeper nesting
            for strategy, L_comp in product(
                config["strategies"], config["L_comp"]
            ):
# Add mutation method-specific arguments
mock_args = delta_mock.get_mutation_params(
config["mut_method"], mock_args, L_comp
)
# Create the partial function to give to multiprocessing
mock_func = partial(delta_mock.runMOCK, **mock_args)
print(f"{strategy}-{config['mut_method']} starting...")
                # Measure the time taken for the runs
                # Send the function to a pool of worker processes, each run
                # with a different seed
                start_time = time.time()
                with multiprocessing.Pool() as pool:
                    results = pool.starmap(mock_func, seed_list)
                mp_time = time.time() - start_time
                print(f"{len(seed_list)} runs complete...took {mp_time:.3f} secs")
                for run_num, run_result in enumerate(results):
                    # Unpack the result tuple (run_result[2] is unused here)
                    pop = run_result[0]  # Final population
                    hvs = run_result[1]  # Hypervolume values
                    final_interest_inds = run_result[3]  # Final interesting links
                    final_gen_len = run_result[4]  # Final genotype length
                    time_taken = run_result[5]  # Running time for this run
# Calculate the number of clusters and ARIs
num_clusts, aris = evaluation.final_pop_metrics(
pop, MOCKGenotype.mst_genotype,
final_interest_inds, final_gen_len
)
# Extract the fitness values
var_vals = [indiv.fitness.values[0] for indiv in pop]
cnn_vals = [indiv.fitness.values[1] for indiv in pop]
# Add strategy here for adaptive version
results_dict = {
"dataset": [Datapoint.data_name]*config["num_indivs"],
"run": [run_num+1]*config["num_indivs"],
"indiv": list(range(config["num_indivs"])),
"L": [L]*config["num_indivs"],
"delta": [delta_val]*config["num_indivs"],
"VAR": var_vals,
"CNN": cnn_vals,
"HV": hvs,
"ARI": aris,
"clusters": num_clusts,
"time": [time_taken]*config["num_indivs"]
}
                    # Append the new results (DataFrame.append was removed in
                    # pandas 2.0, so use pd.concat)
                    results_df = pd.concat(
                        [results_df, pd.DataFrame.from_dict(results_dict)],
                        ignore_index=True
                    )
print(f"{file_path.name} complete!")
# Validate the results
if cl_args['validate']:
tests.validate_results(
results_df
)
print("Test passed!")
# Save results
if save_results:
results_df.to_csv(
experiment_folder / f"{config['exp_name']}_results.csv",
index=False
)
if __name__ == '__main__':
parser = utils.build_parser()
cl_args = parser.parse_args()
cl_args = vars(cl_args)
utils.check_cl_args(cl_args)
run_mock(**cl_args)
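# Example invocations (assuming utils.build_parser() defines --config and
# --validate flags; the exact flag names live in utils.py):
#   python run_mock.py --config mock_config.json
#   python run_mock.py --validate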
######## TO DO ########
# Review how results are saved and named
# evaluation.py needs a substantial refactor
# Clean up and pin the required environment/dependencies at some point