From 37dc3a797275d084ba89fdb8f04b8eb61a755462 Mon Sep 17 00:00:00 2001
From: matuseviciute
Date: Mon, 12 Mar 2018 11:22:43 +0100
Subject: [PATCH 1/2] Update hyopt.py

---
 kopt/hyopt.py | 87 +++++++++++++++++++++------------------------
 1 file changed, 36 insertions(+), 51 deletions(-)

diff --git a/kopt/hyopt.py b/kopt/hyopt.py
index da9cc6f..2c587c2 100644
--- a/kopt/hyopt.py
+++ b/kopt/hyopt.py
@@ -10,7 +10,6 @@
 import kopt.eval_metrics as ce
 from kopt.utils import write_json, merge_dicts, _to_string
 from kopt.model_data import (subset, split_train_test_idx, split_KFold_idx)
-from kopt.config import db_host, db_port, save_dir
 from datetime import datetime, timedelta
 from uuid import uuid4
 from hyperopt import STATUS_OK
@@ -21,17 +20,21 @@
 import glob
 import pprint
 import logging
+import matplotlib.pyplot as plt
 
 logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s')
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 
+# TODO - have a system-wide config for this
+DEFAULT_IP = "ouga03"
+DEFAULT_SAVE_DIR = "/s/project/deepcis/hyperopt/"
 
-def test_fn(fn, hyper_params, n_train=1000, save_model='best', tmp_dir="/tmp/kopt_test/", custom_objects=None):
+
+def test_fn(fn, hyper_params, n_train=1000, save_model=None, tmp_dir="/tmp/hopt_test/", custom_objects=None):
     """Test the correctness of the compiled objective function (CompileFN). I will
     also test model saving/loading from disk.
-
     # Arguments
         fn: CompileFN instance
         hyper_params: pyll graph of hyper-parameters - as later provided to `hyperopt.fmin`
@@ -48,7 +51,7 @@ def new_data_fn(*args, **kwargs):
             data = data_fn(*args, **kwargs)
             train = data[0]
             train = subset(train, idx=np.arange(min(n_train, train[1].shape[0])))
-            return train,
+            return train, data[1], data[2]
         return new_data_fn
     start_time = datetime.now()
     fn = deepcopy(fn)
@@ -78,9 +81,8 @@ def new_data_fn(*args, **kwargs):
     load_model(model_path, custom_objects=custom_objects)
 
 
-class KMongoTrials(MongoTrials):
+class CMongoTrials(MongoTrials):
     """`hyperopt.MonoTrials` extended with the following methods:
-
     - get_trial(tid) - Retrieve trial by tid (Trial ID).
     - get_param(tid) - Retrieve used hyper-parameters for a trial.
     - best_trial_tid(rank=0) - Return the trial with lowest loss.
@@ -95,7 +97,6 @@ class KMongoTrials(MongoTrials):
     - get_ok_results - Return a list of trial results with an "ok" status
     - load_model(tid) - Load a Keras model of a tid.
     - as_df - Returns a tidy `pandas.DataFrame` of the trials database.
-
     # Arguments
         db_name: str, MongoTrials database name
         exp_name: strm, MongoTrials experiment name
@@ -103,17 +104,16 @@
         port: int, MongoDB port.
         kill_timeout: int, Maximum runtime of a job (in seconds) before it gets killed. None for infinite.
         **kwargs: Additional keyword arguments passed to the `hyperopt.MongoTrials` constructor.
- """ def __init__(self, db_name, exp_name, - ip=db_host(), port=db_port(), kill_timeout=None, **kwargs): + ip=DEFAULT_IP, port=1234, kill_timeout=None, **kwargs): self.kill_timeout = kill_timeout if self.kill_timeout is not None and self.kill_timeout < 60: logger.warning("kill_timeout < 60 -> Very short time for " + "each job to complete before it gets killed!") - super(KMongoTrials, self).__init__( + super(CMongoTrials, self).__init__( 'mongo://{ip}:{p}/{n}/jobs'.format(ip=ip, p=port, n=db_name), exp_key=exp_name, **kwargs) def get_trial(self, tid): @@ -128,7 +128,6 @@ def get_param(self, tid): def best_trial_tid(self, rank=0): """Get tid of the best trial - rank=0 means the best model rank=1 means second best ... @@ -174,11 +173,10 @@ def count_by_state_unsynced(self, arg): """ if self.kill_timeout is not None: self.delete_running(self.kill_timeout) - return super(KMongoTrials, self).count_by_state_unsynced(arg) + return super(CMongoTrials, self).count_by_state_unsynced(arg) def delete_running(self, timeout_last_refresh=0, dry_run=False): """Delete jobs stalled in the running state for too long - timeout_last_refresh, int: number of seconds """ running_all = self.handle.jobs_running() @@ -246,8 +244,6 @@ def plot_history(self, tid, scores=["loss", "f1", "accuracy"], figsize=(15, 3)): """Plot the loss curves""" history = self.train_history(tid) - import matplotlib.pyplot as plt - fig = plt.figure(figsize=figsize) for i, score in enumerate(scores): plt.subplot(1, len(scores), i + 1) @@ -260,11 +256,9 @@ def plot_history(self, tid, scores=["loss", "f1", "accuracy"], plt.legend(loc='best') return fig - def load_model(self, tid, custom_objects=None): + def load_model(self, tid): """Load saved keras model of the trial. - If tid = None, get the best model - Not applicable for trials ran in cross validion (i.e. 
         for `CompileFN.cv_n_folds is None`
         """
@@ -272,7 +266,7 @@ def load_model(self, tid, custom_objects=None):
             tid = self.best_trial_tid()
 
         model_path = self.get_trial(tid)["result"]["path"]["model"]
-        return load_model(model_path, custom_objects=custom_objects)
+        return load_model(model_path)
 
     def n_ok(self):
         """Number of ok trials()
@@ -331,12 +325,10 @@ def add_n_epoch(df):
 
 
 # --------------------------------------------
-# TODO - put to a separate module
-def _train_and_eval_single(train, valid, model,
+def _train_and_eval_single(train, valid, test, model,
                            batch_size=32, epochs=300, use_weight=False,
-                           callbacks=[], eval_best=False, add_eval_metrics={}, custom_objects=None):
+                           callbacks=[], eval_best=False, add_eval_metrics={}):
     """Fit and evaluate a keras model
-
     eval_best: if True, load the checkpointed model for evaluation
     """
     def _format_keras_history(history):
@@ -358,7 +350,7 @@ def _format_keras_history(history):
               epochs=epochs,
               sample_weight=sample_weight,
               verbose=2,
-              callbacks=[history] + callbacks)
+              callbacks=[history])# + callbacks) <------------------------------ #TODO: make early stopping optional
 
     # get history
     hist = _format_keras_history(history)
@@ -366,30 +358,27 @@ def _format_keras_history(history):
     if eval_best:
         mcp = [x for x in callbacks if isinstance(x, ModelCheckpoint)]
         assert len(mcp) == 1
-        model = load_model(mcp[0].filepath, custom_objects=custom_objects)
+        model = load_model(mcp[0].filepath)
 
-    return eval_model(model, valid, add_eval_metrics), hist
+    return eval_model(model, valid, test, add_eval_metrics), hist
 
 
-def eval_model(model, test, add_eval_metrics={}):
+def eval_model(model, valid, test, add_eval_metrics={}):
     """Evaluate model's performance on the test-set.
-
     # Arguments
         model: Keras model
         test: test-dataset. Tuple of inputs `x` and target `y` - `(x, y)`.
         add_eval_metrics: Additional evaluation metrics to use. Can be a dictionary or a list of functions
                           accepting arguments: `y_true`, `y_predicted`. Alternatively, you can provide names of functions from
-                          the `kopt.eval_metrics` module.
-
+                          the `concise.eval_metrics` module.
     # Returns
         dictionary with evaluation metrics
-
    """
    # evaluate the model
    logger.info("Evaluate...")
    # - model_metrics
-    model_metrics_values = model.evaluate(test[0], test[1], verbose=0,
-                                          batch_size=test[1].shape[0])
+    model_metrics_values = model.evaluate(valid[0], valid[1], verbose=0,
+                                          batch_size=valid[1].shape[0])
     # evaluation is done in a single pass to have more precise metics
     model_metrics = dict(zip(_listify(model.metrics_names),
                              _listify(model_metrics_values)))
@@ -424,20 +413,18 @@ def get_data(data_fn, param):
 
 class CompileFN():
     """Compile an objective function that
-
     - trains the model on the training set
     - evaluates the model on the validation set
     - reports the performance metric on the validation set as the objective loss
-
     # Arguments
-        db_name: Database name of the KMongoTrials.
-        exp_name: Experiment name of the KMongoTrials.
+        db_name: Database name of the CMongoTrials.
+        exp_name: Experiment name of the CMongoTrials.
         data_fn: Tuple containing training data as the x,y pair at the first (index=0) element:
            `((train_x, test_y), ...)`. If `valid_split` and `cv_n_folds` are both `None`,
            the second (index=1) tuple is used as the validation dataset.
         add_eval_metrics: Additional list of (global) evaluation metrics. Individual elements can be
-            a string (referring to kopt.eval_metrics)
+            a string (referring to concise.eval_metrics)
             or a function taking two numpy arrays: `y_true`, `y_pred`.
            These metrics are ment to supplement those specified in `model.compile(.., metrics = .)`.
@@ -462,8 +449,6 @@ class CompileFN():
             if save_model="last", save the model after training it.
         save_results: If True, the return value is saved as .json to the `save_dir` directory.
         save_dir: Path to the save directory.
-        custom_objects: argument passed to load_model - Optional dictionary mapping names (strings) to
-            custom classes or functions to be considered during deserialization.
     """
 
     # TODO - check if we can get (db_name, exp_name) from hyperopt
@@ -483,8 +468,7 @@ def __init__(self, db_name, exp_name,
                  use_tensorboard=False,
                  save_model="best",
                  save_results=True,
-                 save_dir=save_dir(),
-                 custom_objects=None,
+                 save_dir=DEFAULT_SAVE_DIR,
                  **kwargs
                  ):
         self.data_fn = data_fn
@@ -507,7 +491,7 @@ def __init__(self, db_name, exp_name,
 
         add_arguments = set(kwargs.keys()).difference(possible_kwargs)
         if len(add_arguments) > 0:
-            raise ValueError("Unknown argument(s) {0}. **kwargs accepts only arguments: {1}. ".
+            raise ValueError("Unknown argument(s) {0}. **kwargs accepts only arguments: {0}. ".
                              format(add_arguments, possible_kwargs))
 
         self.optim_metric = optim_metric
@@ -528,8 +512,6 @@ def __init__(self, db_name, exp_name,
         self.save_dir = save_dir
         self.save_model = save_model if save_model is not None else ""
         self.save_results = save_results
-        # loading
-        self.custom_objects = custom_objects
 
         # backcompatibility
         if self.save_model is True:
@@ -575,8 +557,7 @@ def __call__(self, param):
         # setup paths for storing the data - TODO check if we can somehow get the id from hyperopt
         rid = str(uuid4())
         tm_dir = self.save_dir_exp + "/train_models/"
-        if not os.path.exists(tm_dir):
-            os.makedirs(tm_dir)
+        os.makedirs(tm_dir, exist_ok=True)
         model_path = tm_dir + "{0}.h5".format(rid) if self.save_model else ""
         results_path = tm_dir + "{0}.json".format(rid) if self.save_results else ""
@@ -596,6 +577,9 @@ def __call__(self, param):
         train = data[0]
         if self.cv_n_folds is None and self.valid_split is None:
             valid_data = data[1]
+            test = data[2]
+        else:
+            test = data[1]
         del data
         time_data_loaded = datetime.now()
@@ -622,14 +606,14 @@ def __call__(self, param):
                                                                save_best_only=True)]
             eval_metrics, history = _train_and_eval_single(train=train_data,
                                                            valid=valid_data,
+                                                           test=test,
                                                            model=model,
                                                            epochs=param["fit"]["epochs"],
                                                            batch_size=param["fit"]["batch_size"],
                                                            use_weight=param["fit"].get("use_weight", False),
                                                            callbacks=c_callbacks,
                                                            eval_best=self.save_model == "best",
-                                                           add_eval_metrics=self.add_eval_metrics,
-                                                           custom_objects=self.custom_objects)
+                                                           add_eval_metrics=self.add_eval_metrics)
             if self.save_model == "last":
                 model.save(model_path)
         else:
@@ -651,14 +635,14 @@ def __call__(self, param):
                                                                    save_best_only=True)]
                 eval_m, history_elem = _train_and_eval_single(train=subset(train, train_idx),
                                                               valid=subset(train, valid_idx),
+                                                              test=test,
                                                               model=model,
                                                               epochs=param["fit"]["epochs"],
                                                               batch_size=param["fit"]["batch_size"],
                                                               use_weight=param["fit"].get("use_weight", False),
                                                               callbacks=c_callbacks,
                                                               eval_best=self.save_model == "best",
-                                                              add_eval_metrics=self.add_eval_metrics,
-                                                              custom_objects=self.custom_objects)
+                                                              add_eval_metrics=self.add_eval_metrics)
                 print("\n")
                 eval_metrics_list.append(eval_m)
                 history.append(history_elem)
@@ -784,3 +768,4 @@ def to_str(v):
             return str(v)
 
     return ";".join([k + "=" + to_str(v) for k, v in d.items()])
+

From 9b9e6926077d559f5f546025ec5b8120c36ba14e Mon Sep 17 00:00:00 2001
From: matuseviciute
Date: Mon, 12 Mar 2018 12:37:39 +0100
Subject: [PATCH 2/2] Update hyopt.py

---
 kopt/hyopt.py | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/kopt/hyopt.py b/kopt/hyopt.py
index 2c587c2..afbcf1c 100644
--- a/kopt/hyopt.py
+++ b/kopt/hyopt.py
@@ -27,12 +27,9 @@
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 
-# TODO - have a system-wide config for this
-DEFAULT_IP = "ouga03"
-DEFAULT_SAVE_DIR = "/s/project/deepcis/hyperopt/"
 
-def test_fn(fn, hyper_params, n_train=1000, save_model=None, tmp_dir="/tmp/hopt_test/", custom_objects=None):
+def test_fn(fn, hyper_params, n_train=1000, save_model=None, tmp_dir="/tmp/kopt_test/", custom_objects=None):
     """Test the correctness of the compiled objective function (CompileFN). I will
     also test model saving/loading from disk.
     # Arguments
@@ -81,7 +78,7 @@ def new_data_fn(*args, **kwargs):
     load_model(model_path, custom_objects=custom_objects)
 
 
-class CMongoTrials(MongoTrials):
+class KMongoTrials(MongoTrials):
     """`hyperopt.MonoTrials` extended with the following methods:
     - get_trial(tid) - Retrieve trial by tid (Trial ID).
     - get_param(tid) - Retrieve used hyper-parameters for a trial.
@@ -107,13 +104,13 @@ class CMongoTrials(MongoTrials):
     """
 
     def __init__(self, db_name, exp_name,
-                 ip=DEFAULT_IP, port=1234, kill_timeout=None, **kwargs):
+                 ip=db_host(), port=db_port(), kill_timeout=None, **kwargs):
         self.kill_timeout = kill_timeout
         if self.kill_timeout is not None and self.kill_timeout < 60:
             logger.warning("kill_timeout < 60 -> Very short time for " +
                            "each job to complete before it gets killed!")
-        super(CMongoTrials, self).__init__(
+        super(KMongoTrials, self).__init__(
             'mongo://{ip}:{p}/{n}/jobs'.format(ip=ip, p=port, n=db_name), exp_key=exp_name, **kwargs)
 
     def get_trial(self, tid):
@@ -173,7 +170,7 @@ def count_by_state_unsynced(self, arg):
         """
         if self.kill_timeout is not None:
             self.delete_running(self.kill_timeout)
-        return super(CMongoTrials, self).count_by_state_unsynced(arg)
+        return super(KMongoTrials, self).count_by_state_unsynced(arg)
 
     def delete_running(self, timeout_last_refresh=0, dry_run=False):
         """Delete jobs stalled in the running state for too long
@@ -256,7 +253,7 @@ def plot_history(self, tid, scores=["loss", "f1", "accuracy"],
             plt.legend(loc='best')
         return fig
 
-    def load_model(self, tid):
+    def load_model(self, tid, custom_objects=None):
         """Load saved keras model of the trial.
         If tid = None, get the best model
         Not applicable for trials ran in cross validion (i.e. not applicable
@@ -266,7 +263,7 @@ def load_model(self, tid):
             tid = self.best_trial_tid()
 
         model_path = self.get_trial(tid)["result"]["path"]["model"]
-        return load_model(model_path)
+        return load_model(model_path, custom_objects=custom_objects)
 
     def n_ok(self):
         """Number of ok trials()
@@ -327,7 +324,7 @@ def add_n_epoch(df):
 # --------------------------------------------
 def _train_and_eval_single(train, valid, test, model,
                            batch_size=32, epochs=300, use_weight=False,
-                           callbacks=[], eval_best=False, add_eval_metrics={}):
+                           callbacks=[], eval_best=False, add_eval_metrics={}, custom_objects=None):
     """Fit and evaluate a keras model
     eval_best: if True, load the checkpointed model for evaluation
     """
@@ -358,7 +355,7 @@ def _format_keras_history(history):
     if eval_best:
         mcp = [x for x in callbacks if isinstance(x, ModelCheckpoint)]
         assert len(mcp) == 1
-        model = load_model(mcp[0].filepath)
+        model = load_model(mcp[0].filepath, custom_objects=custom_objects)
 
     return eval_model(model, valid, test, add_eval_metrics), hist
 
@@ -370,7 +367,7 @@ def eval_model(model, valid, test, add_eval_metrics={}):
         test: test-dataset. Tuple of inputs `x` and target `y` - `(x, y)`.
         add_eval_metrics: Additional evaluation metrics to use. Can be a dictionary or a list of functions
                           accepting arguments: `y_true`, `y_predicted`. Alternatively, you can provide names of functions from
-                          the `concise.eval_metrics` module.
+                          the `kopt.eval_metrics` module.
     # Returns
         dictionary with evaluation metrics
     """
@@ -417,14 +414,14 @@ class CompileFN():
     - evaluates the model on the validation set
     - reports the performance metric on the validation set as the objective loss
     # Arguments
-        db_name: Database name of the CMongoTrials.
-        exp_name: Experiment name of the CMongoTrials.
+        db_name: Database name of the KMongoTrials.
+        exp_name: Experiment name of the KMongoTrials.
         data_fn: Tuple containing training data as the x,y pair at the first (index=0) element:
            `((train_x, test_y), ...)`. If `valid_split` and `cv_n_folds` are both `None`,
            the second (index=1) tuple is used as the validation dataset.
         add_eval_metrics: Additional list of (global) evaluation metrics. Individual elements can be
-            a string (referring to concise.eval_metrics)
+            a string (referring to kopt.eval_metrics)
            or a function taking two numpy arrays: `y_true`, `y_pred`.
            These metrics are ment to supplement those specified in `model.compile(.., metrics = .)`.
@@ -468,7 +465,8 @@ def __init__(self, db_name, exp_name,
                  use_tensorboard=False,
                  save_model="best",
                  save_results=True,
-                 save_dir=DEFAULT_SAVE_DIR,
+                 save_dir=save_dir(),
+                 custom_objects=None,
                  **kwargs
                  ):
         self.data_fn = data_fn
@@ -491,7 +489,7 @@ def __init__(self, db_name, exp_name,
 
         add_arguments = set(kwargs.keys()).difference(possible_kwargs)
         if len(add_arguments) > 0:
-            raise ValueError("Unknown argument(s) {0}. **kwargs accepts only arguments: {0}. ".
+            raise ValueError("Unknown argument(s) {0}. **kwargs accepts only arguments: {1}. ".
                              format(add_arguments, possible_kwargs))
 
         self.optim_metric = optim_metric
@@ -512,6 +510,8 @@ def __init__(self, db_name, exp_name,
         self.save_dir = save_dir
         self.save_model = save_model if save_model is not None else ""
         self.save_results = save_results
+        # loading
+        self.custom_objects = custom_objects
 
         # backcompatibility
         if self.save_model is True:
@@ -557,7 +557,8 @@ def __call__(self, param):
         # setup paths for storing the data - TODO check if we can somehow get the id from hyperopt
         rid = str(uuid4())
         tm_dir = self.save_dir_exp + "/train_models/"
-        os.makedirs(tm_dir, exist_ok=True)
+        if not os.path.exists(tm_dir):
+            os.makedirs(tm_dir)
         model_path = tm_dir + "{0}.h5".format(rid) if self.save_model else ""
         results_path = tm_dir + "{0}.json".format(rid) if self.save_results else ""
@@ -613,7 +614,8 @@ def __call__(self, param):
                                                            use_weight=param["fit"].get("use_weight", False),
                                                            callbacks=c_callbacks,
                                                            eval_best=self.save_model == "best",
-                                                           add_eval_metrics=self.add_eval_metrics)
+                                                           add_eval_metrics=self.add_eval_metrics,
+                                                           custom_objects=self.custom_objects)
             if self.save_model == "last":
                 model.save(model_path)
         else:
@@ -642,7 +644,8 @@ def __call__(self, param):
                                                               use_weight=param["fit"].get("use_weight", False),
                                                               callbacks=c_callbacks,
                                                               eval_best=self.save_model == "best",
-                                                              add_eval_metrics=self.add_eval_metrics)
+                                                              add_eval_metrics=self.add_eval_metrics,
+                                                              custom_objects=self.custom_objects)
                 print("\n")
                 eval_metrics_list.append(eval_m)
                 history.append(history_elem)
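
A minimal usage sketch of the API touched by these two patches, not part of the diff itself: it assumes hypothetical helper modules my_data / my_model and a hypothetical custom Keras layer MyLayer, while CompileFN, test_fn, KMongoTrials, the {"model": ..., "fit": ...} parameter layout and the custom_objects plumbing reinstated in PATCH 2/2 all come from kopt/hyopt.py above; the model_fn argument name is assumed from kopt's documented usage.

from hyperopt import fmin, tpe, hp
from kopt.hyopt import CompileFN, KMongoTrials, test_fn
import my_data, my_model              # hypothetical user modules providing my_data.data() and my_model.build_model()
from my_model import MyLayer          # hypothetical custom Keras layer saved inside the checkpoints

# Objective: trains on the training set and reports the chosen metric on the validation set.
objective = CompileFN("mydb", "exp1",
                      data_fn=my_data.data,
                      model_fn=my_model.build_model,        # argument name assumed, see note above
                      add_eval_metrics=["auprc"],
                      optim_metric="auprc",
                      custom_objects={"MyLayer": MyLayer})  # forwarded to load_model() after PATCH 2/2

hyper_params = {"model": {"lr": hp.loguniform("lr", -6, -2)},
                "fit": {"epochs": 20, "batch_size": 32}}

test_fn(objective, hyper_params)      # quick sanity check of the objective on a small data subset

trials = KMongoTrials("mydb", "exp1", kill_timeout=30 * 60)
best = fmin(objective, hyper_params, trials=trials, algo=tpe.suggest, max_evals=10)
model = trials.load_model(None, custom_objects={"MyLayer": MyLayer})  # tid=None loads the best trial

The custom_objects mapping is supplied twice on purpose: once when compiling the objective, so checkpoints containing MyLayer can be re-read during training when eval_best is used, and once when reloading the winning model from the trials database.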