From 37dc3a797275d084ba89fdb8f04b8eb61a755462 Mon Sep 17 00:00:00 2001
From: matuseviciute
Date: Mon, 12 Mar 2018 11:22:43 +0100
Subject: [PATCH 1/2] Update hyopt.py

---
 kopt/hyopt.py | 87 +++++++++++++++++++++------------------------
 1 file changed, 36 insertions(+), 51 deletions(-)

diff --git a/kopt/hyopt.py b/kopt/hyopt.py
index da9cc6f..2c587c2 100644
--- a/kopt/hyopt.py
+++ b/kopt/hyopt.py
@@ -10,7 +10,6 @@
 import kopt.eval_metrics as ce
 from kopt.utils import write_json, merge_dicts, _to_string
 from kopt.model_data import (subset, split_train_test_idx, split_KFold_idx)
-from kopt.config import db_host, db_port, save_dir
 from datetime import datetime, timedelta
 from uuid import uuid4
 from hyperopt import STATUS_OK
@@ -21,17 +20,21 @@
 import glob
 import pprint
 import logging
+import matplotlib.pyplot as plt
 
 logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s')
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 
+# TODO - have a system-wide config for this
+DEFAULT_IP = "ouga03"
+DEFAULT_SAVE_DIR = "/s/project/deepcis/hyperopt/"
 
-def test_fn(fn, hyper_params, n_train=1000, save_model='best', tmp_dir="/tmp/kopt_test/", custom_objects=None):
+
+def test_fn(fn, hyper_params, n_train=1000, save_model=None, tmp_dir="/tmp/hopt_test/", custom_objects=None):
     """Test the correctness of the compiled objective function (CompileFN). I will
     also test model saving/loading from disk.
-
     # Arguments
         fn: CompileFN instance
         hyper_params: pyll graph of hyper-parameters - as later provided to `hyperopt.fmin`
@@ -48,7 +51,7 @@ def new_data_fn(*args, **kwargs):
             data = data_fn(*args, **kwargs)
             train = data[0]
             train = subset(train, idx=np.arange(min(n_train, train[1].shape[0])))
-            return train,
+            return train, data[1], data[2]
         return new_data_fn
     start_time = datetime.now()
     fn = deepcopy(fn)
@@ -78,9 +81,8 @@ def new_data_fn(*args, **kwargs):
     load_model(model_path, custom_objects=custom_objects)
 
 
-class KMongoTrials(MongoTrials):
+class CMongoTrials(MongoTrials):
     """`hyperopt.MonoTrials` extended with the following methods:
-
     - get_trial(tid) - Retrieve trial by tid (Trial ID).
     - get_param(tid) - Retrieve used hyper-parameters for a trial.
     - best_trial_tid(rank=0) - Return the trial with lowest loss.
@@ -95,7 +97,6 @@ class KMongoTrials(MongoTrials):
     - get_ok_results - Return a list of trial results with an "ok" status
     - load_model(tid) - Load a Keras model of a tid.
     - as_df - Returns a tidy `pandas.DataFrame` of the trials database.
-
     # Arguments
         db_name: str, MongoTrials database name
         exp_name: strm, MongoTrials experiment name
@@ -103,17 +104,16 @@
         port: int, MongoDB port.
         kill_timeout: int, Maximum runtime of a job (in seconds) before it gets killed. None for infinite.
         **kwargs: Additional keyword arguments passed to the `hyperopt.MongoTrials` constructor.
- """ def __init__(self, db_name, exp_name, - ip=db_host(), port=db_port(), kill_timeout=None, **kwargs): + ip=DEFAULT_IP, port=1234, kill_timeout=None, **kwargs): self.kill_timeout = kill_timeout if self.kill_timeout is not None and self.kill_timeout < 60: logger.warning("kill_timeout < 60 -> Very short time for " + "each job to complete before it gets killed!") - super(KMongoTrials, self).__init__( + super(CMongoTrials, self).__init__( 'mongo://{ip}:{p}/{n}/jobs'.format(ip=ip, p=port, n=db_name), exp_key=exp_name, **kwargs) def get_trial(self, tid): @@ -128,7 +128,6 @@ def get_param(self, tid): def best_trial_tid(self, rank=0): """Get tid of the best trial - rank=0 means the best model rank=1 means second best ... @@ -174,11 +173,10 @@ def count_by_state_unsynced(self, arg): """ if self.kill_timeout is not None: self.delete_running(self.kill_timeout) - return super(KMongoTrials, self).count_by_state_unsynced(arg) + return super(CMongoTrials, self).count_by_state_unsynced(arg) def delete_running(self, timeout_last_refresh=0, dry_run=False): """Delete jobs stalled in the running state for too long - timeout_last_refresh, int: number of seconds """ running_all = self.handle.jobs_running() @@ -246,8 +244,6 @@ def plot_history(self, tid, scores=["loss", "f1", "accuracy"], figsize=(15, 3)): """Plot the loss curves""" history = self.train_history(tid) - import matplotlib.pyplot as plt - fig = plt.figure(figsize=figsize) for i, score in enumerate(scores): plt.subplot(1, len(scores), i + 1) @@ -260,11 +256,9 @@ def plot_history(self, tid, scores=["loss", "f1", "accuracy"], plt.legend(loc='best') return fig - def load_model(self, tid, custom_objects=None): + def load_model(self, tid): """Load saved keras model of the trial. - If tid = None, get the best model - Not applicable for trials ran in cross validion (i.e. 
         for `CompileFN.cv_n_folds is None`
         """
@@ -272,7 +266,7 @@ def load_model(self, tid, custom_objects=None):
             tid = self.best_trial_tid()
 
         model_path = self.get_trial(tid)["result"]["path"]["model"]
-        return load_model(model_path, custom_objects=custom_objects)
+        return load_model(model_path)
 
     def n_ok(self):
         """Number of ok trials()
@@ -331,12 +325,10 @@ def add_n_epoch(df):
 
 
 # --------------------------------------------
-# TODO - put to a separate module
-def _train_and_eval_single(train, valid, model,
+def _train_and_eval_single(train, valid, test, model,
                            batch_size=32, epochs=300, use_weight=False,
-                           callbacks=[], eval_best=False, add_eval_metrics={}, custom_objects=None):
+                           callbacks=[], eval_best=False, add_eval_metrics={}):
     """Fit and evaluate a keras model
-
     eval_best: if True, load the checkpointed model for evaluation
     """
     def _format_keras_history(history):
@@ -358,7 +350,7 @@ def _format_keras_history(history):
               epochs=epochs,
               sample_weight=sample_weight,
               verbose=2,
-              callbacks=[history] + callbacks)
+              callbacks=[history])# + callbacks) <------------------------------ #TODO: make early stopping optional
 
     # get history
     hist = _format_keras_history(history)
@@ -366,30 +358,27 @@ def _format_keras_history(history):
     if eval_best:
         mcp = [x for x in callbacks if isinstance(x, ModelCheckpoint)]
         assert len(mcp) == 1
-        model = load_model(mcp[0].filepath, custom_objects=custom_objects)
+        model = load_model(mcp[0].filepath)
 
-    return eval_model(model, valid, add_eval_metrics), hist
+    return eval_model(model, valid, test, add_eval_metrics), hist
 
 
-def eval_model(model, test, add_eval_metrics={}):
+def eval_model(model, valid, test, add_eval_metrics={}):
     """Evaluate model's performance on the test-set.
-
     # Arguments
         model: Keras model
         test: test-dataset. Tuple of inputs `x` and target `y` - `(x, y)`.
         add_eval_metrics: Additional evaluation metrics to use. Can be a dictionary or a list of functions
                           accepting arguments: `y_true`, `y_predicted`. Alternatively, you can provide names of functions from
-                          the `kopt.eval_metrics` module.
-
+                          the `concise.eval_metrics` module.
     # Returns
         dictionary with evaluation metrics
-
    """
    # evaluate the model
    logger.info("Evaluate...")
    # - model_metrics
-    model_metrics_values = model.evaluate(test[0], test[1], verbose=0,
-                                          batch_size=test[1].shape[0])
+    model_metrics_values = model.evaluate(valid[0], valid[1], verbose=0,
+                                          batch_size=valid[1].shape[0])
     # evaluation is done in a single pass to have more precise metics
     model_metrics = dict(zip(_listify(model.metrics_names),
                              _listify(model_metrics_values)))
@@ -424,20 +413,18 @@ def get_data(data_fn, param):
 
 class CompileFN():
     """Compile an objective function that
-
     - trains the model on the training set
     - evaluates the model on the validation set
     - reports the performance metric on the validation set as the objective loss
-
     # Arguments
-        db_name: Database name of the KMongoTrials.
-        exp_name: Experiment name of the KMongoTrials.
+        db_name: Database name of the CMongoTrials.
+        exp_name: Experiment name of the CMongoTrials.
         data_fn: Tuple containing training data as the x,y pair at the first (index=0) element:
            `((train_x, test_y), ...)`. If `valid_split` and `cv_n_folds` are both `None`,
            the second (index=1) tuple is used as the validation dataset.
         add_eval_metrics: Additional list of (global) evaluation metrics. Individual elements can be
-            a string (referring to kopt.eval_metrics)
+            a string (referring to concise.eval_metrics)
             or a function taking two numpy arrays: `y_true`, `y_pred`.
            These metrics are ment to supplement those specified in `model.compile(.., metrics = .)`.
@@ -462,8 +449,6 @@ class CompileFN():
             if save_model="last", save the model after training it.
         save_results: If True, the return value is saved as .json to the `save_dir` directory.
         save_dir: Path to the save directory.
-        custom_objects: argument passed to load_model - Optional dictionary mapping names (strings) to
-            custom classes or functions to be considered during deserialization.
     """
 
     # TODO - check if we can get (db_name, exp_name) from hyperopt
@@ -483,8 +468,7 @@ def __init__(self, db_name, exp_name,
                  use_tensorboard=False,
                  save_model="best",
                  save_results=True,
-                 save_dir=save_dir(),
-                 custom_objects=None,
+                 save_dir=DEFAULT_SAVE_DIR,
                  **kwargs
                  ):
         self.data_fn = data_fn
@@ -507,7 +491,7 @@ def __init__(self, db_name, exp_name,
 
         add_arguments = set(kwargs.keys()).difference(possible_kwargs)
         if len(add_arguments) > 0:
-            raise ValueError("Unknown argument(s) {0}. **kwargs accepts only arguments: {1}. ".
+            raise ValueError("Unknown argument(s) {0}. **kwargs accepts only arguments: {0}. ".
                              format(add_arguments, possible_kwargs))
 
         self.optim_metric = optim_metric
@@ -528,8 +512,6 @@ def __init__(self, db_name, exp_name,
         self.save_dir = save_dir
         self.save_model = save_model if save_model is not None else ""
         self.save_results = save_results
-        # loading
-        self.custom_objects = custom_objects
 
         # backcompatibility
         if self.save_model is True:
@@ -575,8 +557,7 @@ def __call__(self, param):
         # setup paths for storing the data - TODO check if we can somehow get the id from hyperopt
         rid = str(uuid4())
         tm_dir = self.save_dir_exp + "/train_models/"
-        if not os.path.exists(tm_dir):
-            os.makedirs(tm_dir)
+        os.makedirs(tm_dir, exist_ok=True)
         model_path = tm_dir + "{0}.h5".format(rid) if self.save_model else ""
         results_path = tm_dir + "{0}.json".format(rid) if self.save_results else ""
@@ -596,6 +577,9 @@ def __call__(self, param):
         train = data[0]
         if self.cv_n_folds is None and self.valid_split is None:
             valid_data = data[1]
+            test = data[2]
+        else:
+            test = data[1]
         del data
         time_data_loaded = datetime.now()
@@ -622,14 +606,14 @@ def __call__(self, param):
                                                                save_best_only=True)]
             eval_metrics, history = _train_and_eval_single(train=train_data,
                                                            valid=valid_data,
+                                                           test=test,
                                                            model=model,
                                                            epochs=param["fit"]["epochs"],
                                                            batch_size=param["fit"]["batch_size"],
                                                            use_weight=param["fit"].get("use_weight", False),
                                                            callbacks=c_callbacks,
                                                            eval_best=self.save_model == "best",
-                                                           add_eval_metrics=self.add_eval_metrics,
-                                                           custom_objects=self.custom_objects)
+                                                           add_eval_metrics=self.add_eval_metrics)
             if self.save_model == "last":
                 model.save(model_path)
         else:
@@ -651,14 +635,14 @@ def __call__(self, param):
                                                                    save_best_only=True)]
                 eval_m, history_elem = _train_and_eval_single(train=subset(train, train_idx),
                                                               valid=subset(train, valid_idx),
+                                                              test=test,
                                                               model=model,
                                                               epochs=param["fit"]["epochs"],
                                                               batch_size=param["fit"]["batch_size"],
                                                               use_weight=param["fit"].get("use_weight", False),
                                                               callbacks=c_callbacks,
                                                               eval_best=self.save_model == "best",
-                                                              add_eval_metrics=self.add_eval_metrics,
-                                                              custom_objects=self.custom_objects)
+                                                              add_eval_metrics=self.add_eval_metrics)
                 print("\n")
                 eval_metrics_list.append(eval_m)
                 history.append(history_elem)
@@ -784,3 +768,4 @@ def to_str(v):
             return str(v)
 
     return ";".join([k + "=" + to_str(v) for k, v in d.items()])
+

From 9b9e6926077d559f5f546025ec5b8120c36ba14e Mon Sep 17 00:00:00 2001
From: matuseviciute
Date: Mon, 12 Mar 2018 12:37:39 +0100
Subject: [PATCH 2/2] Update hyopt.py

---
 kopt/hyopt.py | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/kopt/hyopt.py b/kopt/hyopt.py
index 2c587c2..afbcf1c 100644
--- a/kopt/hyopt.py
+++ b/kopt/hyopt.py
@@ -27,12 +27,9 @@
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 
-# TODO - have a system-wide config for this
-DEFAULT_IP = "ouga03"
-DEFAULT_SAVE_DIR = "/s/project/deepcis/hyperopt/"
 
-def test_fn(fn, hyper_params, n_train=1000, save_model=None, tmp_dir="/tmp/hopt_test/", custom_objects=None):
+def test_fn(fn, hyper_params, n_train=1000, save_model=None, tmp_dir="/tmp/kopt_test/", custom_objects=None):
     """Test the correctness of the compiled objective function (CompileFN). I will
     also test model saving/loading from disk.
     # Arguments
@@ -81,7 +78,7 @@ def new_data_fn(*args, **kwargs):
     load_model(model_path, custom_objects=custom_objects)
 
 
-class CMongoTrials(MongoTrials):
+class KMongoTrials(MongoTrials):
     """`hyperopt.MonoTrials` extended with the following methods:
     - get_trial(tid) - Retrieve trial by tid (Trial ID).
     - get_param(tid) - Retrieve used hyper-parameters for a trial.
@@ -107,13 +104,13 @@ class CMongoTrials(MongoTrials):
     """
 
     def __init__(self, db_name, exp_name,
-                 ip=DEFAULT_IP, port=1234, kill_timeout=None, **kwargs):
+                 ip=db_host(), port=db_port(), kill_timeout=None, **kwargs):
         self.kill_timeout = kill_timeout
         if self.kill_timeout is not None and self.kill_timeout < 60:
             logger.warning("kill_timeout < 60 -> Very short time for " +
                            "each job to complete before it gets killed!")
-        super(CMongoTrials, self).__init__(
+        super(KMongoTrials, self).__init__(
             'mongo://{ip}:{p}/{n}/jobs'.format(ip=ip, p=port, n=db_name), exp_key=exp_name, **kwargs)
 
     def get_trial(self, tid):
@@ -173,7 +170,7 @@ def count_by_state_unsynced(self, arg):
         """
         if self.kill_timeout is not None:
             self.delete_running(self.kill_timeout)
-        return super(CMongoTrials, self).count_by_state_unsynced(arg)
+        return super(KMongoTrials, self).count_by_state_unsynced(arg)
 
     def delete_running(self, timeout_last_refresh=0, dry_run=False):
         """Delete jobs stalled in the running state for too long
@@ -256,7 +253,7 @@ def plot_history(self, tid, scores=["loss", "f1", "accuracy"],
             plt.legend(loc='best')
         return fig
 
-    def load_model(self, tid):
+    def load_model(self, tid, custom_objects=None):
         """Load saved keras model of the trial.
         If tid = None, get the best model
         Not applicable for trials ran in cross validion (i.e. not applicable
@@ -266,7 +263,7 @@ def load_model(self, tid):
             tid = self.best_trial_tid()
 
         model_path = self.get_trial(tid)["result"]["path"]["model"]
-        return load_model(model_path)
+        return load_model(model_path, custom_objects=custom_objects)
 
     def n_ok(self):
         """Number of ok trials()
@@ -327,7 +324,7 @@ def add_n_epoch(df):
 # --------------------------------------------
 def _train_and_eval_single(train, valid, test, model,
                            batch_size=32, epochs=300, use_weight=False,
-                           callbacks=[], eval_best=False, add_eval_metrics={}):
+                           callbacks=[], eval_best=False, add_eval_metrics={}, custom_objects=None):
     """Fit and evaluate a keras model
     eval_best: if True, load the checkpointed model for evaluation
     """
@@ -358,7 +355,7 @@ def _format_keras_history(history):
     if eval_best:
         mcp = [x for x in callbacks if isinstance(x, ModelCheckpoint)]
         assert len(mcp) == 1
-        model = load_model(mcp[0].filepath)
+        model = load_model(mcp[0].filepath, custom_objects=custom_objects)
 
     return eval_model(model, valid, test, add_eval_metrics), hist
 
@@ -370,7 +367,7 @@ def eval_model(model, valid, test, add_eval_metrics={}):
         test: test-dataset. Tuple of inputs `x` and target `y` - `(x, y)`.
         add_eval_metrics: Additional evaluation metrics to use. Can be a dictionary or a list of functions
                           accepting arguments: `y_true`, `y_predicted`. Alternatively, you can provide names of functions from
-                          the `concise.eval_metrics` module.
+                          the `kopt.eval_metrics` module.
     # Returns
         dictionary with evaluation metrics
     """
@@ -417,14 +414,14 @@ class CompileFN():
     - evaluates the model on the validation set
     - reports the performance metric on the validation set as the objective loss
     # Arguments
-        db_name: Database name of the CMongoTrials.
-        exp_name: Experiment name of the CMongoTrials.
+        db_name: Database name of the KMongoTrials.
+        exp_name: Experiment name of the KMongoTrials.
         data_fn: Tuple containing training data as the x,y pair at the first (index=0) element:
            `((train_x, test_y), ...)`. If `valid_split` and `cv_n_folds` are both `None`,
            the second (index=1) tuple is used as the validation dataset.
         add_eval_metrics: Additional list of (global) evaluation metrics. Individual elements can be
-            a string (referring to concise.eval_metrics)
+            a string (referring to kopt.eval_metrics)
            or a function taking two numpy arrays: `y_true`, `y_pred`.
            These metrics are ment to supplement those specified in `model.compile(.., metrics = .)`.
@@ -468,7 +465,8 @@ def __init__(self, db_name, exp_name,
                  use_tensorboard=False,
                  save_model="best",
                  save_results=True,
-                 save_dir=DEFAULT_SAVE_DIR,
+                 save_dir=save_dir(),
+                 custom_objects=None,
                  **kwargs
                  ):
         self.data_fn = data_fn
@@ -491,7 +489,7 @@ def __init__(self, db_name, exp_name,
 
         add_arguments = set(kwargs.keys()).difference(possible_kwargs)
         if len(add_arguments) > 0:
-            raise ValueError("Unknown argument(s) {0}. **kwargs accepts only arguments: {0}. ".
+            raise ValueError("Unknown argument(s) {0}. **kwargs accepts only arguments: {1}. ".
                              format(add_arguments, possible_kwargs))
 
         self.optim_metric = optim_metric
@@ -512,6 +510,8 @@ def __init__(self, db_name, exp_name,
         self.save_dir = save_dir
         self.save_model = save_model if save_model is not None else ""
         self.save_results = save_results
+        # loading
+        self.custom_objects = custom_objects
 
         # backcompatibility
         if self.save_model is True:
@@ -557,7 +557,8 @@ def __call__(self, param):
         # setup paths for storing the data - TODO check if we can somehow get the id from hyperopt
         rid = str(uuid4())
         tm_dir = self.save_dir_exp + "/train_models/"
-        os.makedirs(tm_dir, exist_ok=True)
+        if not os.path.exists(tm_dir):
+            os.makedirs(tm_dir)
         model_path = tm_dir + "{0}.h5".format(rid) if self.save_model else ""
         results_path = tm_dir + "{0}.json".format(rid) if self.save_results else ""
@@ -613,7 +614,8 @@ def __call__(self, param):
                                                            use_weight=param["fit"].get("use_weight", False),
                                                            callbacks=c_callbacks,
                                                            eval_best=self.save_model == "best",
-                                                           add_eval_metrics=self.add_eval_metrics)
+                                                           add_eval_metrics=self.add_eval_metrics,
+                                                           custom_objects=self.custom_objects)
             if self.save_model == "last":
                 model.save(model_path)
         else:
@@ -642,7 +644,8 @@ def __call__(self, param):
                                                               use_weight=param["fit"].get("use_weight", False),
                                                               callbacks=c_callbacks,
                                                               eval_best=self.save_model == "best",
-                                                              add_eval_metrics=self.add_eval_metrics)
+                                                              add_eval_metrics=self.add_eval_metrics,
+                                                              custom_objects=self.custom_objects)
                 print("\n")
                 eval_metrics_list.append(eval_m)
                 history.append(history_elem)
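
A minimal usage sketch of the API touched by these two patches, not part of the diff itself: it assumes hypothetical helper modules my_data / my_model and a hypothetical custom Keras layer MyLayer, while CompileFN, test_fn, KMongoTrials, the {"model": ..., "fit": ...} parameter layout and the custom_objects plumbing reinstated in PATCH 2/2 all come from kopt/hyopt.py above; the model_fn argument name is assumed from kopt's documented usage.

from hyperopt import fmin, tpe, hp
from kopt.hyopt import CompileFN, KMongoTrials, test_fn
import my_data, my_model              # hypothetical user modules providing my_data.data() and my_model.build_model()
from my_model import MyLayer          # hypothetical custom Keras layer saved inside the checkpoints

# Objective: trains on the training set and reports the chosen metric on the validation set.
objective = CompileFN("mydb", "exp1",
                      data_fn=my_data.data,
                      model_fn=my_model.build_model,        # argument name assumed, see note above
                      add_eval_metrics=["auprc"],
                      optim_metric="auprc",
                      custom_objects={"MyLayer": MyLayer})  # forwarded to load_model() after PATCH 2/2

hyper_params = {"model": {"lr": hp.loguniform("lr", -6, -2)},
                "fit": {"epochs": 20, "batch_size": 32}}

test_fn(objective, hyper_params)      # quick sanity check of the objective on a small data subset

trials = KMongoTrials("mydb", "exp1", kill_timeout=30 * 60)
best = fmin(objective, hyper_params, trials=trials, algo=tpe.suggest, max_evals=10)
model = trials.load_model(None, custom_objects={"MyLayer": MyLayer})  # tid=None loads the best trial

The custom_objects mapping is supplied twice on purpose: once when compiling the objective, so checkpoints containing MyLayer can be re-read during training when eval_best is used, and once when reloading the winning model from the trials database.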