Add the ability to read directly from hdf5 files (for large datasets) as well as numpy arrays. #12

Open · wants to merge 5 commits into base: master
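In short: `CompileFN` gains a `data_format` kwarg (`'npy'` by default, `'hdf5'` for h5py-backed data), which is forwarded to `_train_and_eval_single` so that `data_fn` can return open h5py datasets instead of in-memory numpy arrays. A minimal usage sketch, mirroring the test added below (the top-level import path is assumed; the `data.data_hdf5` and `model.build_model_hdf5` helpers are the ones added in this PR):

```python
from kopt import CompileFN          # assumed top-level import
from hyperopt import fmin, tpe, hp, Trials

from tests import data, model      # helpers added in this PR

# data_fn returns h5py datasets, so tell kopt to fit with data_format='hdf5'
fn = CompileFN("my_db", "my_experiment",
               data_fn=data.data_hdf5,
               model_fn=model.build_model_hdf5,
               optim_metric="acc",
               optim_metric_mode="max",
               valid_split=.1,
               stratified=False,
               random_state=True,
               save_dir="/tmp/",
               data_format='hdf5')  # new kwarg; defaults to 'npy'

hyper_params = {"data": {},
                "model": {"n_filters": hp.choice("m_n_filters", (2, 5))},
                "fit": {"epochs": 1}}
best = fmin(fn, hyper_params, trials=Trials(), algo=tpe.suggest, max_evals=2)
```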
45 changes: 35 additions & 10 deletions kopt/hyopt.py
@@ -334,7 +334,7 @@ def add_n_epoch(df):
# TODO - put to a separate module
def _train_and_eval_single(train, valid, model,
batch_size=32, epochs=300, use_weight=False,
callbacks=[], eval_best=False, add_eval_metrics={}, custom_objects=None):
callbacks=[], eval_best=False, add_eval_metrics={}, custom_objects=None, data_format='npy'):
"""Fit and evaluate a keras model

eval_best: if True, load the checkpointed model for evaluation
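data_format: 'npy' (default) to fit from in-memory numpy arrays, 'hdf5' to fit directly from h5py datasets (shuffling is then done per batch)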
@@ -352,14 +352,30 @@ def _format_keras_history(history):
# train the model
logger.info("Fit...")
history = History()
model.fit(train[0], train[1],
batch_size=batch_size,
validation_data=valid[:2],
epochs=epochs,
sample_weight=sample_weight,
verbose=2,
callbacks=[history] + callbacks)

# if we're using numpy arrays
if data_format == 'npy':
model.fit(train[0], train[1],
batch_size=batch_size,
validation_data=valid[:2],
epochs=epochs,
sample_weight=sample_weight,
verbose=2,
callbacks=[history] + callbacks)
# if we're using h5 files
elif data_format == 'hdf5':
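# h5py datasets can only be indexed in increasing order, so shuffle batch-sized chunks ('batch') instead of individual samples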
model.fit(train[0], train[1],
batch_size=batch_size,
validation_data=valid[:2],
epochs=epochs,
sample_weight=sample_weight,
verbose=2,
callbacks=[history] + callbacks,
shuffle='batch')
# otherwise, the requested format is unsupported
else:
raise ValueError("Unsupported data_format: {0}. Use 'npy' (numpy arrays, default) or 'hdf5'.".format(data_format))
# get history
hist = _format_keras_history(history)
# load and eval the best model
@@ -503,9 +519,17 @@ def __init__(self, db_name, exp_name,
optim_metric = kwargs["loss_metric"]
if "loss_metric_mode" in kwargs and optim_metric_mode == "min":
optim_metric_mode = kwargs["loss_metric_mode"]
possible_kwargs = ["loss_metric", "loss_metric_mode"]

# add in additional kwarg to handle reading directly from h5py
if 'data_format' in kwargs:
self.data_format = kwargs['data_format']
else:
self.data_format = 'npy'
possible_kwargs = ["loss_metric", "loss_metric_mode", "data_format"]
add_arguments = set(kwargs.keys()).difference(possible_kwargs)

# add in ability to handle reading from hdf5

if len(add_arguments) > 0:
raise ValueError("Unknown argument(s) {0}. **kwargs accepts only arguments: {1}. ".
format(add_arguments, possible_kwargs))
@@ -629,7 +653,8 @@ def __call__(self, param):
callbacks=c_callbacks,
eval_best=self.save_model == "best",
add_eval_metrics=self.add_eval_metrics,
custom_objects=self.custom_objects)
custom_objects=self.custom_objects,
data_format=self.data_format)
if self.save_model == "last":
model.save(model_path)
else:
17 changes: 15 additions & 2 deletions tests/data.py
@@ -1,6 +1,6 @@
from keras.preprocessing import sequence
from keras.datasets import imdb

import h5py

def data(max_features=5000, maxlen=400):
print('Loading data...')
@@ -20,4 +20,17 @@ def data(max_features=5000, maxlen=400):
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
return (x_train, y_train, [1, 2, 3, "dummy_data"]), (x_test, y_test)

def data_hdf5():
'''
Load training and testing data (the keras.datasets.cifar10 arrays) from an HDF5 file.

Returns
-------
Tuple of tuples (x_train, y_train), (x_test, y_test) of open h5py datasets
'''
print('Loading data...')
# open read-only in SWMR mode so other processes can open and read the file concurrently
f = h5py.File('tests/data/data.h5', 'r', libver='latest', swmr=True)
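# note: the handle is intentionally left open; the returned h5py datasets are only valid while the file stays open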
return (f['x_train'], f['y_train']), (f['x_test'], f['y_test'])
Binary file added tests/data/data.h5
Binary file not shown.
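The committed tests/data/data.h5 is binary and not viewable in the diff. Going by the data_hdf5() docstring and the dataset names it reads, a file with the expected layout could be regenerated with a small script along these lines (a hypothetical helper, not part of this PR; the subset sizes and label encoding of the real file are unknown):

```python
# make_data_h5.py -- hypothetical script to (re)create tests/data/data.h5
import h5py
from keras.datasets import cifar10

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# keep the fixture small; a few hundred images suffice for a smoke test
n_train, n_test = 500, 100

with h5py.File('tests/data/data.h5', 'w', libver='latest') as f:
    f.create_dataset('x_train', data=x_train[:n_train])
    f.create_dataset('y_train', data=y_train[:n_train])
    f.create_dataset('x_test', data=x_test[:n_test])
    f.create_dataset('y_test', data=y_test[:n_test])
```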
28 changes: 27 additions & 1 deletion tests/model.py
@@ -4,7 +4,7 @@
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Conv1D, Conv2D, Flatten, GlobalMaxPooling1D
from keras.datasets import imdb

# set parameters:
@@ -46,6 +46,32 @@ def build_model(train_data, max_features=5000, maxlen=400,
optimizer='adam',
metrics=['accuracy'])
return model

def build_model_hdf5(train_data,
n_convolutions=3,
batch_size=32,
n_filters=250,
kernel_size=3,
hidden_dims=250):
print('Build model...')
model = Sequential()

# stack n_convolutions Conv2D layers over the image input
for i in range(n_convolutions):
model.add(Conv2D(n_filters, kernel_size, padding='same', activation='relu', strides=1))
# flatten the convolutional feature maps before the dense layers
model.add(Flatten())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Activation('relu'))

# We project onto a 10-unit output layer, and squash it with a sigmoid:
model.add(Dense(10))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
return model
# model.fit(x_train, y_train,
# batch_size=batch_size,
# epochs=epochs,
31 changes: 31 additions & 0 deletions tests/test_hyopt.py
@@ -39,6 +39,37 @@ def test_argument_compileCN():
unknown_arg=3)


def test_compilefn_train_test_split_h5py(tmpdir):
'''
Test out a kopt optimization when loading data from h5py.
'''
db_name = "test"
exp_name = "test2"
fn = CompileFN(db_name, exp_name,
data_fn=data.data_hdf5,
model_fn=model.build_model_hdf5,
optim_metric="acc",
optim_metric_mode="max",
# eval
valid_split=.1,
stratified=False,
random_state=True,
save_dir="/tmp/",
data_format='hdf5')
hyper_params = {
"data": {},
"model": {"n_filters": hp.choice("m_n_filters", (2, 5)),
"n_convolutions": hp.choice("m_n_convolutions", (1, 3)),
"kernel_size": hp.choice("m_kernel_size", (2, 5)),
"hidden_dims": 3,
},
"fit": {"epochs": 1}
}
fn_test(fn, hyper_params, tmp_dir=str(tmpdir))
trials = Trials()
best = fmin(fn, hyper_params, trials=trials, algo=tpe.suggest, max_evals=2)
assert isinstance(best, dict)

def test_compilefn_train_test_split(tmpdir):
db_name = "test"
exp_name = "test2"
Expand Down