Add the ability to read directly from hdf5 files (for large datasets) as well as numpy arrays. #12

Open · wants to merge 5 commits into base: master
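In short: `CompileFN` gains a `data_format` kwarg (`'npy'` by default, `'hdf5'` for h5py-backed data), which is forwarded to `_train_and_eval_single` so that `data_fn` can return open h5py datasets instead of in-memory numpy arrays. A minimal usage sketch, mirroring the test added below (the top-level import path is assumed; the `data.data_hdf5` and `model.build_model_hdf5` helpers are the ones added in this PR):

```python
from kopt import CompileFN          # assumed top-level import
from hyperopt import fmin, tpe, hp, Trials

from tests import data, model      # helpers added in this PR

# data_fn returns h5py datasets, so tell kopt to fit with data_format='hdf5'
fn = CompileFN("my_db", "my_experiment",
               data_fn=data.data_hdf5,
               model_fn=model.build_model_hdf5,
               optim_metric="acc",
               optim_metric_mode="max",
               valid_split=.1,
               stratified=False,
               random_state=True,
               save_dir="/tmp/",
               data_format='hdf5')  # new kwarg; defaults to 'npy'

hyper_params = {"data": {},
                "model": {"n_filters": hp.choice("m_n_filters", (2, 5))},
                "fit": {"epochs": 1}}
best = fmin(fn, hyper_params, trials=Trials(), algo=tpe.suggest, max_evals=2)
```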
45 changes: 35 additions & 10 deletions kopt/hyopt.py
@@ -334,7 +334,7 @@ def add_n_epoch(df):
# TODO - put to a separate module
def _train_and_eval_single(train, valid, model,
batch_size=32, epochs=300, use_weight=False,
callbacks=[], eval_best=False, add_eval_metrics={}, custom_objects=None):
callbacks=[], eval_best=False, add_eval_metrics={}, custom_objects=None, data_format='npy'):
"""Fit and evaluate a keras model

eval_best: if True, load the checkpointed model for evaluation
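data_format: 'npy' (default) to fit from in-memory numpy arrays, 'hdf5' to fit directly from h5py datasets (shuffling is then done per batch)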
@@ -352,14 +352,30 @@ def _format_keras_history(history):
# train the model
logger.info("Fit...")
history = History()
model.fit(train[0], train[1],
batch_size=batch_size,
validation_data=valid[:2],
epochs=epochs,
sample_weight=sample_weight,
verbose=2,
callbacks=[history] + callbacks)

# if we're using numpy arrays
if data_format == 'npy':
model.fit(train[0], train[1],
batch_size=batch_size,
validation_data=valid[:2],
epochs=epochs,
sample_weight=sample_weight,
verbose=2,
callbacks=[history] + callbacks)
# if we're using h5 files
elif data_format == 'hdf5':
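# h5py datasets can only be indexed in increasing order, so shuffle batch-sized chunks ('batch') instead of individual samples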
model.fit(train[0], train[1],
batch_size=batch_size,
validation_data=valid[:2],
epochs=epochs,
sample_weight=sample_weight,
verbose=2,
callbacks=[history] + callbacks,
shuffle='batch')
# otherwise, the requested format is unsupported
else:
raise ValueError("Unsupported data_format: {0}. Use 'npy' (numpy arrays, default) or 'hdf5'.".format(data_format))
# get history
hist = _format_keras_history(history)
# load and eval the best model
@@ -503,9 +519,17 @@ def __init__(self, db_name, exp_name,
optim_metric = kwargs["loss_metric"]
if "loss_metric_mode" in kwargs and optim_metric_mode == "min":
optim_metric_mode = kwargs["loss_metric_mode"]
possible_kwargs = ["loss_metric", "loss_metric_mode"]

# add in additional kwarg to handle reading directly from h5py
if 'data_format' in kwargs:
self.data_format = kwargs['data_format']
else:
self.data_format = 'npy'
possible_kwargs = ["loss_metric", "loss_metric_mode", "data_format"]
add_arguments = set(kwargs.keys()).difference(possible_kwargs)

# add in ability to handle reading from hdf5

if len(add_arguments) > 0:
raise ValueError("Unknown argument(s) {0}. **kwargs accepts only arguments: {1}. ".
format(add_arguments, possible_kwargs))
@@ -629,7 +653,8 @@ def __call__(self, param):
callbacks=c_callbacks,
eval_best=self.save_model == "best",
add_eval_metrics=self.add_eval_metrics,
custom_objects=self.custom_objects)
custom_objects=self.custom_objects,
data_format=self.data_format)
if self.save_model == "last":
model.save(model_path)
else:
17 changes: 15 additions & 2 deletions tests/data.py
@@ -1,6 +1,6 @@
from keras.preprocessing import sequence
from keras.datasets import imdb

import h5py

def data(max_features=5000, maxlen=400):
print('Loading data...')
@@ -20,4 +20,17 @@ def data(max_features=5000, maxlen=400):
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
return (x_train, y_train, [1, 2, 3, "dummy_data"]), (x_test, y_test)

def data_hdf5():
'''
Load training and testing data (the keras.datasets.cifar10 arrays) from an HDF5 file.

Returns
-------
Tuple of tuples (x_train, y_train), (x_test, y_test) of open h5py datasets
'''
print('Loading data...')
# open read-only in SWMR mode so other processes can open and read the file concurrently
f = h5py.File('tests/data/data.h5', 'r', libver='latest', swmr=True)
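# note: the handle is intentionally left open; the returned h5py datasets are only valid while the file stays open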
return (f['x_train'], f['y_train']), (f['x_test'], f['y_test'])
Binary file added tests/data/data.h5
Binary file not shown.
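The committed tests/data/data.h5 is binary and not viewable in the diff. Going by the data_hdf5() docstring and the dataset names it reads, a file with the expected layout could be regenerated with a small script along these lines (a hypothetical helper, not part of this PR; the subset sizes and label encoding of the real file are unknown):

```python
# make_data_h5.py -- hypothetical script to (re)create tests/data/data.h5
import h5py
from keras.datasets import cifar10

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# keep the fixture small; a few hundred images suffice for a smoke test
n_train, n_test = 500, 100

with h5py.File('tests/data/data.h5', 'w', libver='latest') as f:
    f.create_dataset('x_train', data=x_train[:n_train])
    f.create_dataset('y_train', data=y_train[:n_train])
    f.create_dataset('x_test', data=x_test[:n_test])
    f.create_dataset('y_test', data=y_test[:n_test])
```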
28 changes: 27 additions & 1 deletion tests/model.py
@@ -4,7 +4,7 @@
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Conv1D, Conv2D, Flatten, GlobalMaxPooling1D
from keras.datasets import imdb

# set parameters:
@@ -46,6 +46,32 @@ def build_model(train_data, max_features=5000, maxlen=400,
optimizer='adam',
metrics=['accuracy'])
return model

def build_model_hdf5(train_data,
n_convolutions=3,
batch_size=32,
n_filters=250,
kernel_size=3,
hidden_dims=250):
print('Build model...')
model = Sequential()

# stack n_convolutions Conv2D layers over the image input
for i in range(n_convolutions):
model.add(Conv2D(n_filters, kernel_size, padding='same', activation='relu', strides=1))
# flatten the convolutional feature maps before the dense layers
model.add(Flatten())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Activation('relu'))

# We project onto a 10-unit output layer, and squash it with a sigmoid:
model.add(Dense(10))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
return model
# model.fit(x_train, y_train,
# batch_size=batch_size,
# epochs=epochs,
31 changes: 31 additions & 0 deletions tests/test_hyopt.py
@@ -39,6 +39,37 @@ def test_argument_compileCN():
unknown_arg=3)


def test_compilefn_train_test_split_h5py(tmpdir):
'''
Test out a kopt optimization when loading data from h5py.
'''
db_name = "test"
exp_name = "test2"
fn = CompileFN(db_name, exp_name,
data_fn=data.data_hdf5,
model_fn=model.build_model_hdf5,
optim_metric="acc",
optim_metric_mode="max",
# eval
valid_split=.1,
stratified=False,
random_state=True,
save_dir="/tmp/",
data_format='hdf5')
hyper_params = {
"data": {},
"model": {"n_filters": hp.choice("m_n_filters", (2, 5)),
"n_convolutions": hp.choice("m_n_convolutions", (1, 3)),
"kernel_size": hp.choice("m_kernel_size", (2, 5)),
"hidden_dims": 3,
},
"fit": {"epochs": 1}
}
fn_test(fn, hyper_params, tmp_dir=str(tmpdir))
trials = Trials()
best = fmin(fn, hyper_params, trials=trials, algo=tpe.suggest, max_evals=2)
assert isinstance(best, dict)

def test_compilefn_train_test_split(tmpdir):
db_name = "test"
exp_name = "test2"
Expand Down