From d2bc5a81fd3b5173b3e4fb0e042c81c58a17a617 Mon Sep 17 00:00:00 2001 From: Marcel Schoengens Date: Mon, 7 Aug 2017 17:43:34 +0200 Subject: [PATCH 01/14] Adapted VERSION file. --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ee1372d3..be586341 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.2 +0.3 From 60b90d2e0bcb712a4451f351f34e65189dc8ef29 Mon Sep 17 00:00:00 2001 From: Marcel Schoengens Date: Mon, 7 Aug 2017 17:55:33 +0200 Subject: [PATCH 02/14] Updated module description. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 533b2e33..138d3680 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ version=version, description='A framework for approximate Bayesian computation (ABC) that speeds up inference by parallelizing computation on single computers or whole clusters.', - long_description='ABCpy is a highly modular, scientific library for approximate Bayesian computation (ABC) written in Python using the parallel computation framework Apache SPARK. The modularity helps domain scientists to easily apply ABC to their research without being ABC experts; using ABCpy they can easily run large parallel simulations without much knowledge about parallelization, even without much additional effort to parallelize their code. Further, ABCpy enables ABC experts to easily develop new inference schemes and evaluate them in a standardized environment, and to extend the library with new algorithms. These benefits come mainly from the modularity of ABCpy.', + long_description='ABCpy is a highly modular, scientific library for approximate Bayesian computation (ABC) written in Python. It is designed to run all included ABC algorithms in parallel, either using multiple cores of a single computer or using an Apache Spark or MPI enabled cluster. The modularity helps domain scientists to easily apply ABC to their research without being ABC experts; using ABCpy they can easily run large parallel simulations without much knowledge about parallelization, even without much additional effort to parallelize their code. Further, ABCpy enables ABC experts to easily develop new inference schemes and evaluate them in a standardized environment, and to extend the library with new algorithms. These benefits come mainly from the modularity of ABCpy.', # The project's main homepage. 
url='https://github.com/eth-cscs/abcpy',

From d76ac8015c244624d669f4ca1568e02ec2c67802 Mon Sep 17 00:00:00 2001
From: statrita
Date: Fri, 1 Sep 2017 14:01:58 +0200
Subject: [PATCH 03/14] Fixed bug in semiautomatic summary selection

---
 abcpy/summaryselections.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/abcpy/summaryselections.py b/abcpy/summaryselections.py
index 741ea891..f6f9901a 100644
--- a/abcpy/summaryselections.py
+++ b/abcpy/summaryselections.py
@@ -66,12 +66,12 @@ def __init__(self, model, statistics_calc, backend, n_samples = 1000, seed = Non
         regr = linear_model.LinearRegression(fit_intercept=True)
         for ind in range(sample_parameters.shape[1]):
             regr.fit(sample_statistics, sample_parameters[:,ind])
-            self.coefficients_learnt[:,ind] = regr.coef_
+            self.coefficients_learnt[ind,:] = regr.coef_
 
     def transformation(self, statistics):
-        if not statistics.shape[1] == self.coefficients_learnt.shape[0]:
+        if not statistics.shape[1] == self.coefficients_learnt.shape[1]:
             raise ValueError('Mismatch in dimension of summary statistics')
-        return np.dot(statistics,self.coefficients_learnt)
+        return np.dot(statistics,np.transpose(self.coefficients_learnt))
 
 class _RemoteContextSemiautomatic:
 
@@ -105,4 +105,4 @@ def _sample_parameter_statistics(self, seed):
         return (self.model.get_parameters(), self.stat_calc.statistics(self.model.simulate(1)))
-        
\ No newline at end of file
+

From b1bcf63e24c2989060286d3c7422cf79ac3dbdff Mon Sep 17 00:00:00 2001
From: Nicole
Date: Wed, 20 Sep 2017 10:51:37 +0200
Subject: [PATCH 04/14] Changed inferences.py to not pickle the backend and created a pickle test

---
 abcpy/inferences_getstate.py | 55 ++++++++++++++++++++++++++++++++++++
 tests/pickle_tests.py        | 22 +++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 abcpy/inferences_getstate.py
 create mode 100644 tests/pickle_tests.py

diff --git a/abcpy/inferences_getstate.py b/abcpy/inferences_getstate.py
new file mode 100644
index 00000000..9074e9b3
--- /dev/null
+++ b/abcpy/inferences_getstate.py
@@ -0,0 +1,55 @@
+import numpy as np
+from abcpy.output import Journal
+from scipy import optimize
+
+class RejectionABC:
+    def __init__(self, observations, model, distance, backend, epsilon, n_samples, n_samples_per_param, seed=None):
+        self.model = model
+        self.dist_calc = distance
+        self.backend = backend
+        self.rng = np.random.RandomState(seed)
+        self.epsilon = epsilon
+        self.observations_bds = backend.broadcast(observations)
+        self.n_samples = n_samples
+        self.n_samples_per_param = n_samples_per_param
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state['backend']
+        return state
+
+    def sample(self, observations, n_samples, n_samples_per_param, epsilon, full_output=0):
+        journal = Journal(full_output)
+        journal.configuration["n_samples"] = n_samples
+        journal.configuration["n_samples_per_param"] = n_samples_per_param
+        journal.configuration["epsilon"] = epsilon
+
+        accepted_parameters = None
+
+        # Initialize variables that need to be available remotely
+
+        # main Rejection ABC algorithm
+        seed_arr = self.rng.randint(1, n_samples * n_samples, size=n_samples, dtype=np.int32)
+        seed_pds = self.backend.parallelize(seed_arr)
+
+        accepted_parameters_pds = self.backend.map(self._sample_parameter, seed_pds)
+        accepted_parameters = self.backend.collect(accepted_parameters_pds)
+        accepted_parameters = np.array(accepted_parameters)
+
+        journal.add_parameters(accepted_parameters)
+        journal.add_weights(np.ones((n_samples, 1)))
+
+        return journal
+
+    def
_sample_parameter(self, seed):
+        distance = self.dist_calc.dist_max()
+        self.model.prior.reseed(seed)
+
+        while distance > self.epsilon:
+            # Accept new parameter value if the distance is less than epsilon
+            self.model.sample_from_prior()
+            y_sim = self.model.simulate(self.n_samples_per_param)
+            distance = self.dist_calc.distance(self.observations_bds.value(), y_sim)
+
+        return self.model.get_parameters()
+
diff --git a/tests/pickle_tests.py b/tests/pickle_tests.py
new file mode 100644
index 00000000..8e64a1ad
--- /dev/null
+++ b/tests/pickle_tests.py
@@ -0,0 +1,22 @@
+import unittest
+import cloudpickle
+import numpy as np
+
+class ToBePickled:
+    def __init__(self):
+        self.included = 5
+        self.notIncluded = np.zeros(10**5)
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state['notIncluded']
+        return state
+
+class PickleTests(unittest.TestCase):
+    def test_exclusion(self,A,string):
+        pickled_object = cloudpickle.dumps(A())
+        unpickled_object = cloudpickle.loads(pickled_object)
+        assert(not(hasattr(unpickled_object,string)))
+
+A=PickleTests()
+A.test_exclusion(ToBePickled,'notIncluded')
\ No newline at end of file

From 54c34427ea66359208aa04c9493413fa856f4931 Mon Sep 17 00:00:00 2001
From: Nicole
Date: Wed, 20 Sep 2017 11:26:37 +0200
Subject: [PATCH 05/14] Removed call of sanitize_and_pack when calling map function

---
 abcpy/backends/mpi.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/abcpy/backends/mpi.py b/abcpy/backends/mpi.py
index 52d04acc..2d1bab84 100644
--- a/abcpy/backends/mpi.py
+++ b/abcpy/backends/mpi.py
@@ -54,7 +54,7 @@ def __command_slaves(self, command, data):
         elif command == self.OP_MAP:
             #In map we receive data as (pds_id,pds_id_new,func)
             #Use cloudpickle to dump the function into a string.
-            function_packed = self.__sanitize_and_pack_func(data[2])
+            function_packed = cloudpickle.dumps(data[2])
             data_packet = (command, data[0], data[1], function_packed)
 
         elif command == self.OP_BROADCAST:
@@ -96,7 +96,6 @@ def __sanitize_and_pack_func(self, func):
         globals()['backend'] = {}
 
         function_packed = cloudpickle.dumps(func)
-
         #Reset the backend to self after it's been packed
         globals()['backend'] = self

From 92528f26e0b66d8cc445f6738f52f4e2b2aa0c83 Mon Sep 17 00:00:00 2001
From: Nicole
Date: Wed, 20 Sep 2017 17:05:23 +0200
Subject: [PATCH 06/14] Added getstate methods to all inference classes, documented the pickle test and changed the inference tests so that they use the new classes

---
 abcpy/inferences.py       | 1675 ++++++++++++++++++------------------
 tests/inferences_tests.py |   87 +-
 tests/pickle_tests.py     |   14 +
 3 files changed, 858 insertions(+), 918 deletions(-)

diff --git a/abcpy/inferences.py b/abcpy/inferences.py
index eeae277a..e413218d 100644
--- a/abcpy/inferences.py
+++ b/abcpy/inferences.py
@@ -3,117 +3,49 @@
 from scipy import optimize
 
 class RejectionABC:
-    """This base class implements the rejection algorithm based inference scheme [1] for
-    Approximate Bayesian Computation.
-
-    [1] Tavaré, S., Balding, D., Griffith, R., Donnelly, P.: Inferring coalescence
-    times from DNA sequence data. Genetics 145(2), 505–518 (1997).
-
-    Parameters
-    ----------
-    model: abcpy.models.Model
-        Model object that conforms to the Model class.
-    distance: abcpy.distances.Distance
-        Distance object that conforms to the Distance class.
-    backend: abcpy.backends.Backend
-        Backend object that conforms to the Backend class.
-    seed: integer, optional
-        Optional initial seed for the random number generator. The default value is generated randomly.
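The pattern these three patches rely on is worth spelling out: the backend (which may wrap an MPI communicator or a SparkContext and therefore cannot be serialized) is dropped in __getstate__, so cloudpickle can ship the inference object to the workers without it. Below is a minimal, self-contained sketch of the idea that tests/pickle_tests.py checks; the class and attribute names here are illustrative only and not part of ABCpy.

import cloudpickle
import numpy as np

class Sampler:
    """Toy stand-in for an inference class that holds an unpicklable backend."""
    def __init__(self, backend):
        self.backend = backend          # must not travel over the network
        self.rng = np.random.RandomState(42)

    def __getstate__(self):
        # Copy the instance dict and drop the backend before pickling;
        # cloudpickle then serializes everything else as usual.
        state = self.__dict__.copy()
        del state['backend']
        return state

sampler = Sampler(backend=object())
clone = cloudpickle.loads(cloudpickle.dumps(sampler))
assert not hasattr(clone, 'backend')    # the excluded attribute is gone
assert hasattr(clone, 'rng')            # everything else survives the round trip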
- """ - def __init__(self, model, distance, backend, seed=None): + def __init__(self, observations, model, distance, backend, epsilon, n_samples, n_samples_per_param, seed=None): self.model = model self.dist_calc = distance self.backend = backend + self.epsilon = epsilon + self.observations_bds = self.backend.broadcast(observations) + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param self.rng = np.random.RandomState(seed) + def __getstate__(self): + #tells cloudpickle to not include the backend when pickling + state = self.__dict__.copy() + del state['backend'] + return state - def sample(self, observations, n_samples, n_samples_per_param, epsilon, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed - data observations. - - Parameters - ---------- - observations : python list - The observed data set. - n_samples : integer - Number of samples to generate. - n_samples_per_param : integer - Number of data points in each simulated dataset. - epsilon: float - Value of threshold. - full_output: integer, optional - If full_output==1, intermediate results are included in output journal. - The default value is 0, meaning the intermediate results are not saved. - - Returns - ------- - abcpy.output.Journal - a journal containing simulation results, metadata and optionally intermediate results. - """ - + def sample(self, full_output=0): journal = Journal(full_output) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param - journal.configuration["epsilon"] = epsilon - - accepted_parameters = None - + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param + journal.configuration["epsilon"] = self.epsilon + + accepted_parameters = None + # Initialize variables that need to be available remotely - rc = _RemoteContextRejectionABC(self.backend, self.model, self.dist_calc, observations, n_samples, n_samples_per_param, epsilon) - - # main Rejection ABC algorithm - seed_arr = self.rng.randint(1, n_samples*n_samples, size=n_samples, dtype=np.int32) - seed_pds = self.backend.parallelize(seed_arr) - accepted_parameters_pds = self.backend.map(rc._sample_parameter, seed_pds) + # main Rejection ABC algorithm + seed_arr = self.rng.randint(1, self.n_samples * self.n_samples, size=self.n_samples, dtype=np.int32) + seed_pds = self.backend.parallelize(seed_arr) + + accepted_parameters_pds = self.backend.map(self._sample_parameter, seed_pds) accepted_parameters = self.backend.collect(accepted_parameters_pds) accepted_parameters = np.array(accepted_parameters) journal.add_parameters(accepted_parameters) - journal.add_weights(np.ones((n_samples, 1))) - - return journal - - - -class _RemoteContextRejectionABC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, dist_calc, observations, n_samples, n_samples_per_param, epsilon): - self.model = model - self.dist_calc = dist_calc - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - - self.epsilon = epsilon - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) + journal.add_weights(np.ones((self.n_samples, 1))) + return journal - def _sample_parameter(self, seed): - """ - Samples a single model parameter and simulates from it until - 
distance between simulated outcome and the observation is - smaller than eplison. - - Parameters - ---------- - seed: int - value of a seed to be used for reseeding - Returns - ------- - np.array - accepted parameter - """ - distance = self.dist_calc.dist_max() self.model.prior.reseed(seed) - + while distance > self.epsilon: # Accept new parameter value if the distance is less than epsilon self.model.sample_from_prior() @@ -123,19 +55,17 @@ def _sample_parameter(self, seed): return self.model.get_parameters() - - class PMCABC: - """This base class implements a modified version of Population Monte Carlo based inference scheme - for Approximate Bayesian computation of Beaumont et. al. [1]. Here the threshold value at `t`-th generation are adaptively chosen - by taking the maximum between the epsilon_percentile-th value of discrepancies of the accepted - parameters at `t-1`-th generation and the threshold value provided for this generation by the user. If we take the + """This base class implements a modified version of Population Monte Carlo based inference scheme + for Approximate Bayesian computation of Beaumont et. al. [1]. Here the threshold value at `t`-th generation are adaptively chosen + by taking the maximum between the epsilon_percentile-th value of discrepancies of the accepted + parameters at `t-1`-th generation and the threshold value provided for this generation by the user. If we take the value of epsilon_percentile to be zero (default), this method becomes the inference scheme described in [1], where - the threshold values considered at each generation are the ones provided by the user. - + the threshold values considered at each generation are the ones provided by the user. + [1] M. A. Beaumont. Approximate Bayesian computation in evolution and ecology. Annual Review of Ecology, Evolution, and Systematics, 41(1):379–406, Nov. 2010. - + Parameters ---------- model : abcpy.models.Model @@ -149,104 +79,138 @@ class PMCABC: seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ - def __init__(self, model, distance, kernel, backend, seed=None): + + def __init__(self, observations, model, distance, kernel, backend, n_samples=10000, n_samples_per_param=1, seed=None): + """Constructor of PMCABC inference schemes. + + Parameters + ---------- + observations : numpy.ndarray + Observed data. + model : abcpy.models.Model + Model object that conforms to the Model class + distance : abcpy.distances.Distance + Distance object that conforms to the Distance class. + kernel : abcpy.distributions.Distribution + Distribution object defining the perturbation kernel needed for the sampling + backend : abcpy.backends.Backend + Backend object that conforms to the Backend class + n_samples : integer, optional + Number of samples to generate. The default value is 10000. + n_samples_per_param : integer, optional + Number of data points in each simulated data set. The default value is 1. + seed : integer, optional + Optional initial seed for the random number generator. The default value is generated randomly. 
+ + """ self.model = model self.distance = distance self.kernel = kernel self.backend = backend + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param self.rng = np.random.RandomState(seed) - - def sample(self, observations, steps, epsilon_init, n_samples = 10000, n_samples_per_param = 1, epsilon_percentile = 0, covFactor = 2, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + self.epsilon = None + + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = self.backend.broadcast(observations) + self.accepted_parameters_bds = None + self.accepted_weights_bds = None + self.accepted_cov_mat_bds = None + + def __getstate__(self): + #tells cloudpickle to not include the backend when pickling + state = self.__dict__.copy() + del state['backend'] + return state + + def sample(self, steps, epsilon_init, epsilon_percentile=0, + covFactor=2, full_output=0): + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : numpy.ndarray + observations : numpy.ndarray Observed data. - steps : integer - Number of iterations in the sequential algoritm ("generations") - epsilon_init : numpy.ndarray + steps : integer + Number of iterations in the sequential algoritm ("generations") + epsilon_init : numpy.ndarray An array of proposed values of epsilon to be used at each steps. Can be supplied - A single value to be used as the threshold in Step 1 or a `steps`-dimensional array of values to be + A single value to be used as the threshold in Step 1 or a `steps`-dimensional array of values to be used as the threshold in evry steps. - n_samples : integer, optional - Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional - Number of data points in each simulated data set. The default value is 1. - epsilon_percentile : float, optional + + epsilon_percentile : float, optional A value between [0, 100]. The default value is 0, meaning the threshold value provided by the user being used. - covFactor : float, optional + covFactor : float, optional scaling parameter of the covariance matrix. The default value is 2 as considered in [1]. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. 
""" - + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param journal.configuration["steps"] = steps journal.configuration["epsilon_percentile"] = epsilon_percentile - + accepted_parameters = None accepted_weights = None accepted_cov_mat = None - #Define epsilon_arr + # Define epsilon_arr if len(epsilon_init) == steps: epsilon_arr = epsilon_init else: if len(epsilon_init) == 1: - epsilon_arr = [None]*steps - epsilon_arr[0] = epsilon_init + epsilon_arr = [None] * steps + epsilon_arr[0] = epsilon_init else: raise ValueError("The length of epsilon_init can only be of 1 or steps.") - - - # Initialize variables that need to be available remotely - rc = _RemoteContextPMCABC(self.backend, self.model, self.distance, self.kernel, observations, n_samples, n_samples_per_param) # main PMCABC algorithm # print("INFO: Starting PMCABC iterations.") for aStep in range(0, steps): # print("DEBUG: Iteration " + str(aStep) + " of PMCABC algorithm.") - seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_samples, dtype=np.uint32) + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=self.n_samples, dtype=np.uint32) seed_pds = self.backend.parallelize(seed_arr) # 0: update remotely required variables # print("INFO: Broadcasting parameters.") - rc.epsilon = epsilon_arr[aStep] - rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat) + self.epsilon = epsilon_arr[aStep] + self._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat) # 1: calculate resample parameters # print("INFO: Resampling parameters") - params_and_dists_and_ysim_pds = self.backend.map(rc._resample_parameter, seed_pds) + params_and_dists_and_ysim_pds = self.backend.map(self._resample_parameter, seed_pds) params_and_dists_and_ysim = self.backend.collect(params_and_dists_and_ysim_pds) new_parameters, distances = [list(t) for t in zip(*params_and_dists_and_ysim)] new_parameters = np.array(new_parameters) - rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat) - + self._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat) + # Compute epsilon for next step # print("INFO: Calculating acceptance threshold (epsilon).") - if aStep < steps-1: + if aStep < steps - 1: if epsilon_arr[aStep + 1] == None: epsilon_arr[aStep + 1] = np.percentile(distances, epsilon_percentile) else: - epsilon_arr[aStep + 1] = np.max([np.percentile(distances, epsilon_percentile), epsilon_arr[aStep+1]]) + epsilon_arr[aStep + 1] = np.max( + [np.percentile(distances, epsilon_percentile), epsilon_arr[aStep + 1]]) - # 2: calculate weights for new parameters + # 2: calculate weights for new parameters # print("INFO: Calculating weights.") new_parameters_pds = self.backend.parallelize(new_parameters) - new_weights_pds = self.backend.map(rc._calculate_weight, new_parameters_pds) - new_weights = np.array(self.backend.collect(new_weights_pds)).reshape(-1,1) + new_weights_pds = self.backend.map(self._calculate_weight, new_parameters_pds) + new_weights = np.array(self.backend.collect(new_weights_pds)).reshape(-1, 1) sum_of_weights = 0.0 for w in new_weights: sum_of_weights += w @@ -254,58 +218,35 @@ def sample(self, 
observations, steps, epsilon_init, n_samples = 10000, n_samples # 3: calculate covariance # print("INFO: Calculating covariance matrix.") - new_cov_mat = covFactor * np.cov(new_parameters, aweights = new_weights.reshape(-1), rowvar=False) - + new_cov_mat = covFactor * np.cov(new_parameters, aweights=new_weights.reshape(-1), rowvar=False) + # 4: Update the newly computed values accepted_parameters = new_parameters accepted_weights = new_weights accepted_cov_mat = new_cov_mat # print("INFO: Saving configuration to output journal.") - if (full_output == 1 and aStep <= steps-1) or (full_output == 0 and aStep == steps-1): + if (full_output == 1 and aStep <= steps - 1) or (full_output == 0 and aStep == steps - 1): journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) - - #Add epsilon_arr to the journal + + # Add epsilon_arr to the journal journal.configuration["epsilon_arr"] = epsilon_arr return journal -class _RemoteContextPMCABC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, distance, kernel, observations, n_samples, n_samples_per_param): - self.model = model - self.distance = distance - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - - self.epsilon = None - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.accepted_parameters_bds = None - self.accepted_weights_bds = None - self.accepted_cov_mat_bds = None - - - def _update_broadcasts(self, backend, accepted_parameters, accepted_weights, accepted_cov_mat): + def _update_broadcasts(self, accepted_parameters, accepted_weights, accepted_cov_mat): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - + # bc.destroy + if not accepted_parameters is None: - self.accepted_parameters_bds = backend.broadcast(accepted_parameters) + self.accepted_parameters_bds = self.backend.broadcast(accepted_parameters) if not accepted_weights is None: - self.accepted_weights_bds = backend.broadcast(accepted_weights) + self.accepted_weights_bds = self.backend.broadcast(accepted_weights) if not accepted_cov_mat is None: - self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat) - + self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) # define helper functions for map step def _resample_parameter(self, seed): @@ -313,7 +254,7 @@ def _resample_parameter(self, seed): Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is smaller than eplison. 
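The covariance update a few lines above is the other half of the kernel adaptation: the next generation's perturbation kernel uses covFactor (2 by default, following the reference in the class docstring) times the weighted sample covariance of the newly accepted particles. A minimal sketch of that computation under the same assumptions (names are illustrative):

import numpy as np

def kernel_covariance(particles, weights, covFactor=2.0):
    """Scaled, weighted sample covariance of the particle population,
    used as the covariance of the next perturbation kernel."""
    return covFactor * np.cov(particles, aweights=weights.reshape(-1), rowvar=False)

particles = np.array([[0.1, 1.0], [0.2, 0.8], [0.4, 1.1]])
weights = np.array([[0.2], [0.5], [0.3]])   # normalized weights, one per particle
print(kernel_covariance(particles, weights))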
- + :type seed: int :rtype: np.array :return: accepted parameter @@ -321,10 +262,10 @@ def _resample_parameter(self, seed): rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - + distance = self.distance.dist_max() while distance > self.epsilon: - #print("on seed " + str(seed) + " distance: " + str(distance) + " epsilon: " + str(self.epsilon)) + # print("on seed " + str(seed) + " distance: " + str(distance) + " epsilon: " + str(self.epsilon)) if self.accepted_parameters_bds == None: self.model.sample_from_prior() else: @@ -334,7 +275,7 @@ def _resample_parameter(self, seed): # truncating the normal like this is fine: https://arxiv.org/pdf/0907.4010v1.pdf while True: self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: break @@ -343,7 +284,7 @@ def _resample_parameter(self, seed): distance = self.distance.distance(self.observations_bds.value(), y_sim) return (self.model.get_parameters(), distance) - + def _calculate_weight(self, theta): """ Calculates the weight for the given parameter using @@ -353,13 +294,13 @@ def _calculate_weight(self, theta): ---------- theta: np.array 1xp matrix containing model parameter, where p is the dimension of parameter - + Returns ------- float the new weight for theta """ - + if self.accepted_weights_bds is None: return 1.0 / self.n_samples else: @@ -367,14 +308,15 @@ def _calculate_weight(self, theta): denominator = 0.0 for i in range(0, self.n_samples): - self.kernel.set_parameters([self.accepted_parameters_bds.value()[i,:], self.accepted_cov_mat_bds.value()]) + self.kernel.set_parameters( + [self.accepted_parameters_bds.value()[i, :], self.accepted_cov_mat_bds.value()]) pdf_value = self.kernel.pdf(theta) - denominator += self.accepted_weights_bds.value()[i,0] * pdf_value + denominator += self.accepted_weights_bds.value()[i, 0] * pdf_value return 1.0 * prior_prob / denominator class PMC: - """Population Monte Carlo based inference scheme of Cappé et. al. [1]. + """Population Monte Carlo based inference scheme of Cappé et. al. [1]. This algorithm assumes a likelihood function is available and can be evaluated at any parameter value given the oberved dataset. In absence of the @@ -384,26 +326,32 @@ class PMC: inference schemes are based on Andrieu and Roberts [2]. [1] Cappé, O., Guillin, A., Marin, J.-M., and Robert, C. P. (2004). Population Monte Carlo. - Journal of Computational and Graphical Statistics, 13(4), 907–929. - + Journal of Computational and Graphical Statistics, 13(4), 907–929. + [2] C. Andrieu and G. O. Roberts. The pseudo-marginal approach for efficient Monte Carlo computations. Annals of Statistics, 37(2):697–725, 04 2009. - - """ - - def __init__(self, model, likfun, kernel, backend, seed=None): + + """ + + def __init__(self, observations, model, likfun, kernel, backend, n_samples=10000, n_samples_per_param=100, seed=None): """Constructor of PMC inference schemes. Parameters ---------- + observations : numpy.ndarray + Observed data. 
        model : abcpy.models.Model
-            Model object that conforms to the Model class
        likfun : abcpy.approx_lhd.Approx_likelihood
            Approx_likelihood object that conforms to the Approx_likelihood class
        kernel : abcpy.distributions.Distribution
            Distribution object defining the perturbation kernel needed for the sampling
        backend : abcpy.backends.Backend
            Backend object that conforms to the Backend class
+        n_samples : integer, optional
+            Number of samples to generate. The default value is 10000.
+        n_samples_per_param : integer, optional
+            Number of data points in each simulated data set. The default value is 100.
        seed : integer, optional
            Optional initial seed for the random number generator. The default value is generated randomly.

        """
        self.model = model
        self.likfun = likfun
        self.kernel = kernel
        self.backend = backend
+        self.n_samples = n_samples
+        self.n_samples_per_param = n_samples_per_param
        self.rng = np.random.RandomState(seed)
-
-    def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 100, covFactor = None, iniPoints = None, full_output=0):
-        """Samples from the posterior distribution of the model parameter given the observed
+
+        # these are usually big tables, so we broadcast them to have them once
+        # per executor instead of once per task
+        self.observations_bds = self.backend.broadcast(observations)
+        self.accepted_parameters_bds = None
+        self.accepted_weights_bds = None
+        self.accepted_cov_mat_bds = None
+
+    def __getstate__(self):
+        #tells cloudpickle to not include the backend when pickling
+        state = self.__dict__.copy()
+        del state['backend']
+        return state
+
+    def sample(self, steps, covFactor=None, iniPoints=None,
+               full_output=0):
+        """Samples from the posterior distribution of the model parameter given the observed
        data observations.
-
+
        Parameters
        ----------
-        observations : python list
-            Observed data
-        steps : integer
-            number of iterations in the sequential algoritm ("generations")
-        n_sample : integer, optional
-            number of samples to generate. The default value is 10000.
-        n_samples_per_param : integer, optional
-            number of data points in each simulated data set. The default value is 100.
-        covFactor : float, optional
+        steps : integer
+            number of iterations in the sequential algorithm ("generations")
+        covFactor : float, optional
           scaling parameter of the covariance matrix. The default is a p dimensional array of 1 when p is the dimension of the parameter.
        iniPoints : numpy.ndarray, optional
           parameter values from where the sampling starts. By default sampled from the prior.
        full_output: integer, optional
           If full_output==1, intermediate results are included in output journal.
           The default value is 0, meaning the intermediate results are not saved.

        Returns
        -------
        abcpy.output.Journal
           A journal containing simulation results, metadata and optionally intermediate results.
        """
        journal = Journal(full_output)
        journal.configuration["type_model"] = type(self.model)
        journal.configuration["type_lhd_func"] = type(self.likfun)
-        journal.configuration["n_samples"] = n_samples
-        journal.configuration["n_samples_per_param"] = n_samples_per_param
+        journal.configuration["n_samples"] = self.n_samples
+        journal.configuration["n_samples_per_param"] = self.n_samples_per_param
        journal.configuration["steps"] = steps
        journal.configuration["covFactor"] = covFactor
        journal.configuration["iniPoints"] = iniPoints
-
+
        accepted_parameters = None
        accepted_weights = None
        accepted_cov_mat = None
        new_theta = None
        dim = len(self.model.get_parameters())
-
-        # Initialize variables that need to be available remotely
-        rc = _RemoteContextPMC(self.backend, self.model, self.likfun, self.kernel, observations, n_samples, n_samples_per_param)
-
+
        # Initialize particles: When not supplied, randomly draw them from prior distribution
        # Weights of particles: Assign equal weights for each of the particles
        if iniPoints == None:
-            accepted_parameters = np.zeros(shape = (n_samples,dim))
-            for ind in range(0,n_samples):
+            accepted_parameters = np.zeros(shape=(self.n_samples, dim))
+            for ind in range(0, self.n_samples):
                self.model.sample_from_prior()
-                accepted_parameters[ind,:] = self.model.get_parameters()
-            accepted_weights = np.ones((n_samples,1), dtype=np.float)/n_samples
+                accepted_parameters[ind, :] = self.model.get_parameters()
+            accepted_weights = np.ones((self.n_samples, 1), dtype=np.float) / self.n_samples
        else:
            accepted_parameters = iniPoints
-            accepted_weights = np.ones((iniPoints.shape[0],1), dtype=np.float)/iniPoints.shape[0]
-
+            accepted_weights = np.ones((iniPoints.shape[0], 1), dtype=np.float) / iniPoints.shape[0]
+
        if covFactor is None:
            covFactor = np.ones(shape=(dim,))

        # Calculate initial covariance matrix
-        accepted_cov_mat = covFactor * np.cov(accepted_parameters, aweights = accepted_weights.reshape(-1), rowvar=False)
-
-        # main SMC algorithm
+        accepted_cov_mat = covFactor * np.cov(accepted_parameters, aweights=accepted_weights.reshape(-1), rowvar=False)
+
+        # main SMC algorithm
        # print("INFO: Starting PMC iterations.")
        for aStep in range(0, steps):
            # print("DEBUG: Iteration " + str(aStep) + " of PMC algorithm.")
-
+
            # 0: update remotely required variables
            # print("INFO: Broadcasting parameters.")
-            rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat)
+            self._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat)

            # 1: calculate resample parameters
            # print("INFO: Resample parameters.")
-            index = self.rng.choice(accepted_parameters.shape[0], size = n_samples, p = accepted_weights.reshape(-1))
-            #Choose a new particle using the resampled particle (make the boundary proper)
-            #Initialize new_parameters
-            new_parameters = np.zeros((n_samples,dim), dtype=np.float)
-            for ind in range(0,n_samples):
-                self.kernel.set_parameters([accepted_parameters[index[ind],:], accepted_cov_mat])
+            index = self.rng.choice(accepted_parameters.shape[0], size=self.n_samples, p=accepted_weights.reshape(-1))
+            # Choose a new particle using the resampled particle (make the boundary proper)
+            # Initialize new_parameters
+            new_parameters = np.zeros((self.n_samples, dim), dtype=np.float)
+            for ind in range(0, self.n_samples):
+                self.kernel.set_parameters([accepted_parameters[index[ind], :], accepted_cov_mat])
                while True:
                    new_theta =
self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: - new_parameters[ind,:] = new_theta + new_parameters[ind, :] = new_theta break # 2: calculate approximate lieklihood for new parameters # print("INFO: Calculate approximate likelihood.") new_parameters_pds = self.backend.parallelize(new_parameters) - approx_likelihood_new_parameters_pds = self.backend.map(rc._approx_lik_calc, new_parameters_pds) + approx_likelihood_new_parameters_pds = self.backend.map(self._approx_lik_calc, new_parameters_pds) # print("DEBUG: Collect approximate likelihood from pds.") - approx_likelihood_new_parameters = np.array(self.backend.collect(approx_likelihood_new_parameters_pds)).reshape(-1,1) - - # 3: calculate new weights for new parameters + approx_likelihood_new_parameters = np.array( + self.backend.collect(approx_likelihood_new_parameters_pds)).reshape(-1, 1) + + # 3: calculate new weights for new parameters # print("INFO: Calculating weights.") - new_weights_pds = self.backend.map(rc._calculate_weight, new_parameters_pds) - new_weights = np.array(self.backend.collect(new_weights_pds)).reshape(-1,1) - + new_weights_pds = self.backend.map(self._calculate_weight, new_parameters_pds) + new_weights = np.array(self.backend.collect(new_weights_pds)).reshape(-1, 1) + sum_of_weights = 0.0 - for i in range(0, n_samples): - new_weights[i] = new_weights[i]*approx_likelihood_new_parameters[i] + for i in range(0, self.n_samples): + new_weights[i] = new_weights[i] * approx_likelihood_new_parameters[i] sum_of_weights += new_weights[i] new_weights = new_weights / sum_of_weights accepted_parameters = new_parameters # 4: calculate covariance # print("INFO: Calculating covariance matrix.") - new_cov_mat = covFactor * np.cov(accepted_parameters, aweights = accepted_weights.reshape(-1), rowvar=False) - + new_cov_mat = covFactor * np.cov(accepted_parameters, aweights=accepted_weights.reshape(-1), rowvar=False) + # 5: Update the newly computed values accepted_parameters = new_parameters accepted_weights = new_weights accepted_cov_mat = new_cov_mat - + # print("INFO: Saving configuration to output journal.") - if (full_output == 1 and aStep <= steps-1) or (full_output == 0 and aStep == steps-1): + if (full_output == 1 and aStep <= steps - 1) or (full_output == 0 and aStep == steps - 1): journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) journal.add_opt_values(approx_likelihood_new_parameters) return journal -class _RemoteContextPMC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, likfun, kernel, observations, n_samples, n_samples_per_param,): - self.model = model - self.likfun = likfun - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.accepted_parameters_bds = None - self.accepted_weights_bds = None - self.accepted_cov_mat_bds = None - - def _update_broadcasts(self, backend, accepted_parameters, accepted_weights, accepted_cov_mat): + def _update_broadcasts(self, accepted_parameters, accepted_weights, accepted_cov_mat): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - + # bc.destroy + if not accepted_parameters is 
None:
-            self.accepted_parameters_bds = backend.broadcast(accepted_parameters)
+            self.accepted_parameters_bds = self.backend.broadcast(accepted_parameters)
        if not accepted_weights is None:
-            self.accepted_weights_bds = backend.broadcast(accepted_weights)
+            self.accepted_weights_bds = self.backend.broadcast(accepted_weights)
        if not accepted_cov_mat is None:
-            self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat)
+            self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat)

    # define helper functions for map step
    def _approx_lik_calc(self, theta):
        """
        Compute likelihood for new parameters using approximate likelihood function
-
+
        :type theta: np.ndarray
        :rtype: np.array
        :return: likelihood values of new parameter
-        """
+        """
        # Assign theta to model
        self.model.set_parameters(theta)
@@ -599,7 +536,7 @@ def _approx_lik_calc(self, theta):
        pdf_at_theta = self.model.prior.pdf(theta)
        # print("DEBUG: prior pdf evaluated at theta is :" + str(pdf_at_theta))
        return pdf_at_theta * lhd
-
+
    def _calculate_weight(self, theta):
        """
        Calculates the weight for the given parameter using
        multivariate normal distribution

        :type theta: np.array
        :param theta: 1xp matrix containing model parameter, where p is the dimension of parameter
        :rtype: float
        :return: the new weight for theta
        """
-
+
        if self.accepted_weights_bds is None:
            return 1.0 / self.n_samples
        else:
            prior_prob = self.model.prior.pdf(theta)

            denominator = 0.0
            for i in range(0, self.n_samples):
-                self.kernel.set_parameters([self.accepted_parameters_bds.value()[i,:], self.accepted_cov_mat_bds.value()])
+                self.kernel.set_parameters(
+                    [self.accepted_parameters_bds.value()[i, :], self.accepted_cov_mat_bds.value()])
                pdf_value = self.kernel.pdf(theta)
-                denominator += self.accepted_weights_bds.value()[i,0] * pdf_value
-            return 1.0 * prior_prob / denominator
+                denominator += self.accepted_weights_bds.value()[i, 0] * pdf_value
+            return 1.0 * prior_prob / denominator

class SABC:
    """This base class implements a modified version of Simulated Annealing Approximate Bayesian Computation (SABC) of [1] when the prior is non-informative.

    [1] C. Albert, H. R. Kuensch and A. Scheidegger. A Simulated Annealing Approach to
    Approximate Bayes Computations. Statistics and Computing, (2014).

    Parameters
    ----------
    model : abcpy.models.Model
        Model object that conforms to the Model class
    distance : abcpy.distances.Distance
        Distance object that conforms to the Distance class
    kernel : abcpy.distributions.Distribution
        Distribution object defining the perturbation kernel needed for the sampling
    backend : abcpy.backends.Backend
        Backend object that conforms to the Backend class
    seed : integer, optional
        Optional initial seed for the random number generator. The default value is generated randomly.
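The _calculate_weight helper above is the standard PMC importance weight: the prior density at theta divided by the mixture of perturbation kernels centred at the previous generation's particles. A self-contained numpy/scipy sketch with Gaussian kernels follows; scipy's multivariate_normal stands in for the kernel object, and all names are illustrative rather than ABCpy API.

import numpy as np
from scipy.stats import multivariate_normal

def pmc_weight(theta, prev_particles, prev_weights, cov, prior_pdf):
    """w(theta) = prior(theta) / sum_j w_j * K(theta | theta_j, cov)."""
    denominator = sum(
        w * multivariate_normal.pdf(theta, mean=particle, cov=cov)
        for particle, w in zip(prev_particles, prev_weights))
    return prior_pdf(theta) / denominator

prev = np.array([[0.0, 0.0], [1.0, 1.0]])
weights = np.array([0.5, 0.5])
flat_prior = lambda t: 1.0
print(pmc_weight(np.array([0.5, 0.5]), prev, weights, 0.1 * np.eye(2), flat_prior))

In the PMC class above, the approximate likelihood from _approx_lik_calc is multiplied into these weights afterwards, before normalization.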
""" - def __init__(self, model, distance, kernel, backend, seed=None): + + def __init__(self, observations, model, distance, kernel, backend, n_samples=10000, n_samples_per_param=1, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + self.epsilon = None self.rng = np.random.RandomState(seed) - - def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_param = 1, beta = 2, delta = 0.2, v = 0.3, ar_cutoff = 0.5, resample = None, n_update = None, adaptcov = 1, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = self.backend.broadcast(observations) + self.accepted_parameters_bds = None + self.accepted_cov_mat_bds = None + self.smooth_distances_bds = None + self.all_distances_bds = None + + def __getstate__(self): + #tells cloudpickle to not include the backend when pickling + state = self.__dict__.copy() + del state['backend'] + return state + + def sample(self, steps, epsilon, beta=2, delta=0.2, v=0.3, + ar_cutoff=0.5, resample=None, n_update=None, adaptcov=1, full_output=0): + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : numpy.ndarray + observations : numpy.ndarray Observed data. - steps : integer - Number of maximum iterations in the sequential algoritm ("generations") - epsilon : numpy.float - An array of proposed values of epsilon to be used at each steps. + steps : integer + Number of maximum iterations in the sequential algoritm ("generations") + epsilon : numpy.float + An array of proposed values of epsilon to be used at each steps. n_samples : integer, optional Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional + n_samples_per_param : integer, optional Number of data points in each simulated data set. The default value is 1. beta : numpy.float Tuning parameter of SABC @@ -679,14 +635,14 @@ def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_ Resample after this many acceptance, The default value if n_samples n_update: int, optional Number of perturbed parameters at each step, The default value if n_samples - adaptcov : boolean, optional + adaptcov : boolean, optional Whether we adapt the covariance matrix in iteration stage. The default value TRUE. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. 
""" @@ -694,8 +650,8 @@ def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_ journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) journal.configuration["type_kernel_func"] = type(self.kernel) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param journal.configuration["beta"] = beta journal.configuration["delta"] = delta journal.configuration["v"] = v @@ -704,238 +660,210 @@ def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_ journal.configuration["n_update"] = n_update journal.configuration["adaptcov"] = adaptcov journal.configuration["full_output"] = full_output - - accepted_parameters = np.zeros(shape=(n_samples,len(self.model.get_parameters()))) - distances = np.zeros(shape=(n_samples,)) - smooth_distances = np.zeros(shape=(n_samples,)) - accepted_weights = np.ones(shape=(n_samples,1)) + + accepted_parameters = np.zeros(shape=(self.n_samples, len(self.model.get_parameters()))) + distances = np.zeros(shape=(self.n_samples,)) + smooth_distances = np.zeros(shape=(self.n_samples,)) + accepted_weights = np.ones(shape=(self.n_samples, 1)) all_distances = None - accepted_cov_mat = None + accepted_cov_mat = None if resample == None: - resample = n_samples + resample = self.n_samples if n_update == None: - n_update = n_samples + n_update = self.n_samples sample_array = np.ones(shape=(steps,)) - sample_array[0] = n_samples + sample_array[0] = self.n_samples sample_array[1:] = n_update ## Acceptance counter to determine the resampling step accept = 0 samples_until = 0 - - # Initialize variables that need to be available remotely - rc = _RemoteContextSABC(self.backend, self.model, self.distance, self.kernel, \ - observations, n_samples, n_samples_per_param) - - for aStep in range(0,steps): - # main SABC algorithm + + for aStep in range(0, steps): + # main SABC algorithm # print("INFO: Initialization of SABC") seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=int(sample_array[aStep]), dtype=np.uint32) seed_pds = self.backend.parallelize(seed_arr) # 0: update remotely required variables # print("INFO: Broadcasting parameters.") - rc.epsilon = epsilon - rc._update_broadcasts(self.backend, accepted_parameters, accepted_cov_mat, smooth_distances, all_distances) - + self.epsilon = epsilon + self._update_broadcasts(accepted_parameters, accepted_cov_mat, smooth_distances, all_distances) # 1: Calculate parameters # print("INFO: Initial accepted parameter parameters") - params_and_dists_pds = self.backend.map(rc._accept_parameter, seed_pds) + params_and_dists_pds = self.backend.map(self._accept_parameter, seed_pds) params_and_dists = self.backend.collect(params_and_dists_pds) - new_parameters, new_distances, new_all_parameters, new_all_distances, index, acceptance = [list(t) for t in zip(*params_and_dists)] - new_parameters = np.array(new_parameters) + new_parameters, new_distances, new_all_parameters, new_all_distances, index, acceptance = [list(t) for t in + zip( + *params_and_dists)] + new_parameters = np.array(new_parameters) new_distances = np.array(new_distances) new_all_distances = np.concatenate(new_all_distances) index = np.array(index) acceptance = np.array(acceptance) - - # Reading all_distances at Initial step + + # Reading all_distances at Initial step if aStep == 0: - index = 
np.linspace(0,n_samples-1,n_samples).astype(int).reshape(n_samples,) + index = np.linspace(0, self.n_samples - 1, self.n_samples).astype(int).reshape(self.n_samples, ) accept = 0 all_distances = new_all_distances - #print(index[acceptance == 1]) - # Initialize/Update the accepted parameters and their corresponding distances - accepted_parameters[index[acceptance==1],:] = new_parameters[acceptance==1,:] - distances[index[acceptance==1]] = new_distances[acceptance==1] - - + # print(index[acceptance == 1]) + # Initialize/Update the accepted parameters and their corresponding distances + accepted_parameters[index[acceptance == 1], :] = new_parameters[acceptance == 1, :] + distances[index[acceptance == 1]] = new_distances[acceptance == 1] + # 2: Smoothing of the distances - smooth_distances[index[acceptance==1]] = self._smoother_distance(distances[index[acceptance==1]],all_distances) - + smooth_distances[index[acceptance == 1]] = self._smoother_distance(distances[index[acceptance == 1]], + all_distances) + # 3: Initialize/Update U, epsilon and covariance of perturbation kernel if aStep == 0: - U = self._avergae_redefined_distance(self._smoother_distance(all_distances,all_distances), epsilon) - else: + U = self._average_redefined_distance(self._smoother_distance(all_distances, all_distances), epsilon) + else: U = np.mean(smooth_distances) - epsilon = self._schedule(U,v) + epsilon = self._schedule(U, v) if accepted_parameters.shape[1] > 1: - accepted_cov_mat = beta*np.cov(np.transpose(accepted_parameters)) + \ - 0.0001*np.trace(np.cov(np.transpose(accepted_parameters)))*np.eye(accepted_parameters.shape[1]) + accepted_cov_mat = beta * np.cov(np.transpose(accepted_parameters)) + \ + 0.0001 * np.trace(np.cov(np.transpose(accepted_parameters))) * np.eye( + accepted_parameters.shape[1]) else: - accepted_cov_mat = beta*np.var(np.transpose(accepted_parameters)) + \ - 0.0001*(np.var(np.transpose(accepted_parameters)))*np.eye(accepted_parameters.shape[1]) - - ## 4: Show progress and if acceptance rate smaller than a value break the iteration - + accepted_cov_mat = beta * np.var(np.transpose(accepted_parameters)) + \ + 0.0001 * (np.var(np.transpose(accepted_parameters))) * np.eye( + accepted_parameters.shape[1]) + + ## 4: Show progress and if acceptance rate smaller than a value break the iteration + # print("INFO: Saving intermediate configuration to output journal.") if full_output == 1: journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) - + if aStep > 0: accept = accept + np.sum(acceptance) samples_until = samples_until + sample_array[aStep] - acceptance_rate = accept/samples_until - print('updates: ',np.sum(sample_array[1:aStep+1])/np.sum(sample_array[1:])*100,' epsilon: ' ,epsilon,\ + acceptance_rate = accept / samples_until + print( + 'updates: ', np.sum(sample_array[1:aStep + 1]) / np.sum(sample_array[1:]) * 100, ' epsilon: ', epsilon, \ 'u.mean: ', U, 'acceptance rate: ', acceptance_rate) if acceptance_rate < ar_cutoff: break - + # 5: Resampling if number of accepted particles greater than resample if accept >= resample and U > 1e-100: ## Weighted resampling: - weight = np.exp(-smooth_distances*delta/U) - weight = weight/sum(weight) - index_resampled = self.rng.choice(np.arange(n_samples), n_samples, replace = 1, p = weight) - accepted_parameters = accepted_parameters[index_resampled,:] + weight = np.exp(-smooth_distances * delta / U) + weight = weight / sum(weight) + index_resampled = self.rng.choice(np.arange(self.n_samples), self.n_samples, replace=1, p=weight) + 
accepted_parameters = accepted_parameters[index_resampled, :] smooth_distances = smooth_distances[index_resampled] - + ## Update U and epsilon: - epsilon = epsilon*(1-delta) + epsilon = epsilon * (1 - delta) U = np.mean(smooth_distances) - epsilon = self._schedule(U,v) - + epsilon = self._schedule(U, v) + ## Print effective sampling size - print('Resampling: Effective sampling size: ', 1/sum(pow(weight/sum(weight),2))) - accept = 0 + print('Resampling: Effective sampling size: ', 1 / sum(pow(weight / sum(weight), 2))) + accept = 0 samples_until = 0 - - #Add epsilon_arr, number of final steps and final output to the journal + + # Add epsilon_arr, number of final steps and final output to the journal # print("INFO: Saving final configuration to output journal.") if full_output == 0: journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) - journal.configuration["steps"] = aStep + 1 + journal.configuration["steps"] = aStep + 1 journal.configuration["epsilon"] = epsilon return journal - - + def _smoother_distance(self, distance, old_distance): """Smooths the distance using the Equation 14 of [1]. - - [1] C. Albert, H. R. Kuensch and A. Scheidegger. A Simulated Annealing Approach to - Approximate Bayes Computations. Statistics and Computing 0960-3174 (2014). + + [1] C. Albert, H. R. Kuensch and A. Scheidegger. A Simulated Annealing Approach to + Approximate Bayes Computations. Statistics and Computing 0960-3174 (2014). """ - - smoothed_distance = np.zeros(shape=(len(distance),)) - - for ind in range(0,len(distance)): + + smoothed_distance = np.zeros(shape=(len(distance),)) + + for ind in range(0, len(distance)): if distance[ind] < np.min(old_distance): - smoothed_distance[ind] = (distance[ind]/np.min(old_distance))/len(old_distance) + smoothed_distance[ind] = (distance[ind] / np.min(old_distance)) / len(old_distance) else: - smoothed_distance[ind] = np.mean(np.array(old_distance) 1: - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] else: new_theta = self.kernel.sample(1) theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: break y_sim = self.model.simulate(self.n_samples_per_param) - distance = self.distance.distance(self.observations_bds.value(), y_sim) - smooth_distance = self._smoother_distance_remote([distance],self.all_distances_bds.value()) - + distance = self.distance.distance(self.observations_bds.value(), y_sim) + smooth_distance = self._smoother_distance_remote([distance], self.all_distances_bds.value()) + ## Calculate acceptance probability: - ratio_prior_prob = self.model.prior.pdf(new_theta)/self.model.prior.pdf(self.accepted_parameters_bds.value()[index,:]) + ratio_prior_prob = self.model.prior.pdf(new_theta) / self.model.prior.pdf( + self.accepted_parameters_bds.value()[index, :]) ratio_likelihood_prob = np.exp((self.smooth_distances_bds.value()[index] - smooth_distance) / self.epsilon) - acceptance_prob = ratio_prior_prob*ratio_likelihood_prob - + acceptance_prob = ratio_prior_prob * ratio_likelihood_prob + ## If accepted if rng.rand(1) < acceptance_prob: acceptance = 1 else: distance = np.inf - + return (new_theta, distance, all_parameters, all_distances, index, acceptance) + class ABCsubsim: """This base class implements Approximate Bayesian Computation by subset simulation (ABCsubsim) algorithm of [1]. - + [1] M. Chiachio, J. L. Beck, J. Chiachio, and G. Rus., Approximate Bayesian computation by subset simulation. SIAM J. Sci. 
Comput., 36(3):A1339–A1358, 2014/10/03 2014.
-
+
    Parameters
    ----------
    model : abcpy.models.Model
        Model object that conforms to the Model class
    distance : abcpy.distances.Distance
        Distance object that conforms to the Distance class
    kernel : abcpy.distributions.Distribution
        Distribution object defining the perturbation kernel needed for the sampling
    backend : abcpy.backends.Backend
        Backend object that conforms to the Backend class
    seed : integer, optional
        Optional initial seed for the random number generator. The default value is generated randomly.
    """

-    def __init__(self, model, distance, kernel, backend, seed=None):
+
+    def __init__(self, observations, model, distance, kernel, backend, chain_length=10, n_samples=10000, n_samples_per_param=1, seed=None):
        self.model = model
        self.distance = distance
        self.kernel = kernel
        self.backend = backend
+        self.n_samples = n_samples
+        self.chain_length = chain_length
+        self.n_samples_per_param = n_samples_per_param
        self.rng = np.random.RandomState(seed)
-
+        self.anneal_parameter = None
+
+
+        # these are usually big tables, so we broadcast them to have them once
+        # per executor instead of once per task
+        self.observations_bds = self.backend.broadcast(observations)
+        self.accepted_parameters_bds = None
+        self.accepted_cov_mat_bds = None
+
+    def __getstate__(self):
+        #tells cloudpickle to not include the backend when pickling
+        state = self.__dict__.copy()
+        del state['backend']
+        return state

-    def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, chain_length = 10, ap_change_cutoff = 10, full_output=0):
-        """Samples from the posterior distribution of the model parameter given the observed
+    def sample(self, steps, ap_change_cutoff=10,
+               full_output=0):
+        """Samples from the posterior distribution of the model parameter given the observed
        data observations.
-
+
        Parameters
        ----------
-        observations : numpy.ndarray
-            Observed data.
-        steps : integer
-            Number of iterations in the sequential algoritm ("generations")
-        n_samples : integer, optional
-            Number of samples to generate. The default value is 10000.
-        n_samples_per_param : integer, optional
-            Number of data points in each simulated data set. The default value is 1.
-        chain_length : integer, optional
-            Chain length of the MCMC. n_samples should be divisable by chain_length. The default value is 10.
+        steps : integer
+            Number of iterations in the sequential algorithm ("generations")
        ap_change_cutoff : float, optional
            The cutoff value for the percentage change in the anneal parameter. If the change is less than
            ap_change_cutoff the iterations are stopped. The default value is 10.
        full_output: integer, optional
            If full_output==1, intermediate results are included in output journal.
            The default value is 0, meaning the intermediate results are not saved.

        Returns
        -------
        abcpy.output.Journal
            A journal containing simulation results, metadata and optionally intermediate results.
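ABCsubsim produces its n_samples samples as n_samples/chain_length short MCMC chains, so each parallel task receives one seed plus the index of the chain it continues; the removed docstring above notes that n_samples should therefore be divisible by chain_length. A small sketch of that bookkeeping, mirroring the seed/index arrays built in the loop below (function name is illustrative):

import numpy as np

def chain_seed_index(n_samples, chain_length, rng):
    """One (seed, chain index) pair per short MCMC chain."""
    assert n_samples % chain_length == 0, "n_samples must be divisible by chain_length"
    n_chains = n_samples // chain_length
    seeds = rng.randint(0, np.iinfo(np.uint32).max, size=n_chains, dtype=np.uint32)
    indices = np.arange(n_chains)
    return np.column_stack((seeds, indices))

print(chain_seed_index(20, 10, np.random.RandomState(0)))  # two chains of length 10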
""" - + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) journal.configuration["type_kernel_func"] = type(self.kernel) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param - journal.configuration["chain_length"] = chain_length + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param + journal.configuration["chain_length"] = self.chain_length journal.configuration["ap_change_cutoff"] = ap_change_cutoff journal.configuration["full_output"] = full_output - + accepted_parameters = None - accepted_weights = np.ones(shape=(n_samples,1)) - accepted_cov_mat = None + accepted_weights = np.ones(shape=(self.n_samples, 1)) + accepted_cov_mat = None anneal_parameter = 0 anneal_parameter_old = 0 temp_chain_length = 1 - - # Initialize variables that need to be available remotely - rc = _RemoteContextABCsubsim(self.backend, self.model, self.distance, self.kernel, observations, n_samples, n_samples_per_param, chain_length) - - for aStep in range(0,steps): - # main ABCsubsim algorithm + + + for aStep in range(0, steps): + # main ABCsubsim algorithm # print("INFO: Initialization of ABCsubsim") - seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=int(n_samples/temp_chain_length), dtype=np.uint32) - index_arr = np.linspace(0,n_samples/temp_chain_length-1,n_samples/temp_chain_length).astype(int).reshape(int(n_samples/temp_chain_length),) - seed_index_arr = np.column_stack((seed_arr,index_arr)) + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=int(self.n_samples / temp_chain_length), + dtype=np.uint32) + index_arr = np.linspace(0, self.n_samples / temp_chain_length - 1, self.n_samples / temp_chain_length).astype( + int).reshape(int(self.n_samples / temp_chain_length), ) + seed_index_arr = np.column_stack((seed_arr, index_arr)) seed_index_pds = self.backend.parallelize(seed_index_arr) - # 0: update remotely required variables # print("INFO: Broadcasting parameters.") - rc._update_broadcasts(self.backend, accepted_parameters, accepted_cov_mat) + self._update_broadcasts(accepted_parameters, accepted_cov_mat) # 1: Calculate parameters # print("INFO: Initial accepted parameter parameters") - params_and_dists_pds = self.backend.map(rc._accept_parameter, seed_index_pds) + params_and_dists_pds = self.backend.map(self._accept_parameter, seed_index_pds) params_and_dists = self.backend.collect(params_and_dists_pds) new_parameters, new_distances = [list(t) for t in zip(*params_and_dists)] - accepted_parameters = np.concatenate(new_parameters) + accepted_parameters = np.concatenate(new_parameters) distances = np.concatenate(new_distances) - # 2: Sort and renumber samples + # 2: Sort and renumber samples SortIndex = sorted(range(len(distances)), key=lambda k: distances[k]) distances = distances[SortIndex] - accepted_parameters = accepted_parameters[SortIndex,:] - + accepted_parameters = accepted_parameters[SortIndex, :] + # 3: Calculate and broadcast annealling parameters - temp_chain_length = chain_length + temp_chain_length = self.chain_length if aStep > 0: anneal_parameter_old = anneal_parameter - anneal_parameter = 0.5*(distances[int(n_samples/temp_chain_length)]+distances[int(n_samples/temp_chain_length)+1]) - rc.anneal_parameter = anneal_parameter - + anneal_parameter = 0.5 * ( + distances[int(self.n_samples / temp_chain_length)] + distances[int(self.n_samples / 
temp_chain_length) + 1]) + self.anneal_parameter = anneal_parameter + # 4: Update proposal covariance matrix (Parallelized) if aStep == 0: accepted_cov_mat = np.cov(accepted_parameters, rowvar=False) else: - accepted_cov_mat = pow(2,1)*accepted_cov_mat - rc._update_broadcasts(self.backend, accepted_parameters, accepted_cov_mat) - + accepted_cov_mat = pow(2, 1) * accepted_cov_mat + self._update_broadcasts(accepted_parameters, accepted_cov_mat) + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=10, dtype=np.uint32) - index_arr = np.linspace(0,10-1,10).astype(int).reshape(10,) - seed_index_arr = np.column_stack((seed_arr,index_arr)) + index_arr = np.linspace(0, 10 - 1, 10).astype(int).reshape(10, ) + seed_index_arr = np.column_stack((seed_arr, index_arr)) seed_index_pds = self.backend.parallelize(seed_index_arr) - - cov_mat_index_pds = self.backend.map(rc._update_cov_mat, seed_index_pds) + + cov_mat_index_pds = self.backend.map(self._update_cov_mat, seed_index_pds) cov_mat_index = self.backend.collect(cov_mat_index_pds) cov_mat, T, accept_index = [list(t) for t in zip(*cov_mat_index)] - + for ind in range(10): if accept_index[ind] == 1: accepted_cov_mat = cov_mat[ind] break - + # print("INFO: Saving intermediate configuration to output journal.") if full_output == 1: journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) # Show progress - anneal_parameter_change_percentage = 100*abs(anneal_parameter_old-anneal_parameter)/anneal_parameter - print('Steps: ', aStep, 'annealing parameter: ', anneal_parameter, 'change (%) in annealing parameter: ', anneal_parameter_change_percentage ) + anneal_parameter_change_percentage = 100 * abs(anneal_parameter_old - anneal_parameter) / anneal_parameter + print('Steps: ', aStep, 'annealing parameter: ', anneal_parameter, 'change (%) in annealing parameter: ', + anneal_parameter_change_percentage) if anneal_parameter_change_percentage < ap_change_cutoff: break - - #Add anneal_parameter, number of final steps and final output to the journal + # Add anneal_parameter, number of final steps and final output to the journal # print("INFO: Saving final configuration to output journal.") if full_output == 0: journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) - journal.configuration["steps"] = aStep+1 + journal.configuration["steps"] = aStep + 1 journal.configuration["anneal_parameter"] = anneal_parameter return journal -class _RemoteContextABCsubsim: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, distance, kernel, observations, n_samples, n_samples_per_param, chain_length): - self.model = model - self.distance = distance - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - self.chain_length = chain_length - - self.anneal_parameter = None - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.accepted_parameters_bds = None - self.accepted_cov_mat_bds = None - - - def _update_broadcasts(self, backend, accepted_parameters, accepted_cov_mat): + def _update_broadcasts(self, accepted_parameters, accepted_cov_mat): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - + # bc.destroy + if not accepted_parameters is None: - self.accepted_parameters_bds = backend.broadcast(accepted_parameters) + self.accepted_parameters_bds = 
self.backend.broadcast(accepted_parameters) if not accepted_cov_mat is None: - self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat) - + self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) # define helper functions for map step def _accept_parameter(self, seed_index): @@ -1169,7 +1088,7 @@ def _accept_parameter(self, seed_index): Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is smaller than eplison. - + :type seed: int :rtype: np.array :return: accepted parameter @@ -1180,9 +1099,9 @@ def _accept_parameter(self, seed_index): self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - result_theta = [] - result_distance = [] - + result_theta = [] + result_distance = [] + if self.accepted_parameters_bds == None: self.model.sample_from_prior() y_sim = self.model.simulate(self.n_samples_per_param) @@ -1196,10 +1115,10 @@ def _accept_parameter(self, seed_index): distance = self.distance.distance(self.observations_bds.value(), y_sim) result_theta.append(theta) result_distance.append(distance) - for ind in range(0,self.chain_length-1): + for ind in range(0, self.chain_length - 1): while True: self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: break @@ -1207,16 +1126,17 @@ def _accept_parameter(self, seed_index): new_distance = self.distance.distance(self.observations_bds.value(), y_sim) ## Calculate acceptance probability: - ratio_prior_prob = self.model.prior.pdf(new_theta)/self.model.prior.pdf(theta) + ratio_prior_prob = self.model.prior.pdf(new_theta) / self.model.prior.pdf(theta) self.kernel.set_parameters([new_theta, self.accepted_cov_mat_bds.value()]) kernel_numerator = self.kernel.pdf(theta) - self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) + self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) kernel_denominator = self.kernel.pdf(new_theta) - ratio_likelihood_prob = kernel_numerator/kernel_denominator - acceptance_prob = min(1,ratio_prior_prob*ratio_likelihood_prob)*(new_distance=0.3: - return(accepted_cov_mat_transformed, t, 1) + if acceptance / 10 <= 0.5 and acceptance / 10 >= 0.3: + return (accepted_cov_mat_transformed, t, 1) else: - return(accepted_cov_mat_transformed, t, 0) - + return (accepted_cov_mat_transformed, t, 0) + + class RSMCABC: - """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of - Drovandi and Pettitt [1]. - + """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of + Drovandi and Pettitt [1]. + [1] CC. Drovandi CC and AN. Pettitt, Estimation of parameters for macroparasite population evolution using approximate Bayesian computation. Biometrics 67(1):225–233, 2011. - + Parameters ---------- model : abcpy.models.Model @@ -1285,27 +1207,46 @@ class RSMCABC: seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. 
""" - def __init__(self, model, distance, kernel, backend, seed=None): + + def __init__(self, observations, model, distance, kernel, backend, n_samples=10000, n_samples_per_param=1, alpha=0.1, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend + self.n_samples=n_samples + self.n_samples_per_param = n_samples_per_param + self.alpha = alpha + self.epsilon = None + self.R=None self.rng = np.random.RandomState(seed) - - def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, alpha = 0.1, epsilon_init = 100, epsilon_final = 0.1, const = 1, covFactor = 2.0, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = self.backend.broadcast(observations) + self.accepted_parameters_bds = None + self.accepted_dist_bds = None + self.accepted_cov_mat_bds = None + + def __getstate__(self): + #tells cloudpickle to not include the backend when pickling + state = self.__dict__.copy() + del state['backend'] + return state + + def sample(self, steps, epsilon_init=100, + epsilon_final=0.1, const=1, covFactor=2.0, full_output=0): + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : numpy.ndarray + observations : numpy.ndarray Observed data. - steps : integer - Number of iterations in the sequential algoritm ("generations") + steps : integer + Number of iterations in the sequential algoritm ("generations") n_samples : integer, optional Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional + n_samples_per_param : integer, optional Number of data points in each simulated data set. The default value is 1. alpha : float, optional A parameter taking values between [0,1], the default value is 0.1. @@ -1313,146 +1254,119 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 Initial value of threshold, the default is 100 epsilon_final : float, optional Terminal value of threshold, the default is 0.1 - const : float, optional + const : float, optional A constant to compute acceptance probabilty - covFactor : float, optional + covFactor : float, optional scaling parameter of the covariance matrix. The default value is 2. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. 
""" - + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param journal.configuration["steps"] = steps - + accepted_parameters = None accepted_cov_mat = None accepted_dist = None - - # Initialize variables that need to be available remotely - rc = _RemoteContextRSMCABC(self.backend, self.model, self.distance, self.kernel, observations, n_samples, n_samples_per_param, alpha) # main RSMCABC algorithm # print("INFO: Starting RSMCABC iterations.") for aStep in range(steps): - - # 0: Compute epsilon, compute new covariance matrix for Kernel, + + # 0: Compute epsilon, compute new covariance matrix for Kernel, # and finally Drawing new new/perturbed samples using prior or MCMC Kernel # print("DEBUG: Iteration " + str(aStep) + " of RSMCABC algorithm.") - if aStep == 0: - n_replenish = n_samples - # Compute epsilon + if aStep == 0: + n_replenish = self.n_samples + # Compute epsilon epsilon = [epsilon_init] R = int(1) else: - n_replenish = round(n_samples*alpha) - # Throw away N_alpha particles with largest dist - accepted_parameters = np.delete(accepted_parameters, np.arange(round(n_samples*alpha))+(n_samples-round(n_samples*alpha)), 0) - accepted_dist = np.delete(accepted_dist, np.arange(round(n_samples*alpha))+(n_samples-round(n_samples*alpha)), 0) + n_replenish = round(self.n_samples * self.alpha) + # Throw away N_alpha particles with largest dist + accepted_parameters = np.delete(accepted_parameters, np.arange(round(self.n_samples * self.alpha)) + ( + self.n_samples - round(self.n_samples * self.alpha)), 0) + accepted_dist = np.delete(accepted_dist, + np.arange(round(self.n_samples * self.alpha)) + (self.n_samples - round(self.n_samples * self.alpha)), + 0) # Compute epsilon epsilon.append(accepted_dist[-1]) # Calculate covariance # print("INFO: Calculating covariance matrix.") - new_cov_mat = covFactor * np.cov(accepted_parameters, rowvar=False) - accepted_cov_mat = new_cov_mat - - + new_cov_mat = covFactor * np.cov(accepted_parameters, rowvar=False) + accepted_cov_mat = new_cov_mat + if epsilon[-1] < epsilon_final: break - - seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size = n_replenish, dtype=np.uint32) + + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_replenish, dtype=np.uint32) seed_pds = self.backend.parallelize(seed_arr) - #update remotely required variables + # update remotely required variables # print("INFO: Broadcasting parameters.") - rc.epsilon = epsilon - rc.R = R + self.epsilon = epsilon + self.R = R # Broadcast updated variable - rc._update_broadcasts(self.backend, accepted_parameters, accepted_dist, accepted_cov_mat) + self._update_broadcasts(accepted_parameters, accepted_dist, accepted_cov_mat) - #calculate resample parameters - #print("INFO: Resampling parameters") - params_and_dist_index_pds = self.backend.map(rc._accept_parameter, seed_pds) + # calculate resample parameters + # print("INFO: Resampling parameters") + params_and_dist_index_pds = self.backend.map(self._accept_parameter, seed_pds) params_and_dist_index = self.backend.collect(params_and_dist_index_pds) new_parameters, new_dist, new_index = [list(t) for t in zip(*params_and_dist_index)] new_parameters = np.array(new_parameters) new_dist = 
np.array(new_dist) new_index = np.array(new_index) - + # 1: Update all parameters, compute acceptance probability, compute epsilon - if len(new_dist) == n_samples: + if len(new_dist) == self.n_samples: accepted_parameters = new_parameters - accepted_dist = new_dist + accepted_dist = new_dist else: - accepted_parameters = np.concatenate((accepted_parameters,new_parameters)) + accepted_parameters = np.concatenate((accepted_parameters, new_parameters)) accepted_dist = np.concatenate((accepted_dist, new_dist)) - # 2: Compute acceptance probabilty and set R - #print(aStep) - #print(new_index) - prob_acceptance = sum(new_index)/(R*n_replenish) + # 2: Compute acceptance probabilty and set R + # print(aStep) + # print(new_index) + prob_acceptance = sum(new_index) / (R * n_replenish) if prob_acceptance == 1 or prob_acceptance == 0: R = 1 else: - R = int(np.log(const)/np.log(1-prob_acceptance)) - - + R = int(np.log(const) / np.log(1 - prob_acceptance)) + # print("INFO: Saving configuration to output journal.") - if (full_output == 1 and aStep <= steps-1) or (full_output == 0 and aStep == steps-1): + if (full_output == 1 and aStep <= steps - 1) or (full_output == 0 and aStep == steps - 1): journal.add_parameters(accepted_parameters) - journal.add_weights(np.ones(shape=(n_samples,1))*(1/n_samples)) + journal.add_weights(np.ones(shape=(self.n_samples, 1)) * (1 / self.n_samples)) - #Add epsilon_arr to the journal + # Add epsilon_arr to the journal journal.configuration["epsilon_arr"] = epsilon return journal - - - -class _RemoteContextRSMCABC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, distance, kernel, observations, n_samples, n_samples_per_param, alpha): - self.model = model - self.distance = distance - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - self.alpha = alpha - - self.epsilon = None - self.R = None - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.accepted_parameters_bds = None - self.accepted_dist_bds = None - self.accepted_cov_mat_bds = None - def _update_broadcasts(self, backend, accepted_parameters, accepted_dist, accepted_cov_mat): + def _update_broadcasts(self, accepted_parameters, accepted_dist, accepted_cov_mat): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - + # bc.destroy + if not accepted_parameters is None: - self.accepted_parameters_bds = backend.broadcast(accepted_parameters) + self.accepted_parameters_bds = self.backend.broadcast(accepted_parameters) if not accepted_dist is None: - self.accepted_dist_bds = backend.broadcast(accepted_dist) + self.accepted_dist_bds = self.backend.broadcast(accepted_dist) if not accepted_cov_mat is None: - self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat) + self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) # define helper functions for map step def _accept_parameter(self, seed): @@ -1460,7 +1374,7 @@ def _accept_parameter(self, seed): Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is smaller than eplison. 
- + :type seed: int :rtype: np.array :return: accepted parameter @@ -1469,13 +1383,13 @@ def _accept_parameter(self, seed): rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - + distance = self.distance.dist_max() if self.accepted_parameters_bds == None: while distance > self.epsilon[-1]: self.model.sample_from_prior() y_sim = self.model.simulate(self.n_samples_per_param) - distance = self.distance.distance(self.observations_bds.value(), y_sim) + distance = self.distance.distance(self.observations_bds.value(), y_sim) index_accept = 1 else: index = rng.choice(len(self.accepted_parameters_bds.value()), size=1) @@ -1484,34 +1398,35 @@ def _accept_parameter(self, seed): for ind in range(self.R): while True: self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: - break + break y_sim = self.model.simulate(self.n_samples_per_param) distance = self.distance.distance(self.observations_bds.value(), y_sim) - ratio_prior_prob = self.model.prior.pdf(new_theta)/self.model.prior.pdf(theta) + ratio_prior_prob = self.model.prior.pdf(new_theta) / self.model.prior.pdf(theta) self.kernel.set_parameters([new_theta, self.accepted_cov_mat_bds.value()]) kernel_numerator = self.kernel.pdf(theta) - self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) + self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) kernel_denominator = self.kernel.pdf(new_theta) - ratio_kernel_prob = kernel_numerator/kernel_denominator - probability_acceptance = min(1,ratio_prior_prob*ratio_kernel_prob) - if distance < self.epsilon[-1] and rng.binomial(1,probability_acceptance) == 1: + ratio_kernel_prob = kernel_numerator / kernel_denominator + probability_acceptance = min(1, ratio_prior_prob * ratio_kernel_prob) + if distance < self.epsilon[-1] and rng.binomial(1, probability_acceptance) == 1: index_accept += 1 else: self.model.set_parameters(theta) distance = self.accepted_dist_bds.value()[index[0]] - + return (self.model.get_parameters(), distance, index_accept) + class APMCABC: - """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of - M. Lenormand et al. [1]. - + """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of + M. Lenormand et al. [1]. + [1] M. Lenormand, F. Jabot and G. Deffuant, Adaptive approximate Bayesian computation for complex models. Computational Statistics, 28:2777–2796, 2013. - + Parameters ---------- model : abcpy.models.Model @@ -1525,51 +1440,72 @@ class APMCABC: seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. 
""" - def __init__(self, model, distance, kernel, backend, seed=None): + + def __init__(self, observations, model, distance, kernel, backend, n_samples=10000, n_samples_per_param=1, alpha=0.9, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + + self.alpha = alpha + + self.epsilon= None self.rng = np.random.RandomState(seed) - - def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, alpha = 0.9, acceptance_cutoff = 0.2, covFactor = 2.0, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = self.backend.broadcast(observations) + self.alpha_accepted_parameters_bds = None + self.alpha_accepted_weights_bds = None + self.alpha_accepted_dist = None + self.accepted_cov_mat_bds = None + + def __getstate__(self): + #tells cloudpickle to not include the backend when pickling + state = self.__dict__.copy() + del state['backend'] + return state + + def sample(self, steps, acceptance_cutoff=0.2, + covFactor=2.0, full_output=0): + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : numpy.ndarray + observations : numpy.ndarray Observed data. - steps : integer - Number of iterations in the sequential algoritm ("generations") + steps : integer + Number of iterations in the sequential algoritm ("generations") n_samples : integer, optional Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional + n_samples_per_param : integer, optional Number of data points in each simulated data set. The default value is 1. alpha : float, optional A parameter taking values between [0,1], the default value is 0.1. acceptance_cutoff : float, optional Acceptance ratio cutoff, The default value is 0.2 - covFactor : float, optional + covFactor : float, optional scaling parameter of the covariance matrix. The default value is 2. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. 
""" - + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param journal.configuration["steps"] = steps - + accepted_parameters = None accepted_weights = None accepted_cov_mat = None @@ -1577,68 +1513,67 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 alpha_accepted_parameters = None alpha_accepted_weights = None alpha_accepted_dist = None - - # Initialize variables that need to be available remotely - rc = _RemoteContextAPMCABC(self.backend, self.model, self.distance, self.kernel, observations, n_samples, n_samples_per_param, alpha) # main APMCABC algorithm # print("INFO: Starting APMCABC iterations.") for aStep in range(steps): - + # 0: Drawing new new/perturbed samples using prior or MCMC Kernel # print("DEBUG: Iteration " + str(aStep) + " of APMCABC algorithm.") if aStep > 0: - n_additional_samples = n_samples - round(n_samples*alpha) + n_additional_samples = self.n_samples - round(self.n_samples * self.alpha) else: - n_additional_samples = n_samples - - seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size = n_additional_samples, dtype=np.uint32) + n_additional_samples = self.n_samples + + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_additional_samples, dtype=np.uint32) seed_pds = self.backend.parallelize(seed_arr) - #update remotely required variables + # update remotely required variables # print("INFO: Broadcasting parameters.") - rc._update_broadcasts(self.backend, alpha_accepted_parameters, alpha_accepted_weights, alpha_accepted_dist, accepted_cov_mat) + self._update_broadcasts(alpha_accepted_parameters, alpha_accepted_weights, alpha_accepted_dist, + accepted_cov_mat) - #calculate resample parameters - #print("INFO: Resampling parameters") - params_and_dist_weights_pds = self.backend.map(rc._accept_parameter, seed_pds) + # calculate resample parameters + # print("INFO: Resampling parameters") + params_and_dist_weights_pds = self.backend.map(self._accept_parameter, seed_pds) params_and_dist_weights = self.backend.collect(params_and_dist_weights_pds) new_parameters, new_dist, new_weights = [list(t) for t in zip(*params_and_dist_weights)] new_parameters = np.array(new_parameters) new_dist = np.array(new_dist) - new_weights = np.array(new_weights).reshape(n_additional_samples,1) - + new_weights = np.array(new_weights).reshape(n_additional_samples, 1) + # 1: Update all parameters, compute acceptance probability, compute epsilon - if len(new_weights) == n_samples: + if len(new_weights) == self.n_samples: accepted_parameters = new_parameters accepted_dist = new_dist accepted_weights = new_weights # Compute acceptance probability prob_acceptance = 1 # Compute epsilon - epsilon = [np.percentile(accepted_dist, alpha*100)] + epsilon = [np.percentile(accepted_dist, self.alpha * 100)] else: - accepted_parameters = np.concatenate((alpha_accepted_parameters,new_parameters)) + accepted_parameters = np.concatenate((alpha_accepted_parameters, new_parameters)) accepted_dist = np.concatenate((alpha_accepted_dist, new_dist)) accepted_weights = np.concatenate((alpha_accepted_weights, new_weights)) # Compute acceptance probability - prob_acceptance = sum(new_dist < epsilon[-1])/len(new_dist) + prob_acceptance = 
sum(new_dist < epsilon[-1]) / len(new_dist) # Compute epsilon - epsilon.append(np.percentile(accepted_dist, alpha*100)) - + epsilon.append(np.percentile(accepted_dist, self.alpha * 100)) + # 2: Update alpha_parameters, alpha_dist and alpha_weights index_alpha = accepted_dist < epsilon[-1] - alpha_accepted_parameters = accepted_parameters[index_alpha,:] - alpha_accepted_weights = accepted_weights[index_alpha]/sum(accepted_weights[index_alpha]) + alpha_accepted_parameters = accepted_parameters[index_alpha, :] + alpha_accepted_weights = accepted_weights[index_alpha] / sum(accepted_weights[index_alpha]) alpha_accepted_dist = accepted_dist[index_alpha] - + # 3: calculate covariance # print("INFO: Calculating covariance matrix.") - new_cov_mat = covFactor * np.cov(alpha_accepted_parameters, aweights = alpha_accepted_weights.reshape(-1), rowvar=False) - accepted_cov_mat = new_cov_mat + new_cov_mat = covFactor * np.cov(alpha_accepted_parameters, aweights=alpha_accepted_weights.reshape(-1), + rowvar=False) + accepted_cov_mat = new_cov_mat # print("INFO: Saving configuration to output journal.") - if (full_output == 1 and aStep <= steps-1) or (full_output == 0 and aStep == steps-1): + if (full_output == 1 and aStep <= steps - 1) or (full_output == 0 and aStep == steps - 1): journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) @@ -1646,50 +1581,26 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 if prob_acceptance < acceptance_cutoff: break - #Add epsilon_arr to the journal + # Add epsilon_arr to the journal journal.configuration["epsilon_arr"] = epsilon return journal - - - -class _RemoteContextAPMCABC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, distance, kernel, observations, n_samples, n_samples_per_param, alpha): - self.model = model - self.distance = distance - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - self.alpha = alpha - - self.epsilon = None - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.alpha_accepted_parameters_bds = None - self.alpha_accepted_weights_bds = None - self.alpha_accepted_dist = None - self.accepted_cov_mat_bds = None - def _update_broadcasts(self, backend, alpha_accepted_parameters, alpha_accepted_weights, alpha_accepted_dist, accepted_cov_mat): + def _update_broadcasts(self, alpha_accepted_parameters, alpha_accepted_weights, alpha_accepted_dist, + accepted_cov_mat): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - + # bc.destroy + if not alpha_accepted_parameters is None: - self.alpha_accepted_parameters_bds = backend.broadcast(alpha_accepted_parameters) + self.alpha_accepted_parameters_bds = self.backend.broadcast(alpha_accepted_parameters) if not alpha_accepted_weights is None: - self.alpha_accepted_weights_bds = backend.broadcast(alpha_accepted_weights) + self.alpha_accepted_weights_bds = self.backend.broadcast(alpha_accepted_weights) if not alpha_accepted_dist is None: - self.alpha_accepted_dist_bds = backend.broadcast(alpha_accepted_dist) + self.alpha_accepted_dist_bds = self.backend.broadcast(alpha_accepted_dist) if not accepted_cov_mat is None: - self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat) + self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) # define helper functions for 
map step def _accept_parameter(self, seed): @@ -1697,7 +1608,7 @@ def _accept_parameter(self, seed): Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is smaller than eplison. - + :type seed: int :rtype: np.array :return: accepted parameter @@ -1706,45 +1617,47 @@ def _accept_parameter(self, seed): rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - + if self.alpha_accepted_parameters_bds == None: self.model.sample_from_prior() y_sim = self.model.simulate(self.n_samples_per_param) dist = self.distance.distance(self.observations_bds.value(), y_sim) weight = 1.0 else: - index = rng.choice(len(self.alpha_accepted_weights_bds.value()), size=1, p=self.alpha_accepted_weights_bds.value().reshape(-1)) + index = rng.choice(len(self.alpha_accepted_weights_bds.value()), size=1, + p=self.alpha_accepted_weights_bds.value().reshape(-1)) theta = self.alpha_accepted_parameters_bds.value()[index[0]] # trucate the normal to the bounds of parameter space of the model # truncating the normal like this is fine: https://arxiv.org/pdf/0907.4010v1.pdf while True: self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: break y_sim = self.model.simulate(self.n_samples_per_param) dist = self.distance.distance(self.observations_bds.value(), y_sim) - + prior_prob = self.model.prior.pdf(new_theta) denominator = 0.0 for i in range(0, len(self.alpha_accepted_weights_bds.value())): - self.kernel.set_parameters([self.alpha_accepted_parameters_bds.value()[i,:], self.accepted_cov_mat_bds.value()]) + self.kernel.set_parameters( + [self.alpha_accepted_parameters_bds.value()[i, :], self.accepted_cov_mat_bds.value()]) pdf_value = self.kernel.pdf(new_theta) - denominator += self.alpha_accepted_weights_bds.value()[i,0] * pdf_value - weight = 1.0 * prior_prob / denominator - - return (self.model.get_parameters(), dist, weight) + denominator += self.alpha_accepted_weights_bds.value()[i, 0] * pdf_value + weight = 1.0 * prior_prob / denominator + + return (self.model.get_parameters(), dist, weight) class SMCABC: - """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of - Del Moral et al. [1]. - + """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of + Del Moral et al. [1]. + [1] P. Del Moral, A. Doucet, A. Jasra, An adaptive sequential Monte Carlo method for approximate Bayesian computation. Statistics and Computing, 22(5):1009–1020, 2012. - + Parameters ---------- model : abcpy.models.Model @@ -1758,229 +1671,225 @@ class SMCABC: seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. 
""" - def __init__(self, model, distance, kernel, backend, seed=None): + + def __init__(self, observations, model, distance, kernel, backend, n_samples=10000, n_samples_per_param=1, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + + self.epsilon = None self.rng = np.random.RandomState(seed) - - def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, epsilon_final = 0.1, alpha = 0.95, covFactor = 2, resample = None, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = self.backend.broadcast(observations) + self.accepted_parameters_bds = None + self.accepted_weights_bds = None + self.accepted_cov_mat_bds = None + self.accepted_y_sim_bds = None + + def __getstate__(self): + #tells cloudpickle to not include the backend when pickling + state = self.__dict__.copy() + del state['backend'] + return state + + def sample(self, observations, steps, epsilon_final=0.1, alpha=0.95, + covFactor=2, resample=None, full_output=0): + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : numpy.ndarray + observations : numpy.ndarray Observed data. - steps : integer - Number of iterations in the sequential algoritm ("generations") + steps : integer + Number of iterations in the sequential algoritm ("generations") epsilon_final : float, optional The final threshold value of epsilon to be reached. The default value is 0.1. n_samples : integer, optional Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional + n_samples_per_param : integer, optional Number of data points in each simulated data set. The default value is 1. alpha : float, optional A parameter taking values between [0,1], determinining the rate of change of the threshold epsilon. The default value is 0.5. - covFactor : float, optional + covFactor : float, optional scaling parameter of the covariance matrix. The default value is 2. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. 
""" - + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param journal.configuration["steps"] = steps - + accepted_parameters = None accepted_weights = None accepted_cov_mat = None accepted_y_sim = None - + # Define the resmaple parameter if resample == None: - resample = n_samples*0.5 - - #Define epsilon_init + resample = self.n_samples * 0.5 + + # Define epsilon_init epsilon = [10000] - - - # Initialize variables that need to be available remotely - rc = _RemoteContextSMCABC(self.backend, self.model, self.distance, self.kernel, observations, n_samples, n_samples_per_param) # main SMC ABC algorithm # print("INFO: Starting SMCABC iterations.") for aStep in range(0, steps): - + # Break if epsilon in previous step is less than epsilon_final if epsilon[-1] == epsilon_final: break - + # 0: Compute the Epsilon if accepted_y_sim != None: # Compute epsilon for next step fun = lambda epsilon_var: self._compute_epsilon(epsilon_var, \ - epsilon, observations, accepted_y_sim, accepted_weights, n_samples, n_samples_per_param, alpha) - epsilon_new = self._bisection(fun, epsilon_final, epsilon[-1], 0.001) + epsilon, observations, accepted_y_sim, accepted_weights, + self.n_samples, self.n_samples_per_param, alpha) + epsilon_new = self._bisection(fun, epsilon_final, epsilon[-1], 0.001) if epsilon_new < epsilon_final: epsilon_new = epsilon_final epsilon.append(epsilon_new) - # 1: calculate weights for new parameters + # 1: calculate weights for new parameters # print("INFO: Calculating weights.") if accepted_y_sim != None: - new_weights = np.zeros(shape=(n_samples),) - for ind1 in range(n_samples): + new_weights = np.zeros(shape=(self.n_samples), ) + for ind1 in range(self.n_samples): numerator = 0.0 denominator = 0.0 - for ind2 in range(n_samples_per_param): + for ind2 in range(self.n_samples_per_param): numerator += (self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon[-1]) - denominator += (self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon[-2]) + denominator += ( + self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon[-2]) if denominator != 0.0: - new_weights[ind1] = accepted_weights[ind1]*(numerator/denominator) + new_weights[ind1] = accepted_weights[ind1] * (numerator / denominator) else: new_weights[ind1] = 0 new_weights = new_weights / sum(new_weights) else: - new_weights = np.ones(shape=(n_samples),)*(1.0 /n_samples) - - # 2: Resample - if accepted_y_sim != None and pow(sum(pow(new_weights,2)),-1) < resample: + new_weights = np.ones(shape=(self.n_samples), ) * (1.0 / self.n_samples) + + # 2: Resample + if accepted_y_sim != None and pow(sum(pow(new_weights, 2)), -1) < resample: print('Resampling') # Weighted resampling: - index_resampled = self.rng.choice(np.arange(n_samples), n_samples, replace = 1, p = new_weights) - accepted_parameters = accepted_parameters[index_resampled,:] - new_weights = np.ones(shape=(n_samples),)*(1.0 /n_samples) - + index_resampled = self.rng.choice(np.arange(self.n_samples), self.n_samples, replace=1, p=new_weights) + accepted_parameters = accepted_parameters[index_resampled, :] + new_weights = np.ones(shape=(self.n_samples), ) * (1.0 / self.n_samples) + # Update the 
weights - accepted_weights = new_weights.reshape(len(new_weights),1) + accepted_weights = new_weights.reshape(len(new_weights), 1) # 3: Drawing new perturbed samples using MCMC Kernel # print("DEBUG: Iteration " + str(aStep) + " of SMCABC algorithm.") - seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_samples, dtype=np.uint32) - index_arr = np.arange(n_samples) - seed_index_arr = np.column_stack((seed_arr,index_arr)) + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=self.n_samples, dtype=np.uint32) + index_arr = np.arange(self.n_samples) + seed_index_arr = np.column_stack((seed_arr, index_arr)) seed_index_pds = self.backend.parallelize(seed_index_arr) - #update remotely required variables + # update remotely required variables # print("INFO: Broadcasting parameters.") - rc.epsilon = epsilon - rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat, accepted_y_sim) + self.epsilon = epsilon + self._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat, accepted_y_sim) - #calculate resample parameters - #print("INFO: Resampling parameters") - params_and_ysim_pds = self.backend.map(rc._accept_parameter, seed_index_pds) + # calculate resample parameters + # print("INFO: Resampling parameters") + params_and_ysim_pds = self.backend.map(self._accept_parameter, seed_index_pds) params_and_ysim = self.backend.collect(params_and_ysim_pds) new_parameters, new_y_sim = [list(t) for t in zip(*params_and_ysim)] new_parameters = np.array(new_parameters) - #Update the parameters + # Update the parameters accepted_parameters = new_parameters accepted_y_sim = new_y_sim - + # 4: calculate covariance # print("INFO: Calculating covariance matrix.") - new_cov_mat = covFactor * np.cov(accepted_parameters, aweights = accepted_weights.reshape(-1), rowvar=False) - accepted_cov_mat = new_cov_mat + new_cov_mat = covFactor * np.cov(accepted_parameters, aweights=accepted_weights.reshape(-1), rowvar=False) + accepted_cov_mat = new_cov_mat # print("INFO: Saving configuration to output journal.") - if (full_output == 1 and aStep <= steps-1) or (full_output == 0 and aStep == steps-1): + if (full_output == 1 and aStep <= steps - 1) or (full_output == 0 and aStep == steps - 1): journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) - #Add epsilon_arr to the journal + # Add epsilon_arr to the journal journal.configuration["epsilon_arr"] = epsilon return journal - - def _compute_epsilon(self, epsilon_new, epsilon, observations, accepted_y_sim, accepted_weights, n_samples, n_samples_per_param, alpha): - RHS = alpha*pow(sum(pow(accepted_weights,2)),-1) - LHS = np.zeros(shape=(n_samples),) + def _compute_epsilon(self, epsilon_new, epsilon, observations, accepted_y_sim, accepted_weights, n_samples, + n_samples_per_param, alpha): + + RHS = alpha * pow(sum(pow(accepted_weights, 2)), -1) + LHS = np.zeros(shape=(n_samples), ) for ind1 in range(n_samples): numerator = 0.0 denominator = 0.0 for ind2 in range(n_samples_per_param): numerator += (self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon_new) denominator += (self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon[-1]) - LHS[ind1] = accepted_weights[ind1]*(numerator/denominator) + LHS[ind1] = accepted_weights[ind1] * (numerator / denominator) if sum(LHS) == 0: result = RHS else: - LHS = LHS/sum(LHS) - LHS = pow(sum(pow(LHS,2)),-1) - result = RHS-LHS - return(result) - + LHS = LHS / sum(LHS) + LHS = pow(sum(pow(LHS, 2)), -1) + 
result = RHS - LHS + return (result) + def _bisection(self, func, low, high, tol): - midpoint = (low+high)/2.0 - while (high-low)/2.0 > tol: + midpoint = (low + high) / 2.0 + while (high - low) / 2.0 > tol: if func(midpoint) == 0: return midpoint - elif func(low)*func(midpoint) < 0: + elif func(low) * func(midpoint) < 0: high = midpoint - else : + else: low = midpoint - midpoint = (low+high)/2.0 - - return midpoint - - -class _RemoteContextSMCABC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, distance, kernel, observations, n_samples, n_samples_per_param): - self.model = model - self.distance = distance - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - - self.epsilon = None + midpoint = (low + high) / 2.0 - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.accepted_parameters_bds = None - self.accepted_weights_bds = None - self.accepted_cov_mat_bds = None - self.accepted_y_sim_bds = None + return midpoint - def _update_broadcasts(self, backend, accepted_parameters, accepted_weights, accepted_cov_mat, accepted_y_sim): + def _update_broadcasts(self, accepted_parameters, accepted_weights, accepted_cov_mat, accepted_y_sim): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - + # bc.destroy + if not accepted_parameters is None: - self.accepted_parameters_bds = backend.broadcast(accepted_parameters) + self.accepted_parameters_bds = self.backend.broadcast(accepted_parameters) if not accepted_weights is None: - self.accepted_weights_bds = backend.broadcast(accepted_weights) + self.accepted_weights_bds = self.backend.broadcast(accepted_weights) if not accepted_cov_mat is None: - self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat) + self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) if not accepted_y_sim is None: - self.accepted_y_sim_bds = backend.broadcast(accepted_y_sim) + self.accepted_y_sim_bds = self.backend.broadcast(accepted_y_sim) + + # define helper functions for map step - # define helper functions for map step def _accept_parameter(self, seed_index): """ Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is smaller than eplison. 
- + :type seed: int :rtype: np.array :return: accepted parameter @@ -1990,8 +1899,8 @@ def _accept_parameter(self, seed_index): rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - - #print("on seed " + str(seed) + " distance: " + str(distance) + " epsilon: " + str(self.epsilon)) + + # print("on seed " + str(seed) + " distance: " + str(distance) + " epsilon: " + str(self.epsilon)) if self.accepted_parameters_bds == None: self.model.sample_from_prior() y_sim = self.model.simulate(self.n_samples_per_param) @@ -2000,7 +1909,7 @@ def _accept_parameter(self, seed_index): theta = self.accepted_parameters_bds.value()[index] while True: self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: break @@ -2010,17 +1919,19 @@ def _accept_parameter(self, seed_index): numerator = 0.0 denominator = 0.0 for ind in range(self.n_samples_per_param): - numerator += (self.distance.distance(self.observations_bds.value(), [y_sim[ind]]) < self.epsilon[-1]) - denominator += (self.distance.distance(self.observations_bds.value(), [y_sim_old[ind]]) < self.epsilon[-1]) - ratio_data_epsilon = numerator/denominator - ratio_prior_prob = self.model.prior.pdf(new_theta)/self.model.prior.pdf(theta) + numerator += ( + self.distance.distance(self.observations_bds.value(), [y_sim[ind]]) < self.epsilon[-1]) + denominator += ( + self.distance.distance(self.observations_bds.value(), [y_sim_old[ind]]) < self.epsilon[-1]) + ratio_data_epsilon = numerator / denominator + ratio_prior_prob = self.model.prior.pdf(new_theta) / self.model.prior.pdf(theta) self.kernel.set_parameters([new_theta, self.accepted_cov_mat_bds.value()]) kernel_numerator = self.kernel.pdf(theta) - self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) + self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) kernel_denominator = self.kernel.pdf(new_theta) - ratio_likelihood_prob = kernel_numerator/kernel_denominator - acceptance_prob = min(1,ratio_data_epsilon*ratio_prior_prob*ratio_likelihood_prob) - if rng.binomial(1,acceptance_prob) == 1: + ratio_likelihood_prob = kernel_numerator / kernel_denominator + acceptance_prob = min(1, ratio_data_epsilon * ratio_prior_prob * ratio_likelihood_prob) + if rng.binomial(1, acceptance_prob) == 1: self.model.set_parameters(new_theta) else: self.model.set_parameters(theta) @@ -2028,5 +1939,7 @@ def _accept_parameter(self, seed_index): else: self.model.set_parameters(self.accepted_parameters_bds.value()[index]) y_sim = self.accepted_y_sim_bds.value()[index] - - return (self.model.get_parameters(), y_sim) \ No newline at end of file + + return (self.model.get_parameters(), y_sim) + + diff --git a/tests/inferences_tests.py b/tests/inferences_tests.py index 4c3d7047..7f78f8af 100644 --- a/tests/inferences_tests.py +++ b/tests/inferences_tests.py @@ -13,7 +13,7 @@ from abcpy.statistics import Identity -from abcpy.inferences import RejectionABC, PMC, PMCABC, _RemoteContextPMCABC, SABC, ABCsubsim, SMCABC, APMCABC, RSMCABC +from abcpy.inferences import RejectionABC, PMC, PMCABC, SABC, ABCsubsim, SMCABC, APMCABC, RSMCABC class RejectionABCTest(unittest.TestCase): def test_sample(self): @@ -38,8 +38,8 @@ def test_sample(self): y_obs = 
model.simulate(1)

         # use the rejection sampling scheme
-        sampler = RejectionABC(model, dist_calc, dummy, seed = 1)
-        journal = sampler.sample(y_obs, 10, 1, 0.1)
+        sampler = RejectionABC(y_obs, model, dist_calc, dummy, epsilon=0.1, n_samples=10, n_samples_per_param=1, seed = 1)
+        journal = sampler.sample()
         samples = journal.get_parameters()

         # test shape of samples
@@ -80,8 +80,8 @@ def test_sample(self):
         kernel = MultiNormal(mean, cov, seed=1)

         T, n_sample, n_samples_per_param = 1, 10, 100
-        sampler = PMC(model, likfun, kernel, backend, seed = 1)
-        journal = sampler.sample(y_obs, T, n_sample, n_samples_per_param, covFactor = np.array([.1,.1]), iniPoints = None)
+        sampler = PMC(y_obs, model, likfun, kernel, backend, n_sample, n_samples_per_param, seed = 1)
+        journal = sampler.sample(T, covFactor = np.array([.1,.1]), iniPoints = None)
         samples = (journal.get_parameters(), journal.get_weights())

         # Compute posterior mean
@@ -99,8 +99,8 @@ def test_sample(self):
         # use the PMC scheme for T = 2
         T, n_sample, n_samples_per_param = 2, 10, 100
-        sampler = PMC(model, likfun, kernel, backend, seed = 1)
-        journal = sampler.sample(y_obs, T, n_sample, n_samples_per_param, covFactor = np.array([.1,.1]), iniPoints = None)
+        sampler = PMC(y_obs, model, likfun, kernel, backend, n_sample, n_samples_per_param, seed=1)
+        journal = sampler.sample(T, covFactor=np.array([.1, .1]), iniPoints=None)
         samples = (journal.get_parameters(), journal.get_weights())

         # Compute posterior mean
@@ -115,8 +115,6 @@ def test_sample(self):
         self.assertLess(abs(mu_post_mean - (-1.4033145848) ), 1e-10)
         self.assertLess(abs(sigma_post_mean - 7.05175546876), 1e-10)
-
-
 class PMCABCTests(unittest.TestCase):
     def setUp(self):
@@ -147,18 +145,18 @@ def setUp(self):

     def test_calculate_weight(self):
         n_samples = 2
-        rc = _RemoteContextPMCABC(self.backend, self.model, self.dist_calc, self.kernel, self.observation, n_samples, 1)
+        sampler = PMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_samples, 1)
         theta = np.array([1.0])

-        weight = rc._calculate_weight(theta)
+        weight = sampler._calculate_weight(theta)
         self.assertEqual(weight, 0.5)

         accepted_parameters = np.array([[1.0], [1.0 + np.sqrt(2)]])
         accepted_weights = np.array([[.5], [.5]])
         accepted_cov_mat = np.array([[1.0]])
-        rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat)
-        weight = rc._calculate_weight(theta)
+        sampler._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat)
+        weight = sampler._calculate_weight(theta)
         expected_weight = (2.0 * np.sqrt(2.0 * np.pi)) /(( 1 + np.exp(-1))*100)
         self.assertEqual(weight, expected_weight)

@@ -167,8 +165,8 @@ def test_calculate_weight(self):
     def test_sample(self):
         # use the PMCABC scheme for T = 1
         T, n_sample, n_simulate, eps_arr, eps_percentile = 1, 10, 1, [.1], 10
-        sampler = PMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1)
-        journal = sampler.sample(self.observation, T, eps_arr, n_sample, n_simulate, eps_percentile)
+        sampler = PMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, seed = 1)
+        journal = sampler.sample(T, eps_arr, eps_percentile)
         samples = (journal.get_parameters(), journal.get_weights())

         # Compute posterior mean
@@ -185,8 +183,9 @@ def test_sample(self):
         # use the PMCABC scheme for T = 2
         T, n_sample, n_simulate, eps_arr, eps_percentile = 2, 10, 1, [.1,.05], 10
-        sampler = PMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1)
-        journal = sampler.sample(self.observation, T, eps_arr, n_sample, n_simulate, eps_percentile)
+        sampler = PMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate,
+                         seed=1)
+        journal = sampler.sample(T, eps_arr, eps_percentile)
         samples = (journal.get_parameters(), journal.get_weights())

         # Compute posterior mean
@@ -231,8 +230,8 @@ def setUp(self):
     def test_sample(self):
         # use the SABC scheme for T = 1
         steps, epsilon, n_samples, n_samples_per_param = 1, .1, 10, 1
-        sampler = SABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1)
-        journal = sampler.sample(self.observation, steps, epsilon, n_samples, n_samples_per_param)
+        sampler = SABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_samples, n_samples_per_param, seed = 1)
+        journal = sampler.sample(steps, epsilon)
         samples = (journal.get_parameters(), journal.get_weights())

         # Compute posterior mean
@@ -248,8 +247,9 @@ def test_sample(self):
         # use the SABC scheme for T = 2
         steps, epsilon, n_samples, n_samples_per_param = 2, .1, 10, 1
-        sampler = SABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1)
-        journal = sampler.sample(self.observation, steps, epsilon, n_samples, n_samples_per_param)
+        sampler = SABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_samples,
+                       n_samples_per_param, seed=1)
+        journal = sampler.sample(steps, epsilon)
         samples = (journal.get_parameters(), journal.get_weights())

         # Compute posterior mean
@@ -264,6 +264,8 @@ def test_sample(self):
         self.assertLess(mu_post_mean - 1.51315443746, 10e-2)
         self.assertLess(sigma_post_mean - 6.85230360302, 10e-2)
+
+
 class ABCsubsimTests(unittest.TestCase):
     def setUp(self):
         # find spark and initialize it
@@ -294,8 +296,8 @@ def test_sample(self):

         # use the ABCsubsim scheme for T = 1
         steps, n_samples, n_samples_per_param = 1, 10, 1
-        sampler = ABCsubsim(self.model, self.dist_calc, self.kernel, self.backend, seed = 1)
-        journal = sampler.sample(self.observation, steps, n_samples, n_samples_per_param)
+        sampler = ABCsubsim(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_samples, n_samples_per_param, seed = 1)
+        journal = sampler.sample(steps)
         samples = (journal.get_parameters(), journal.get_weights())

         # Compute posterior mean
@@ -311,8 +313,9 @@ def test_sample(self):
         # use the ABCsubsim scheme for T = 2
         steps, n_samples, n_samples_per_param = 2, 10, 1
-        sampler = ABCsubsim(self.model, self.dist_calc, self.kernel, self.backend, seed = 1)
-        journal = sampler.sample(self.observation, steps, n_samples, n_samples_per_param)
+        sampler = ABCsubsim(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_samples,
+                            n_samples_per_param, seed=1)
+        journal = sampler.sample(steps)
         samples = (journal.get_parameters(), journal.get_weights())

         # Compute posterior mean
@@ -327,6 +330,8 @@ def test_sample(self):
         self.assertLess(mu_post_mean - (-2.98633946126), 10e-2)
         self.assertLess(sigma_post_mean - 6.40146881524, 10e-2)
+
+
 class SMCABCTests(unittest.TestCase):
     def setUp(self):
         # find spark and initialize it
@@ -355,7 +360,7 @@ def setUp(self):
     def test_sample(self):
         # use the SMCABC scheme for T = 1
         steps, n_sample, n_simulate = 1, 10, 1
-        sampler = SMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1)
-        journal = sampler.sample(self.observation, steps, n_sample, n_simulate)
+        sampler = SMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, seed = 1)
+        journal = sampler.sample(self.observation, steps)
         samples = (journal.get_parameters(), journal.get_weights())
(journal.get_parameters(), journal.get_weights()) @@ -373,8 +378,9 @@ def test_sample(self): # use the SMCABC scheme for T = 2 T, n_sample, n_simulate = 2, 10, 1 - sampler = SMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) - journal = sampler.sample(self.observation, T, n_sample, n_simulate) + sampler = SMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, + seed=1) + journal = sampler.sample(self.observation, steps, n_sample, n_simulate) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -387,7 +393,9 @@ def test_sample(self): self.assertEqual(sigma_sample_shape, (10,)) self.assertEqual(weights_sample_shape, (10,)) self.assertLess(mu_post_mean - (-1.12595029091), 10e-2) - self.assertLess(sigma_post_mean - 4.62512055437, 10e-2) + self.assertLess(sigma_post_mean - 4.62512055437, 10e-2) + + class APMCABCTests(unittest.TestCase): def setUp(self): @@ -417,8 +425,8 @@ def setUp(self): def test_sample(self): # use the APMCABC scheme for T = 1 steps, n_sample, n_simulate = 1, 10, 1 - sampler = APMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) - journal = sampler.sample(self.observation, steps, n_sample, n_simulate) + sampler = APMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, seed = 1) + journal = sampler.sample(steps) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -435,8 +443,9 @@ def test_sample(self): # use the APMCABC scheme for T = 2 T, n_sample, n_simulate = 2, 10, 1 - sampler = APMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) - journal = sampler.sample(self.observation, T, n_sample, n_simulate) + ampler = APMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, + seed=1) + journal = sampler.sample(steps) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -451,6 +460,8 @@ def test_sample(self): self.assertLess(mu_post_mean - (2.19137364411), 10e-2) self.assertLess(sigma_post_mean - 5.66226403628, 10e-2) + + class RSMCABCTests(unittest.TestCase): def setUp(self): # find spark and initialize it @@ -479,8 +490,8 @@ def setUp(self): def test_sample(self): # use the RSMCABC scheme for T = 1 steps, n_sample, n_simulate = 1, 10, 1 - sampler = RSMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) - journal = sampler.sample(self.observation, steps, n_sample, n_simulate) + sampler = RSMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, seed = 1) + journal = sampler.sample(steps) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -497,8 +508,9 @@ def test_sample(self): # use the RSMCABC scheme for T = 2 steps, n_sample, n_simulate = 2, 10, 1 - sampler = RSMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) - journal = sampler.sample(self.observation, steps, n_sample, n_simulate) + sampler = RSMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, + seed=1) + journal = sampler.sample(steps) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -513,5 +525,6 @@ def test_sample(self): self.assertLess(mu_post_mean - (-0.349310337252), 10e-2) self.assertLess(sigma_post_mean - 6.30221177368, 10e-2) + if __name__ == '__main__': unittest.main() diff --git a/tests/pickle_tests.py 
b/tests/pickle_tests.py
index 8e64a1ad..8c777758 100644
--- a/tests/pickle_tests.py
+++ b/tests/pickle_tests.py
@@ -2,18 +2,32 @@
 import cloudpickle
 import numpy as np
 
+'''We use pickle in our MPI backend to send a method from the master to the workers. The object with which this method is associated contains the backend as an attribute, while the backend itself contains the data on which the workers should work. Pickling the method results in pickling the backend, which results in the whole data set being pickled and sent, which is undesirable.
+
+In pickle, a "__getstate__" method can be specified. When an object containing a "__getstate__" method is pickled, only the attributes specified within that method are pickled.
+
+This test checks whether everything is working correctly with cloudpickle.
+'''
+
 class ToBePickled:
     def __init__(self):
         self.included = 5
         self.notIncluded = np.zeros(10**5)
 
     def __getstate__(self):
+        """Method that tells cloudpickle which attributes should be pickled
+        Returns
+        -------
+        state
+            all the attributes that should be pickled
+        """
         state = self.__dict__.copy()
         del state['notIncluded']
         return state
 
 class PickleTests(unittest.TestCase):
     def test_exclusion(self,A,string):
+        """Tests that, after pickling and unpickling the object, the attribute which should not be included no longer exists"""
         pickled_object = cloudpickle.dumps(A())
         unpickled_object = cloudpickle.loads(pickled_object)
         assert(not(hasattr(pickled_object,string)))

From 662aa7f6ddf12b580954c1859d50eefda3dc2076 Mon Sep 17 00:00:00 2001
From: Nicole
Date: Wed, 20 Sep 2017 18:00:16 +0200
Subject: [PATCH 07/14] removed inferences_getstate.py

---
 abcpy/inferences_getstate.py | 55 ------------------------------------
 1 file changed, 55 deletions(-)
 delete mode 100644 abcpy/inferences_getstate.py

diff --git a/abcpy/inferences_getstate.py b/abcpy/inferences_getstate.py
deleted file mode 100644
index 9074e9b3..00000000
--- a/abcpy/inferences_getstate.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import numpy as np
-from abcpy.output import Journal
-from scipy import optimize
-
-class RejectionABC:
-    def __init__(self, observations, model, distance, backend, epsilon, n_samples, n_samples_per_param, seed=None):
-        self.model = model
-        self.dist_calc = distance
-        self.backend = backend
-        self.rng = np.random.RandomState(seed)
-        self.epsilon = epsilon
-        self.observations_bds = backend.broadcast(observations)
-        self.n_samples = n_samples
-        self.n_samples_per_param = n_samples_per_param
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        del state['backend']
-        return state
-
-    def sample(self, observations, n_samples, n_samples_per_param, epsilon, full_output=0):
-        journal = Journal(full_output)
-        journal.configuration["n_samples"] = n_samples
-        journal.configuration["n_samples_per_param"] = n_samples_per_param
-        journal.configuration["epsilon"] = epsilon
-
-        accepted_parameters = None
-
-        # Initialize variables that need to be available remotely
-
-        # main Rejection ABC algorithm
-        seed_arr = self.rng.randint(1, n_samples * n_samples, size=n_samples, dtype=np.int32)
-        seed_pds = self.backend.parallelize(seed_arr)
-
-        accepted_parameters_pds = self.backend.map(self._sample_parameter, seed_pds)
-        accepted_parameters = self.backend.collect(accepted_parameters_pds)
-        accepted_parameters = np.array(accepted_parameters)
-
-        journal.add_parameters(accepted_parameters)
-        journal.add_weights(np.ones((n_samples, 1)))
-
-        return journal
-
-    def _sample_parameter(self, seed):
-        distance = self.dist_calc.dist_max()
-        self.model.prior.reseed(seed)
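The pattern exercised by the pickle test above, shown in isolation. This is a minimal sketch; the class and attribute names here are illustrative only, not part of the patch:

    import cloudpickle
    import numpy as np

    class Holder:
        def __init__(self):
            self.small = 5                    # cheap state that should travel
            self.huge = np.zeros(10 ** 5)     # expensive state we refuse to serialize

        def __getstate__(self):
            # called by (cloud)pickle; only the returned attributes are serialized
            state = self.__dict__.copy()
            del state['huge']
            return state

    restored = cloudpickle.loads(cloudpickle.dumps(Holder()))
    assert restored.small == 5
    assert not hasattr(restored, 'huge')      # the excluded attribute is gone
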
- - while distance > self.epsilon: - # Accept new parameter value if the distance is less than epsilon - self.model.sample_from_prior() - y_sim = self.model.simulate(self.n_samples_per_param) - distance = self.dist_calc.distance(self.observations_bds.value(), y_sim) - - return self.model.get_parameters() - From 1f4ec3a1716cdf3beded0566146f927fccaf340d Mon Sep 17 00:00:00 2001 From: Nicole Date: Thu, 21 Sep 2017 13:51:21 +0200 Subject: [PATCH 08/14] reestablished backwards compatibility and fixed tests --- abcpy/inferences.py | 253 +++++++++++++++++++------------------- tests/inferences_tests.py | 84 ++++++------- 2 files changed, 159 insertions(+), 178 deletions(-) diff --git a/abcpy/inferences.py b/abcpy/inferences.py index e413218d..15cd7181 100644 --- a/abcpy/inferences.py +++ b/abcpy/inferences.py @@ -3,14 +3,27 @@ from scipy import optimize class RejectionABC: - def __init__(self, observations, model, distance, backend, epsilon, n_samples, n_samples_per_param, seed=None): + """This base class implements the rejection algorithm based inference scheme [1] for + Approximate Bayesian Computation. + + [1] Tavaré, S., Balding, D., Griffith, R., Donnelly, P.: Inferring coalescence + times from DNA sequence data. Genetics 145(2), 505–518 (1997). + + Parameters + ---------- + model: abcpy.models.Model + Model object that conforms to the Model class. + distance: abcpy.distances.Distance + Distance object that conforms to the Distance class. + backend: abcpy.backends.Backend + Backend object that conforms to the Backend class. + seed: integer, optional + Optional initial seed for the random number generator. The default value is generated randomly. + """ + def __init__(self, model, distance, backend, seed=None): self.model = model self.dist_calc = distance self.backend = backend - self.epsilon = epsilon - self.observations_bds = self.backend.broadcast(observations) - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param self.rng = np.random.RandomState(seed) def __getstate__(self): @@ -19,7 +32,12 @@ def __getstate__(self): del state['backend'] return state - def sample(self, full_output=0): + def sample(self, observations, n_samples, n_samples_per_param, epsilon, full_output=0): + + self.observations_bds = self.backend.broadcast(observations) + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + self.epsilon = epsilon journal = Journal(full_output) journal.configuration["n_samples"] = self.n_samples journal.configuration["n_samples_per_param"] = self.n_samples_per_param @@ -30,7 +48,7 @@ def sample(self, full_output=0): # Initialize variables that need to be available remotely # main Rejection ABC algorithm - seed_arr = self.rng.randint(1, self.n_samples * self.n_samples, size=self.n_samples, dtype=np.int32) + seed_arr = self.rng.randint(1, n_samples * n_samples, size=n_samples, dtype=np.int32) seed_pds = self.backend.parallelize(seed_arr) accepted_parameters_pds = self.backend.map(self._sample_parameter, seed_pds) @@ -38,7 +56,7 @@ def sample(self, full_output=0): accepted_parameters = np.array(accepted_parameters) journal.add_parameters(accepted_parameters) - journal.add_weights(np.ones((self.n_samples, 1))) + journal.add_weights(np.ones((n_samples, 1))) return journal @@ -80,42 +98,19 @@ class PMCABC: Optional initial seed for the random number generator. The default value is generated randomly. 
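For orientation, this is how the restored RejectionABC interface is exercised. A minimal sketch mirroring the test suite further below; `model`, `dist_calc`, `backend` and `y_obs` are assumed to be prepared as in tests/inferences_tests.py:

    from abcpy.inferences import RejectionABC

    sampler = RejectionABC(model, dist_calc, backend, seed=1)
    # observations and tuning parameters go to sample(), not the constructor
    journal = sampler.sample(y_obs, 10, 1, 0.1)   # n_samples=10, n_samples_per_param=1, epsilon=0.1
    samples = journal.get_parameters()
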
""" - def __init__(self, observations, model, distance, kernel, backend, n_samples=10000, n_samples_per_param=1, seed=None): - """Constructor of PMCABC inference schemes. - - Parameters - ---------- - observations : numpy.ndarray - Observed data. - model : abcpy.models.Model - Model object that conforms to the Model class - distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. - kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling - backend : abcpy.backends.Backend - Backend object that conforms to the Backend class - n_samples : integer, optional - Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional - Number of data points in each simulated data set. The default value is 1. - seed : integer, optional - Optional initial seed for the random number generator. The default value is generated randomly. - - """ + def __init__(self, model, distance, kernel, backend, seed=None): + self.model = model self.distance = distance self.kernel = kernel self.backend = backend - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param self.rng = np.random.RandomState(seed) - self.epsilon = None + #added this here, so that the calculate_weight function can be called without calling sample first + self.n_samples = 2 # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task - self.observations_bds = self.backend.broadcast(observations) self.accepted_parameters_bds = None self.accepted_weights_bds = None self.accepted_cov_mat_bds = None @@ -126,8 +121,7 @@ def __getstate__(self): del state['backend'] return state - def sample(self, steps, epsilon_init, epsilon_percentile=0, - covFactor=2, full_output=0): + def sample(self, observations, steps, epsilon_init, n_samples = 10000, n_samples_per_param = 1, epsilon_percentile = 0, covFactor = 2, full_output=0): """Samples from the posterior distribution of the model parameter given the observed data observations. @@ -141,7 +135,10 @@ def sample(self, steps, epsilon_init, epsilon_percentile=0, An array of proposed values of epsilon to be used at each steps. Can be supplied A single value to be used as the threshold in Step 1 or a `steps`-dimensional array of values to be used as the threshold in evry steps. - + n_samples : integer, optional + Number of samples to generate. The default value is 10000. + n_samples_per_param : integer, optional + Number of data points in each simulated data set. The default value is 1. epsilon_percentile : float, optional A value between [0, 100]. The default value is 0, meaning the threshold value provided by the user being used. covFactor : float, optional @@ -156,6 +153,10 @@ def sample(self, steps, epsilon_init, epsilon_percentile=0, A journal containing simulation results, metadata and optionally intermediate results. 
""" + self.observations_bds = self.backend.broadcast(observations) + self.n_samples = n_samples + self.n_samples_per_param=n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) @@ -181,7 +182,7 @@ def sample(self, steps, epsilon_init, epsilon_percentile=0, # print("INFO: Starting PMCABC iterations.") for aStep in range(0, steps): # print("DEBUG: Iteration " + str(aStep) + " of PMCABC algorithm.") - seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=self.n_samples, dtype=np.uint32) + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_samples, dtype=np.uint32) seed_pds = self.backend.parallelize(seed_arr) # 0: update remotely required variables @@ -333,13 +334,11 @@ class PMC: """ - def __init__(self, observations, model, likfun, kernel, backend, n_samples=10000, n_samples_per_param=100, seed=None): + def __init__(self, model, likfun, kernel, backend, seed=None): """Constructor of PMC inference schemes. Parameters ---------- - observations : numpy.ndarray - Observed data. model : abcpy.models.Model Model object that conforms to the Model class likfun : abcpy.approx_lhd.Approx_likelihood @@ -348,10 +347,6 @@ def __init__(self, observations, model, likfun, kernel, backend, n_samples=10000 Distribution object defining the perturbation kernel needed for the sampling backend : abcpy.backends.Backend Backend object that conforms to the Backend class - n_samples : integer, optional - Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional - Number of data points in each simulated data set. The default value is 1. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. @@ -360,13 +355,10 @@ def __init__(self, observations, model, likfun, kernel, backend, n_samples=10000 self.likfun = likfun self.kernel = kernel self.backend = backend - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param self.rng = np.random.RandomState(seed) # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task - self.observations_bds = self.backend.broadcast(observations) self.accepted_parameters_bds = None self.accepted_weights_bds = None self.accepted_cov_mat_bds = None @@ -377,15 +369,20 @@ def __getstate__(self): del state['backend'] return state - def sample(self, steps, covFactor=None, iniPoints=None, - full_output=0): + def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 100, covFactor = None, iniPoints = None, full_output=0): """Samples from the posterior distribution of the model parameter given the observed data observations. Parameters ---------- + observations : python list + Observed data steps : integer number of iterations in the sequential algoritm ("generations") + n_samples : integer, optional + number of samples to generate. The default value is 10000. + n_samples_per_param : integer, optional + number of data points in each simulated data set. The default value is 100. covFactor : float, optional scaling parameter of the covariance matrix. The default is a p dimensional array of 1 when p is the dimension of the parameter. 
inipoints : numpy.ndarray, optional @@ -404,6 +401,9 @@ def sample(self, steps, covFactor=None, iniPoints=None, """ + self.observations_bds = self.backend.broadcast(observations) + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_lhd_func"] = type(self.likfun) @@ -422,11 +422,11 @@ def sample(self, steps, covFactor=None, iniPoints=None, # Initialize particles: When not supplied, randomly draw them from prior distribution # Weights of particles: Assign equal weights for each of the particles if iniPoints == None: - accepted_parameters = np.zeros(shape=(self.n_samples, dim)) - for ind in range(0, self.n_samples): + accepted_parameters = np.zeros(shape=(n_samples, dim)) + for ind in range(0, n_samples): self.model.sample_from_prior() accepted_parameters[ind, :] = self.model.get_parameters() - accepted_weights = np.ones((self.n_samples, 1), dtype=np.float) / self.n_samples + accepted_weights = np.ones((n_samples, 1), dtype=np.float) / n_samples else: accepted_parameters = iniPoints accepted_weights = np.ones((iniPoints.shape[0], 1), dtype=np.float) / iniPoints.shape[0] @@ -448,10 +448,10 @@ def sample(self, steps, covFactor=None, iniPoints=None, # 1: calculate resample parameters # print("INFO: Resample parameters.") - index = self.rng.choice(accepted_parameters.shape[0], size=self.n_samples, p=accepted_weights.reshape(-1)) + index = self.rng.choice(accepted_parameters.shape[0], size=n_samples, p=accepted_weights.reshape(-1)) # Choose a new particle using the resampled particle (make the boundary proper) # Initialize new_parameters - new_parameters = np.zeros((self.n_samples, dim), dtype=np.float) + new_parameters = np.zeros((n_samples, dim), dtype=np.float) for ind in range(0, self.n_samples): self.kernel.set_parameters([accepted_parameters[index[ind], :], accepted_cov_mat]) while True: @@ -582,19 +582,15 @@ class SABC: Optional initial seed for the random number generator. The default value is generated randomly. """ - def __init__(self, observations, model, distance, kernel, backend, n_samples=10000, n_samples_per_param=1, seed=None): + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.epsilon = None self.rng = np.random.RandomState(seed) # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task - self.observations_bds = self.backend.broadcast(observations) self.accepted_parameters_bds = None self.accepted_cov_mat_bds = None self.smooth_distances_bds = None @@ -606,8 +602,7 @@ def __getstate__(self): del state['backend'] return state - def sample(self, steps, epsilon, beta=2, delta=0.2, v=0.3, - ar_cutoff=0.5, resample=None, n_update=None, adaptcov=1, full_output=0): + def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_param = 1, beta = 2, delta = 0.2, v = 0.3, ar_cutoff = 0.5, resample = None, n_update = None, adaptcov = 1, full_output=0): """Samples from the posterior distribution of the model parameter given the observed data observations. @@ -646,6 +641,10 @@ def sample(self, steps, epsilon, beta=2, delta=0.2, v=0.3, abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. 
""" + self.observations_bds = self.backend.broadcast(observations) + self.epsilon = epsilon + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) @@ -661,19 +660,19 @@ def sample(self, steps, epsilon, beta=2, delta=0.2, v=0.3, journal.configuration["adaptcov"] = adaptcov journal.configuration["full_output"] = full_output - accepted_parameters = np.zeros(shape=(self.n_samples, len(self.model.get_parameters()))) - distances = np.zeros(shape=(self.n_samples,)) - smooth_distances = np.zeros(shape=(self.n_samples,)) - accepted_weights = np.ones(shape=(self.n_samples, 1)) + accepted_parameters = np.zeros(shape=(n_samples, len(self.model.get_parameters()))) + distances = np.zeros(shape=(n_samples,)) + smooth_distances = np.zeros(shape=(n_samples,)) + accepted_weights = np.ones(shape=(n_samples, 1)) all_distances = None accepted_cov_mat = None if resample == None: - resample = self.n_samples + resample = n_samples if n_update == None: - n_update = self.n_samples + n_update = n_samples sample_array = np.ones(shape=(steps,)) - sample_array[0] = self.n_samples + sample_array[0] = n_samples sample_array[1:] = n_update ## Acceptance counter to determine the resampling step accept = 0 @@ -705,7 +704,7 @@ def sample(self, steps, epsilon, beta=2, delta=0.2, v=0.3, # Reading all_distances at Initial step if aStep == 0: - index = np.linspace(0, self.n_samples - 1, self.n_samples).astype(int).reshape(self.n_samples, ) + index = np.linspace(0, n_samples - 1, n_samples).astype(int).reshape(n_samples, ) accept = 0 all_distances = new_all_distances @@ -755,7 +754,7 @@ def sample(self, steps, epsilon, beta=2, delta=0.2, v=0.3, ## Weighted resampling: weight = np.exp(-smooth_distances * delta / U) weight = weight / sum(weight) - index_resampled = self.rng.choice(np.arange(self.n_samples), self.n_samples, replace=1, p=weight) + index_resampled = self.rng.choice(np.arange(n_samples), n_samples, replace=1, p=weight) accepted_parameters = accepted_parameters[index_resampled, :] smooth_distances = smooth_distances[index_resampled] @@ -927,21 +926,17 @@ class ABCsubsim: Optional initial seed for the random number generator. The default value is generated randomly. """ - def __init__(self, observations, model, distance, kernel, backend, chain_length=10, n_samples=10000, n_samples_per_param=1, seed=None): + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend - self.n_samples = n_samples - self.chain_length = chain_length - self.n_samples_per_param = n_samples_per_param self.rng = np.random.RandomState(seed) self.anneal_parameter = None # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task - self.observations_bds = self.backend.broadcast(observations) self.accepted_parameters_bds = None self.accepted_cov_mat_bds = None @@ -951,8 +946,7 @@ def __getstate__(self): del state['backend'] return state - def sample(self, steps, ap_change_cutoff=10, - full_output=0): + def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, chain_length = 10, ap_change_cutoff = 10, full_output=0): """Samples from the posterior distribution of the model parameter given the observed data observations. 
@@ -974,6 +968,10 @@ def sample(self, steps, ap_change_cutoff=10, abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ + self.observations_bds = self.backend.broadcast(observations) + self.chain_length = chain_length + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) @@ -986,7 +984,7 @@ def sample(self, steps, ap_change_cutoff=10, journal.configuration["full_output"] = full_output accepted_parameters = None - accepted_weights = np.ones(shape=(self.n_samples, 1)) + accepted_weights = np.ones(shape=(n_samples, 1)) accepted_cov_mat = None anneal_parameter = 0 anneal_parameter_old = 0 @@ -996,10 +994,10 @@ def sample(self, steps, ap_change_cutoff=10, for aStep in range(0, steps): # main ABCsubsim algorithm # print("INFO: Initialization of ABCsubsim") - seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=int(self.n_samples / temp_chain_length), + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=int(n_samples / temp_chain_length), dtype=np.uint32) - index_arr = np.linspace(0, self.n_samples / temp_chain_length - 1, self.n_samples / temp_chain_length).astype( - int).reshape(int(self.n_samples / temp_chain_length), ) + index_arr = np.linspace(0, n_samples / temp_chain_length - 1, n_samples / temp_chain_length).astype( + int).reshape(int(n_samples / temp_chain_length), ) seed_index_arr = np.column_stack((seed_arr, index_arr)) seed_index_pds = self.backend.parallelize(seed_index_arr) @@ -1021,11 +1019,11 @@ def sample(self, steps, ap_change_cutoff=10, accepted_parameters = accepted_parameters[SortIndex, :] # 3: Calculate and broadcast annealling parameters - temp_chain_length = self.chain_length + temp_chain_length = chain_length if aStep > 0: anneal_parameter_old = anneal_parameter anneal_parameter = 0.5 * ( - distances[int(self.n_samples / temp_chain_length)] + distances[int(self.n_samples / temp_chain_length) + 1]) + distances[int(n_samples / temp_chain_length)] + distances[int(n_samples / temp_chain_length) + 1]) self.anneal_parameter = anneal_parameter # 4: Update proposal covariance matrix (Parallelized) @@ -1208,21 +1206,17 @@ class RSMCABC: Optional initial seed for the random number generator. The default value is generated randomly. 
""" - def __init__(self, observations, model, distance, kernel, backend, n_samples=10000, n_samples_per_param=1, alpha=0.1, seed=None): + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend - self.n_samples=n_samples - self.n_samples_per_param = n_samples_per_param - self.alpha = alpha - self.epsilon = None + self.R=None self.rng = np.random.RandomState(seed) # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task - self.observations_bds = self.backend.broadcast(observations) self.accepted_parameters_bds = None self.accepted_dist_bds = None self.accepted_cov_mat_bds = None @@ -1233,8 +1227,7 @@ def __getstate__(self): del state['backend'] return state - def sample(self, steps, epsilon_init=100, - epsilon_final=0.1, const=1, covFactor=2.0, full_output=0): + def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, alpha = 0.1, epsilon_init = 100, epsilon_final = 0.1, const = 1, covFactor = 2.0, full_output=0): """Samples from the posterior distribution of the model parameter given the observed data observations. @@ -1267,6 +1260,10 @@ def sample(self, steps, epsilon_init=100, abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ + self.observations_bds = self.backend.broadcast(observations) + self.alpha = alpha + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) @@ -1287,17 +1284,17 @@ def sample(self, steps, epsilon_init=100, # and finally Drawing new new/perturbed samples using prior or MCMC Kernel # print("DEBUG: Iteration " + str(aStep) + " of RSMCABC algorithm.") if aStep == 0: - n_replenish = self.n_samples + n_replenish = n_samples # Compute epsilon epsilon = [epsilon_init] R = int(1) else: - n_replenish = round(self.n_samples * self.alpha) + n_replenish = round(n_samples * alpha) # Throw away N_alpha particles with largest dist - accepted_parameters = np.delete(accepted_parameters, np.arange(round(self.n_samples * self.alpha)) + ( - self.n_samples - round(self.n_samples * self.alpha)), 0) + accepted_parameters = np.delete(accepted_parameters, np.arange(round(n_samples * alpha)) + ( + self.n_samples - round(n_samples * alpha)), 0) accepted_dist = np.delete(accepted_dist, - np.arange(round(self.n_samples * self.alpha)) + (self.n_samples - round(self.n_samples * self.alpha)), + np.arange(round(n_samples * alpha)) + (n_samples - round(n_samples * alpha)), 0) # Compute epsilon epsilon.append(accepted_dist[-1]) @@ -1348,7 +1345,7 @@ def sample(self, steps, epsilon_init=100, # print("INFO: Saving configuration to output journal.") if (full_output == 1 and aStep <= steps - 1) or (full_output == 0 and aStep == steps - 1): journal.add_parameters(accepted_parameters) - journal.add_weights(np.ones(shape=(self.n_samples, 1)) * (1 / self.n_samples)) + journal.add_weights(np.ones(shape=(n_samples, 1)) * (1 / n_samples)) # Add epsilon_arr to the journal journal.configuration["epsilon_arr"] = epsilon @@ -1441,22 +1438,17 @@ class APMCABC: Optional initial seed for the random number generator. The default value is generated randomly. 
""" - def __init__(self, observations, model, distance, kernel, backend, n_samples=10000, n_samples_per_param=1, alpha=0.9, seed=None): + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - - self.alpha = alpha self.epsilon= None self.rng = np.random.RandomState(seed) # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task - self.observations_bds = self.backend.broadcast(observations) self.alpha_accepted_parameters_bds = None self.alpha_accepted_weights_bds = None self.alpha_accepted_dist = None @@ -1468,8 +1460,7 @@ def __getstate__(self): del state['backend'] return state - def sample(self, steps, acceptance_cutoff=0.2, - covFactor=2.0, full_output=0): + def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, alpha = 0.9, acceptance_cutoff = 0.2, covFactor = 2.0, full_output=0): """Samples from the posterior distribution of the model parameter given the observed data observations. @@ -1498,6 +1489,11 @@ def sample(self, steps, acceptance_cutoff=0.2, abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ + self.observations_bds = self.backend.broadcast(observations) + self.alpha = alpha + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) @@ -1521,9 +1517,9 @@ def sample(self, steps, acceptance_cutoff=0.2, # 0: Drawing new new/perturbed samples using prior or MCMC Kernel # print("DEBUG: Iteration " + str(aStep) + " of APMCABC algorithm.") if aStep > 0: - n_additional_samples = self.n_samples - round(self.n_samples * self.alpha) + n_additional_samples = n_samples - round(n_samples * alpha) else: - n_additional_samples = self.n_samples + n_additional_samples = n_samples seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_additional_samples, dtype=np.uint32) seed_pds = self.backend.parallelize(seed_arr) @@ -1543,14 +1539,14 @@ def sample(self, steps, acceptance_cutoff=0.2, new_weights = np.array(new_weights).reshape(n_additional_samples, 1) # 1: Update all parameters, compute acceptance probability, compute epsilon - if len(new_weights) == self.n_samples: + if len(new_weights) == n_samples: accepted_parameters = new_parameters accepted_dist = new_dist accepted_weights = new_weights # Compute acceptance probability prob_acceptance = 1 # Compute epsilon - epsilon = [np.percentile(accepted_dist, self.alpha * 100)] + epsilon = [np.percentile(accepted_dist, alpha * 100)] else: accepted_parameters = np.concatenate((alpha_accepted_parameters, new_parameters)) accepted_dist = np.concatenate((alpha_accepted_dist, new_dist)) @@ -1558,7 +1554,7 @@ def sample(self, steps, acceptance_cutoff=0.2, # Compute acceptance probability prob_acceptance = sum(new_dist < epsilon[-1]) / len(new_dist) # Compute epsilon - epsilon.append(np.percentile(accepted_dist, self.alpha * 100)) + epsilon.append(np.percentile(accepted_dist, alpha * 100)) # 2: Update alpha_parameters, alpha_dist and alpha_weights index_alpha = accepted_dist < epsilon[-1] @@ -1672,20 +1668,18 @@ class SMCABC: Optional initial seed for the random number generator. The default value is generated randomly. 
""" - def __init__(self, observations, model, distance, kernel, backend, n_samples=10000, n_samples_per_param=1, seed=None): + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param self.epsilon = None self.rng = np.random.RandomState(seed) # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task - self.observations_bds = self.backend.broadcast(observations) + self.observations_bds = None self.accepted_parameters_bds = None self.accepted_weights_bds = None self.accepted_cov_mat_bds = None @@ -1697,8 +1691,7 @@ def __getstate__(self): del state['backend'] return state - def sample(self, observations, steps, epsilon_final=0.1, alpha=0.95, - covFactor=2, resample=None, full_output=0): + def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, epsilon_final = 0.1, alpha = 0.95, covFactor = 2, resample = None, full_output=0): """Samples from the posterior distribution of the model parameter given the observed data observations. @@ -1728,7 +1721,9 @@ def sample(self, observations, steps, epsilon_final=0.1, alpha=0.95, abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ - + self.observations_bds= self.backend.broadcast(observations) + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) @@ -1743,7 +1738,7 @@ def sample(self, observations, steps, epsilon_final=0.1, alpha=0.95, # Define the resmaple parameter if resample == None: - resample = self.n_samples * 0.5 + resample = n_samples * 0.5 # Define epsilon_init epsilon = [10000] @@ -1761,7 +1756,7 @@ def sample(self, observations, steps, epsilon_final=0.1, alpha=0.95, # Compute epsilon for next step fun = lambda epsilon_var: self._compute_epsilon(epsilon_var, \ epsilon, observations, accepted_y_sim, accepted_weights, - self.n_samples, self.n_samples_per_param, alpha) + n_samples, n_samples_per_param, alpha) epsilon_new = self._bisection(fun, epsilon_final, epsilon[-1], 0.001) if epsilon_new < epsilon_final: epsilon_new = epsilon_final @@ -1770,11 +1765,11 @@ def sample(self, observations, steps, epsilon_final=0.1, alpha=0.95, # 1: calculate weights for new parameters # print("INFO: Calculating weights.") if accepted_y_sim != None: - new_weights = np.zeros(shape=(self.n_samples), ) - for ind1 in range(self.n_samples): + new_weights = np.zeros(shape=(n_samples), ) + for ind1 in range(n_samples): numerator = 0.0 denominator = 0.0 - for ind2 in range(self.n_samples_per_param): + for ind2 in range(n_samples_per_param): numerator += (self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon[-1]) denominator += ( self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon[-2]) @@ -1784,23 +1779,23 @@ def sample(self, observations, steps, epsilon_final=0.1, alpha=0.95, new_weights[ind1] = 0 new_weights = new_weights / sum(new_weights) else: - new_weights = np.ones(shape=(self.n_samples), ) * (1.0 / self.n_samples) + new_weights = np.ones(shape=(n_samples), ) * (1.0 / n_samples) # 2: Resample if accepted_y_sim != None and pow(sum(pow(new_weights, 2)), -1) < resample: print('Resampling') # Weighted resampling: - index_resampled = 
self.rng.choice(np.arange(self.n_samples), self.n_samples, replace=1, p=new_weights) + index_resampled = self.rng.choice(np.arange(n_samples), n_samples, replace=1, p=new_weights) accepted_parameters = accepted_parameters[index_resampled, :] - new_weights = np.ones(shape=(self.n_samples), ) * (1.0 / self.n_samples) + new_weights = np.ones(shape=(n_samples), ) * (1.0 / n_samples) # Update the weights accepted_weights = new_weights.reshape(len(new_weights), 1) # 3: Drawing new perturbed samples using MCMC Kernel # print("DEBUG: Iteration " + str(aStep) + " of SMCABC algorithm.") - seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=self.n_samples, dtype=np.uint32) - index_arr = np.arange(self.n_samples) + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_samples, dtype=np.uint32) + index_arr = np.arange(n_samples) seed_index_arr = np.column_stack((seed_arr, index_arr)) seed_index_pds = self.backend.parallelize(seed_index_arr) diff --git a/tests/inferences_tests.py b/tests/inferences_tests.py index 7f78f8af..dfda3758 100644 --- a/tests/inferences_tests.py +++ b/tests/inferences_tests.py @@ -38,8 +38,8 @@ def test_sample(self): y_obs = model.simulate(1) # use the rejection sampling scheme - sampler = RejectionABC(y_obs, model, dist_calc, dummy, epsilon=0.1, n_samples=10, n_samples_per_param=1, seed = 1) - journal = sampler.sample() + sampler = RejectionABC(model, dist_calc, dummy, seed = 1) + journal = sampler.sample(y_obs, 10, 1, 0.1) samples = journal.get_parameters() # test shape of samples @@ -80,8 +80,8 @@ def test_sample(self): kernel = MultiNormal(mean, cov, seed=1) T, n_sample, n_samples_per_param = 1, 10, 100 - sampler = PMC(y_obs, model, likfun, kernel, backend, n_sample, n_samples_per_param, seed = 1) - journal = sampler.sample(T, covFactor = np.array([.1,.1]), iniPoints = None) + sampler = PMC(model, likfun, kernel, backend, seed = 1) + journal = sampler.sample(y_obs, T, n_sample, n_samples_per_param, covFactor = np.array([.1,.1]), iniPoints = None) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -99,8 +99,8 @@ def test_sample(self): # use the PMC scheme for T = 2 T, n_sample, n_samples_per_param = 2, 10, 100 - ampler = PMC(y_obs, model, likfun, kernel, backend, n_sample, n_samples_per_param, seed=1) - journal = sampler.sample(T, covFactor=np.array([.1, .1]), iniPoints=None) + sampler = PMC(model, likfun, kernel, backend, seed = 1) + journal = sampler.sample(y_obs, T, n_sample, n_samples_per_param, covFactor = np.array([.1,.1]), iniPoints = None) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -145,18 +145,18 @@ def setUp(self): def test_calculate_weight(self): n_samples = 2 - sampler = PMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_samples, 1) + rc = PMCABC(self.model, self.dist_calc, self.kernel, self.backend, 1) theta = np.array([1.0]) - weight = sampler._calculate_weight(theta) + weight = rc._calculate_weight(theta) self.assertEqual(weight, 0.5) accepted_parameters = np.array([[1.0], [1.0 + np.sqrt(2)]]) accepted_weights = np.array([[.5], [.5]]) accepted_cov_mat = np.array([[1.0]]) - sampler._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat) - weight = sampler._calculate_weight(theta) + rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat) + weight = rc._calculate_weight(theta) expected_weight = (2.0 * np.sqrt(2.0 * np.pi)) /(( 1 + np.exp(-1))*100) self.assertEqual(weight, 
expected_weight) @@ -165,8 +165,8 @@ def test_calculate_weight(self): def test_sample(self): # use the PMCABC scheme for T = 1 T, n_sample, n_simulate, eps_arr, eps_percentile = 1, 10, 1, [.1], 10 - sampler = PMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, seed = 1) - journal = sampler.sample(T, eps_arr, eps_percentile) + sampler = PMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) + journal = sampler.sample(self.observation, T, eps_arr, n_sample, n_simulate, eps_percentile) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -183,9 +183,8 @@ def test_sample(self): # use the PMCABC scheme for T = 2 T, n_sample, n_simulate, eps_arr, eps_percentile = 2, 10, 1, [.1,.05], 10 - sampler = PMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, - seed=1) - journal = sampler.sample(T, eps_arr, eps_percentile) + sampler = PMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) + journal = sampler.sample(self.observation, T, eps_arr, n_sample, n_simulate, eps_percentile) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -230,8 +229,8 @@ def setUp(self): def test_sample(self): # use the SABC scheme for T = 1 steps, epsilon, n_samples, n_samples_per_param = 1, .1, 10, 1 - sampler = SABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_samples, n_samples_per_param, seed = 1) - journal = sampler.sample(steps, epsilon) + sampler = SABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) + journal = sampler.sample(self.observation, steps, epsilon, n_samples, n_samples_per_param) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -247,9 +246,8 @@ def test_sample(self): # use the SABC scheme for T = 2 steps, epsilon, n_samples, n_samples_per_param = 2, .1, 10, 1 - sampler = SABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_samples, - n_samples_per_param, seed=1) - journal = sampler.sample(steps, epsilon) + sampler = SABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) + journal = sampler.sample(self.observation, steps, epsilon, n_samples, n_samples_per_param) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -264,8 +262,6 @@ def test_sample(self): self.assertLess(mu_post_mean - 1.51315443746, 10e-2) self.assertLess(sigma_post_mean - 6.85230360302, 10e-2) - - class ABCsubsimTests(unittest.TestCase): def setUp(self): # find spark and initialize it @@ -296,8 +292,8 @@ def test_sample(self): # use the ABCsubsim scheme for T = 1 steps, n_samples, n_samples_per_param = 1, 10, 1 - sampler = ABCsubsim(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_samples, n_samples_per_param, seed = 1) - journal = sampler.sample(steps) + sampler = ABCsubsim(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) + journal = sampler.sample(self.observation, steps, n_samples, n_samples_per_param) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -313,9 +309,8 @@ def test_sample(self): # use the ABCsubsim scheme for T = 2 steps, n_samples, n_samples_per_param = 2, 10, 1 - ampler = ABCsubsim(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_samples, - n_samples_per_param, seed=1) - journal = sampler.sample(steps) + sampler = ABCsubsim(self.model, self.dist_calc, 
self.kernel, self.backend, seed = 1) + journal = sampler.sample(self.observation, steps, n_samples, n_samples_per_param) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -328,8 +323,7 @@ def test_sample(self): self.assertEqual(sigma_sample_shape, (10,)) self.assertEqual(weights_sample_shape, (10,)) self.assertLess(mu_post_mean - (-2.98633946126), 10e-2) - self.assertLess(sigma_post_mean - 6.40146881524, 10e-2) - + self.assertLess(sigma_post_mean - 6.40146881524, 10e-2) class SMCABCTests(unittest.TestCase): @@ -360,7 +354,7 @@ def setUp(self): def test_sample(self): # use the SMCABC scheme for T = 1 steps, n_sample, n_simulate = 1, 10, 1 - sampler = SMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, seed = 1) + sampler = SMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) journal = sampler.sample(self.observation, steps, n_sample, n_simulate) samples = (journal.get_parameters(), journal.get_weights()) @@ -378,9 +372,8 @@ def test_sample(self): # use the SMCABC scheme for T = 2 T, n_sample, n_simulate = 2, 10, 1 - sampler = SMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, - seed=1) - journal = sampler.sample(self.observation, steps, n_sample, n_simulate) + sampler = SMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) + journal = sampler.sample(self.observation, T, n_sample, n_simulate) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -393,9 +386,7 @@ def test_sample(self): self.assertEqual(sigma_sample_shape, (10,)) self.assertEqual(weights_sample_shape, (10,)) self.assertLess(mu_post_mean - (-1.12595029091), 10e-2) - self.assertLess(sigma_post_mean - 4.62512055437, 10e-2) - - + self.assertLess(sigma_post_mean - 4.62512055437, 10e-2) class APMCABCTests(unittest.TestCase): def setUp(self): @@ -425,8 +416,8 @@ def setUp(self): def test_sample(self): # use the APMCABC scheme for T = 1 steps, n_sample, n_simulate = 1, 10, 1 - sampler = APMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, seed = 1) - journal = sampler.sample(steps) + sampler = APMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) + journal = sampler.sample(self.observation, steps, n_sample, n_simulate) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -443,9 +434,8 @@ def test_sample(self): # use the APMCABC scheme for T = 2 T, n_sample, n_simulate = 2, 10, 1 - ampler = APMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, - seed=1) - journal = sampler.sample(steps) + sampler = APMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) + journal = sampler.sample(self.observation, T, n_sample, n_simulate) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -460,8 +450,6 @@ def test_sample(self): self.assertLess(mu_post_mean - (2.19137364411), 10e-2) self.assertLess(sigma_post_mean - 5.66226403628, 10e-2) - - class RSMCABCTests(unittest.TestCase): def setUp(self): # find spark and initialize it @@ -490,8 +478,8 @@ def setUp(self): def test_sample(self): # use the RSMCABC scheme for T = 1 steps, n_sample, n_simulate = 1, 10, 1 - sampler = RSMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, seed = 1) - journal = sampler.sample(steps) + sampler = 
RSMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) + journal = sampler.sample(self.observation, steps, n_sample, n_simulate) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -508,9 +496,8 @@ def test_sample(self): # use the RSMCABC scheme for T = 2 steps, n_sample, n_simulate = 2, 10, 1 - sampler = RSMCABC(self.observation, self.model, self.dist_calc, self.kernel, self.backend, n_sample, n_simulate, - seed=1) - journal = sampler.sample(steps) + sampler = RSMCABC(self.model, self.dist_calc, self.kernel, self.backend, seed = 1) + journal = sampler.sample(self.observation, steps, n_sample, n_simulate) samples = (journal.get_parameters(), journal.get_weights()) # Compute posterior mean @@ -525,6 +512,5 @@ def test_sample(self): self.assertLess(mu_post_mean - (-0.349310337252), 10e-2) self.assertLess(sigma_post_mean - 6.30221177368, 10e-2) - if __name__ == '__main__': unittest.main() From 3ceb2d6d84d71769eb859ec19f73eed82ba686c4 Mon Sep 17 00:00:00 2001 From: Nicole Date: Thu, 21 Sep 2017 15:25:14 +0200 Subject: [PATCH 09/14] fixed small bug in testing of inferences --- tests/inferences_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/inferences_tests.py b/tests/inferences_tests.py index dfda3758..d544b081 100644 --- a/tests/inferences_tests.py +++ b/tests/inferences_tests.py @@ -155,7 +155,7 @@ def test_calculate_weight(self): accepted_parameters = np.array([[1.0], [1.0 + np.sqrt(2)]]) accepted_weights = np.array([[.5], [.5]]) accepted_cov_mat = np.array([[1.0]]) - rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat) + rc._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat) weight = rc._calculate_weight(theta) expected_weight = (2.0 * np.sqrt(2.0 * np.pi)) /(( 1 + np.exp(-1))*100) self.assertEqual(weight, expected_weight) From dcf15743f3bf415595f5e55e730ff176cd55502e Mon Sep 17 00:00:00 2001 From: Nicole Date: Thu, 28 Sep 2017 14:13:36 +0200 Subject: [PATCH 10/14] Added abstract base class for inferences.py. Removed remaining RemoteContexts. Cleaned up documentation and implemented small changes --- .gitignore | 14 + abcpy/approx_lhd.py | 30 +- abcpy/backends/base.py | 14 +- abcpy/backends/mpi.py | 31 +- abcpy/distances.py | 8 +- abcpy/distributions.py | 12 +- abcpy/inferences.py | 853 ++++++++++++++++++++++++++++--------- abcpy/models.py | 107 +++-- abcpy/modelselections.py | 112 +++-- abcpy/statistics.py | 4 +- abcpy/summaryselections.py | 35 +- tests/pickle_tests.py | 3 +- 12 files changed, 825 insertions(+), 398 deletions(-) diff --git a/.gitignore b/.gitignore index 72364f99..dedc2fec 100644 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,17 @@ ENV/ # Rope project settings .ropeproject +abcpy/.idea/ +abcpy/inf_no_abs.py +abcpy/inferences_old.py +abcpy/modelselection_old.py +abcpy/pymc_dist.py +abcpy/summaryselection_old.py +abcpy/test.py +abcpy/uniform.py +tests/backend.py +tests/save.p +tests/save2.p +tests/save3.p +.idea/ + diff --git a/abcpy/approx_lhd.py b/abcpy/approx_lhd.py index a7801361..5412acfd 100644 --- a/abcpy/approx_lhd.py +++ b/abcpy/approx_lhd.py @@ -6,19 +6,20 @@ class Approx_likelihood(metaclass = ABCMeta): """This abstract base class defines the approximate likelihood - function. To approximate the likelihood function at a parameter value given observed dataset, - we need to pass a dataset simulated from model set at the parameter value and the observed dataset. + function. 
To approximate the likelihood function at a parameter value given observed data set, + we need to pass a data set simulated from model set at the parameter value and the observed data set. """ @abstractmethod def __init__(self, statistics_calc): - """ The constructor of a sub-class must accept a non-optional statistics - calculator, which is stored to self.statistics_calc. + """ + The constructor of a sub-class must accept a non-optional statistics + calculator, which is stored to self.statistics_calc. - Parameters - ---------- - statistics_calc : abcpy.stasistics.Statistics - Statistics extractor object that conforms to the Statistics class. + Parameters + ---------- + statistics_calc : abcpy.stasistics.Statistics + Statistics extractor object that conforms to the Statistics class. """ raise NotImplemented @@ -106,10 +107,8 @@ class PenLogReg(Approx_likelihood): [2] Friedman, J., Hastie, T., and Tibshirani, R. (2010). Regularization paths for generalized linear models via coordinate descent. Journal of Statistical - Software, 33(1), 1–22. - """ - def __init__(self, statistics_calc, model, n_simulate, n_folds=10, max_iter = 100000, seed = None): - """ + Software, 33(1), 1–22. + Parameters ---------- statistics_calc : abcpy.stasistics.Statistics @@ -124,10 +123,11 @@ def __init__(self, statistics_calc, model, n_simulate, n_folds=10, max_iter = 10 Maximum passes over the data. The default is 100000. seed: int, optional Seed for the random number generator. The used glmnet solver is not - deterministic, this seed is used for determining the cv folds. The default value is + deterministic, this seed is used for determining the cv folds. The default value is None. - """ - + """ + def __init__(self, statistics_calc, model, n_simulate, n_folds=10, max_iter = 100000, seed = None): + self.model = model self.statistics_calc = statistics_calc self.n_folds = n_folds diff --git a/abcpy/backends/base.py b/abcpy/backends/base.py index c525e34b..13e6d760 100644 --- a/abcpy/backends/base.py +++ b/abcpy/backends/base.py @@ -30,7 +30,7 @@ def parallelize(self, list): A reference object that represents the parallelized list """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -49,7 +49,7 @@ def broadcast(self, object): A reference to the broadcasted object """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -72,7 +72,7 @@ def map(self, func, pds): a new parallel data set that contains the result of the map """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -91,7 +91,7 @@ def collect(self, pds): all elements of pds as a list """ - raise NotImplemented + raise NotImplementedError class PDS: @@ -101,7 +101,7 @@ class PDS: @abstractmethod def __init__(self): - raise NotImplemented + raise NotImplementedError class BDS: @@ -111,7 +111,7 @@ class BDS: @abstractmethod def __init__(self): - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -119,7 +119,7 @@ def value(self): """ This method should return the actual object that the broadcast data set represents. 
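Taken together, the abstract operations above define the contract every backend honours. A minimal sketch using the sequential BackendDummy, assuming it is exported from abcpy.backends as the other backend classes are:

    import numpy as np
    from abcpy.backends import BackendDummy

    backend = BackendDummy()
    seed_pds = backend.parallelize(np.arange(4))                # distribute the input list
    result_pds = backend.map(lambda seed: seed ** 2, seed_pds)  # apply the function to each element
    print(backend.collect(result_pds))                          # gather the results: [0, 1, 4, 9]
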
""" - raise NotImplemented + raise NotImplementedError class BackendDummy(Backend): diff --git a/abcpy/backends/mpi.py b/abcpy/backends/mpi.py index 2d1bab84..479d1443 100644 --- a/abcpy/backends/mpi.py +++ b/abcpy/backends/mpi.py @@ -1,5 +1,6 @@ import numpy as np import cloudpickle +import pickle from mpi4py import MPI from abcpy.backends import Backend, PDS, BDS @@ -54,7 +55,7 @@ def __command_slaves(self, command, data): elif command == self.OP_MAP: #In map we receive data as (pds_id,pds_id_new,func) #Use cloudpickle to dump the function into a string. - function_packed = cloudpickle.dumps(data[2]) + function_packed = cloudpickle.dumps(data[2], pickle.HIGHEST_PROTOCOL) data_packet = (command, data[0], data[1], function_packed) elif command == self.OP_BROADCAST: @@ -74,34 +75,6 @@ def __command_slaves(self, command, data): _ = self.comm.bcast(data_packet, root=0) - def __sanitize_and_pack_func(self, func): - """ - Prevents the function from packing the backend by temporarily - setting it to another variable and then uses cloudpickle - to pack it into a string to be sent. - - Parameters - ---------- - func: Python Function - The function we are supposed to pack while sending it along to the slaves - during the map function - - Returns - ------- - Returns a string of the function packed by cloudpickle - - """ - - #Set the backend to None to prevent it from being packed - globals()['backend'] = {} - - function_packed = cloudpickle.dumps(func) - #Reset the backend to self after it's been packed - globals()['backend'] = self - - return function_packed - - def __generate_new_pds_id(self): """ This method generates a new pds_id to associate a PDS with it's remote counterpart diff --git a/abcpy/distances.py b/abcpy/distances.py index a2897078..7ecf5a51 100644 --- a/abcpy/distances.py +++ b/abcpy/distances.py @@ -21,7 +21,7 @@ def __init__(self, statistics_calc): Statistics extractor object that conforms to the Statistics class. """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -58,11 +58,11 @@ def distance(d1, d2): The distance between the two input data sets. """ - raise NotImplemented + raise NotImplementedError @abstractmethod - def dist_max(): + def dist_max(self): """To be overwritten by sub-class: should return maximum possible value of the desired distance function. @@ -76,7 +76,7 @@ def dist_max(): The maximal possible value of the desired distance function. """ - raise NotImplemented + raise NotImplementedError def _calculate_summary_stat(self,d1,d2): diff --git a/abcpy/distributions.py b/abcpy/distributions.py index ff3f7d77..44d54d74 100644 --- a/abcpy/distributions.py +++ b/abcpy/distributions.py @@ -19,12 +19,12 @@ def set_parameters(self, params): Parameters ---------- - theta: list + params: list Contains all the distributions parameters. """ - raise NotImplemented + raise NotImplementedError @@ -39,7 +39,7 @@ def reseed(self, seed): """ - raise NotImplemented + raise NotImplementedError @@ -58,7 +58,7 @@ def sample(self, k): """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -77,7 +77,7 @@ def pdf(self, x): """ - raise NotImplemented + raise NotImplementedError class MultiNormal(Distribution): @@ -130,7 +130,7 @@ class Uniform(Distribution): """ This class implements a p-dimensional uniform Prior distribution in a closed interval. 
""" - def __init__(self, lb : np.ndarray, ub : np.ndarray, seed=None): + def __init__(self, lb, ub, seed=None): """ Defines the upper and lower bounds of a p-dimensional uniform Prior distribution in a closed interval. diff --git a/abcpy/inferences.py b/abcpy/inferences.py index 15cd7181..9a5c8923 100644 --- a/abcpy/inferences.py +++ b/abcpy/inferences.py @@ -1,8 +1,231 @@ +from abc import ABCMeta, abstractmethod, abstractproperty + import numpy as np from abcpy.output import Journal from scipy import optimize -class RejectionABC: + +class InferenceMethod(metaclass = ABCMeta): + """ + This abstract base class represents an inference method. + + """ + + def __getstate__(self): + """Cloudpickle is used with the MPIBackend. This function ensures that the backend itself + is not pickled + """ + state = self.__dict__.copy() + del state['backend'] + return state + + @abstractmethod + def sample(self): + """To be overwritten by any sub-class: + Samples from the posterior distribution of the model parameter given the observed + data observations. + """ + raise NotImplementedError + + @abstractproperty + def model(self): + """To be overwritten by any sub-class: an attribute specifying the model to be used + """ + raise NotImplementedError + + @abstractproperty + def rng(self): + """To be overwritten by any sub-class: an attribute specifying the random number generator to be used + """ + raise NotImplementedError + + @abstractproperty + def n_samples(self): + """To be overwritten by any sub-class: an attribute specifying the number of samples to be generated + """ + raise NotImplementedError + + @abstractproperty + def n_samples_per_param(self): + """To be overwritten by any sub-class: an attribute specifying the number of data points in each simulated data set.""" + raise NotImplementedError + + @abstractproperty + def observations_bds(self): + """To be overwritten by any sub-class: an attribute saving the observations as bds + """ + raise NotImplementedError + + +class BasePMC(InferenceMethod, metaclass = ABCMeta): + """ + This abstract base class represents inference methods that use Population Monte Carlo. + + """ + @abstractmethod + def _update_broadcasts(self, accepted_parameters, accepted_weights, accepted_cov_mat): + """ + To be overwritten by any sub-class: broadcasts updated values + + Parameters + ---------- + accepted_parameters: numpy.array + Contains all new accepted parameters. + accepted_weights: numpy.array + Contains all the new accepted weights. 
+ accepted_cov_mat: numpy.ndarray + Contains the new accepted covariance matrix. + + Returns + ------- + None + """ + raise NotImplementedError + + @abstractmethod + def _calculate_weight(self, theta): + """ + To be overwritten by any sub-class: + Calculates the weight for the given parameter using + accepted_parameters, accepted_cov_mat + + Parameters + ---------- + theta: np.array + 1xp matrix containing the model parameters, where p is the dimension of parameters + + Returns + ------- + float + the new weight for theta + """ + raise NotImplementedError + + @abstractproperty + def kernel(self): + """To be overwritten by any sub-class: an attribute specifying the kernel to be used + """ + raise NotImplementedError + + @abstractproperty + def accepted_parameters_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted parameters as bds + """ + raise NotImplementedError + + @abstractproperty + def accepted_weights_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted weights as bds + """ + raise NotImplementedError + + @abstractproperty + def accepted_cov_mat_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted covariance matrix as bds + """ + raise NotImplementedError + + + +class BaseAnnealing(InferenceMethod, metaclass = ABCMeta): + """ + This abstract base class represents inference methods that use annealing. + + """ + + @abstractmethod + def _update_broadcasts(self): + raise NotImplementedError + + @abstractmethod + def _accept_parameter(self): + raise NotImplementedError + + @abstractproperty + def distance(self): + """To be overwritten by any sub-class: an attribute specifying the distance measure to be used + """ + raise NotImplementedError + + @abstractproperty + def kernel(self): + """To be overwritten by any sub-class: an attribute specifying the kernel to be used + """ + raise NotImplementedError + + @abstractproperty + def accepted_parameters_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted parameters as bds + """ + raise NotImplementedError + + @abstractproperty + def accepted_cov_mat_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted covariance matrix as bds + """ + raise NotImplementedError + +class BaseAdaptivePopulationMC(InferenceMethod, metaclass = ABCMeta): + """ + This abstract base class represents inference methods that use Adaptive Population Monte Carlo. + + """ + + @abstractmethod + def _update_broadcasts(self): + """ + To be overwritten by any sub-class: broadcasts updated values + + Parameters + ---------- + accepted_parameters: numpy.array + Contains all new accepted parameters. + accepted_weights: numpy.array + Contains all the new accepted weights. + accepted_cov_mat: numpy.ndarray + Contains the new accepted covariance matrix. + + Returns + ------- + None + """ + raise NotImplementedError + + @abstractmethod + def _accept_parameter(self): + """ + To be overwritten by any sub-class: + Samples a single model parameter and simulates from it until + accepted with some probability.
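Returning to the __getstate__ hook that InferenceMethod defines above: dropping the backend from the pickled state is what lets cloudpickle ship bound methods such as _accept_parameter to workers even when the backend wraps an unpicklable resource, for example an MPI communicator. A minimal sketch of the same pattern with the standard pickle module, a file handle standing in for the unpicklable backend:

    import pickle

    class Sampler:
        def __init__(self, backend):
            self.backend = backend    # assume this cannot be serialized
            self.n_samples = 100

        def __getstate__(self):
            # Same trick as InferenceMethod.__getstate__: copy the instance
            # dict and drop the backend before pickling.
            state = self.__dict__.copy()
            del state['backend']
            return state

    s = Sampler(backend=open(__file__))     # file objects refuse to pickle
    clone = pickle.loads(pickle.dumps(s))   # works because of __getstate__
    print(hasattr(clone, 'backend'), clone.n_samples)   # False 100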
+ + """ + raise NotImplementedError + + @abstractproperty + def distance(self): + """To be overwritten by any sub-class: an attribute specifying the distance measure to be used + """ + raise NotImplementedError + + @abstractproperty + def kernel(self): + """To be overwritten by any sub-class: an attribute specifying the kernel to be used + """ + raise NotImplementedError + + @abstractproperty + def accepted_parameters_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted parameters as bds + """ + raise NotImplementedError + + @abstractproperty + def accepted_cov_mat_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted covariance matrix as bds + """ + raise NotImplementedError + +class RejectionABC(InferenceMethod): """This base class implements the rejection algorithm based inference scheme [1] for Approximate Bayesian Computation. @@ -12,32 +235,59 @@ class RejectionABC: Parameters ---------- model: abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance: abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance measure to compare simulated and observed data sets. backend: abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed: integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ + + model = None + distance = None + rng = None + + n_samples = None + n_samples_per_param = None + epsilon = None + + observations_bds = None + def __init__(self, model, distance, backend, seed=None): self.model = model - self.dist_calc = distance + self.distance = distance self.backend = backend self.rng = np.random.RandomState(seed) - def __getstate__(self): - #tells cloudpickle to not include the backend when pickling - state = self.__dict__.copy() - del state['backend'] - return state - def sample(self, observations, n_samples, n_samples_per_param, epsilon, full_output=0): + """ + Samples from the posterior distribution of the model parameter given the observed + data observations. + Parameters + ---------- + observations: numpy.ndarray + Observed data. + n_samples: integer + Number of samples to generate + n_samples_per_param: integer + Number of data points in each simulated data set. + epsilon: float + Value of threshold + full_output: integer, optional + If full_output==1, intermediate results are included in output journal. + The default value is 0, meaning the intermediate results are not saved. + Returns + ------- + abcpy.output.Journal + a journal containing simulation results, metadata and optionally intermediate results. 
+ """ self.observations_bds = self.backend.broadcast(observations) self.n_samples = n_samples self.n_samples_per_param = n_samples_per_param self.epsilon = epsilon + journal = Journal(full_output) journal.configuration["n_samples"] = self.n_samples journal.configuration["n_samples_per_param"] = self.n_samples_per_param @@ -45,8 +295,6 @@ def sample(self, observations, n_samples, n_samples_per_param, epsilon, full_out accepted_parameters = None - # Initialize variables that need to be available remotely - # main Rejection ABC algorithm seed_arr = self.rng.randint(1, n_samples * n_samples, size=n_samples, dtype=np.int32) seed_pds = self.backend.parallelize(seed_arr) @@ -61,20 +309,35 @@ def sample(self, observations, n_samples, n_samples_per_param, epsilon, full_out return journal def _sample_parameter(self, seed): - distance = self.dist_calc.dist_max() + """ + Samples a single model parameter and simulates from it until + distance between simulated outcome and the observation is + smaller than epsilon. + + Parameters + ---------- + seed: int + value of a seed to be used for reseeding + Returns + ------- + np.array + accepted parameter + """ + + distance = self.distance.dist_max() self.model.prior.reseed(seed) while distance > self.epsilon: # Accept new parameter value if the distance is less than epsilon self.model.sample_from_prior() y_sim = self.model.simulate(self.n_samples_per_param) - distance = self.dist_calc.distance(self.observations_bds.value(), y_sim) + distance = self.distance.distance(self.observations_bds.value(), y_sim) return self.model.get_parameters() - -class PMCABC: - """This base class implements a modified version of Population Monte Carlo based inference scheme +class PMCABC(BasePMC, InferenceMethod): + """ + This base class implements a modified version of Population Monte Carlo based inference scheme for Approximate Bayesian computation of Beaumont et. al. [1]. Here the threshold value at `t`-th generation are adaptively chosen by taking the maximum between the epsilon_percentile-th value of discrepancies of the accepted parameters at `t-1`-th generation and the threshold value provided for this generation by the user. If we take the @@ -87,17 +350,32 @@ class PMCABC: Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance measure to compare simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. 
""" + model = None + distance = None + kernel = None + rng = None + + #default value, set so that testing works + n_samples = 2 + n_samples_per_param = None + + observations_bds = None + accepted_parameters_bds = None + accepted_weights_bds = None + accepted_cov_mat_bds = None + + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model @@ -106,20 +384,13 @@ def __init__(self, model, distance, kernel, backend, seed=None): self.backend = backend self.rng = np.random.RandomState(seed) - #added this here, so that the calculate_weight function can be called without calling sample first - self.n_samples = 2 - # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task + self.observations_bds = None self.accepted_parameters_bds = None self.accepted_weights_bds = None self.accepted_cov_mat_bds = None - def __getstate__(self): - #tells cloudpickle to not include the backend when pickling - state = self.__dict__.copy() - del state['backend'] - return state def sample(self, observations, steps, epsilon_init, n_samples = 10000, n_samples_per_param = 1, epsilon_percentile = 0, covFactor = 2, full_output=0): """Samples from the posterior distribution of the model parameter given the observed @@ -168,6 +439,7 @@ def sample(self, observations, steps, epsilon_init, n_samples = 10000, n_samples accepted_parameters = None accepted_weights = None accepted_cov_mat = None + # Define epsilon_arr if len(epsilon_init) == steps: epsilon_arr = epsilon_init @@ -207,7 +479,7 @@ def sample(self, observations, steps, epsilon_init, n_samples = 10000, n_samples epsilon_arr[aStep + 1] = np.max( [np.percentile(distances, epsilon_percentile), epsilon_arr[aStep + 1]]) - # 2: calculate weights for new parameters + # 2: calculate weights for new parameters # print("INFO: Calculating weights.") new_parameters_pds = self.backend.parallelize(new_parameters) new_weights_pds = self.backend.map(self._calculate_weight, new_parameters_pds) @@ -254,12 +526,19 @@ def _resample_parameter(self, seed): """ Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is - smaller than eplison. + smaller than epsilon. - :type seed: int - :rtype: np.array - :return: accepted parameter + Parameters + ---------- + seed: integer + initial seed for the random number generator. 
+ + Returns + ------- + np.array + accepted parameter """ + rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) @@ -284,6 +563,7 @@ def _resample_parameter(self, seed): y_sim = self.model.simulate(self.n_samples_per_param) distance = self.distance.distance(self.observations_bds.value(), y_sim) + return (self.model.get_parameters(), distance) def _calculate_weight(self, theta): @@ -294,7 +574,7 @@ def _calculate_weight(self, theta): Parameters ---------- theta: np.array - 1xp matrix containing model parameter, where p is the dimension of parameter + 1xp matrix containing model parameter, where p is the number of parameters Returns ------- @@ -313,11 +593,12 @@ def _calculate_weight(self, theta): [self.accepted_parameters_bds.value()[i, :], self.accepted_cov_mat_bds.value()]) pdf_value = self.kernel.pdf(theta) denominator += self.accepted_weights_bds.value()[i, 0] * pdf_value - return 1.0 * prior_prob / denominator + return 1.0 * prior_prob / denominator -class PMC: - """Population Monte Carlo based inference scheme of Cappé et. al. [1]. +class PMC(BasePMC, InferenceMethod): + """ + Population Monte Carlo based inference scheme of Cappé et. al. [1]. This algorithm assumes a likelihood function is available and can be evaluated at any parameter value given the oberved dataset. In absence of the @@ -332,25 +613,35 @@ class PMC: [2] C. Andrieu and G. O. Roberts. The pseudo-marginal approach for efficient Monte Carlo computations. Annals of Statistics, 37(2):697–725, 04 2009. + Parameters + ---------- + model : abcpy.models.Model + Model object defining the model to be used. + likfun : abcpy.approx_lhd.Approx_likelihood + Approx_likelihood object defining the approximated likelihood to be used. + kernel : abcpy.distributions.Distribution + Distribution object defining the perturbation kernel needed for the sampling. + backend : abcpy.backends.Backend + Backend object defining the backend to be used. + seed : integer, optional + Optional initial seed for the random number generator. The default value is generated randomly. + """ - def __init__(self, model, likfun, kernel, backend, seed=None): - """Constructor of PMC inference schemes. + model = None + likfun = None + kernel = None + rng = None - Parameters - ---------- - model : abcpy.models.Model - Model object that conforms to the Model class - likfun : abcpy.approx_lhd.Approx_likelihood - Approx_likelihood object that conforms to the Approx_likelihood class - kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling - backend : abcpy.backends.Backend - Backend object that conforms to the Backend class - seed : integer, optional - Optional initial seed for the random number generator. The default value is generated randomly. 
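_calculate_weight above is the standard PMC importance weight: the prior density at theta divided by the kernel mixture over the previous generation's particles. The same computation in isolation, with scipy's multivariate normal standing in for the perturbation kernel and all numbers invented:

    import numpy as np
    from scipy.stats import multivariate_normal

    def pmc_weight(theta, prior_pdf, prev_params, prev_weights, cov):
        # denominator: mixture of kernels centred at the previous particles
        denom = sum(w * multivariate_normal.pdf(theta, mean=p, cov=cov)
                    for p, w in zip(prev_params, prev_weights))
        return prior_pdf(theta) / denom

    prev = np.array([[0.0], [1.0], [2.0]])   # previous particles
    w = np.array([0.2, 0.5, 0.3])            # their normalized weights
    flat_prior = lambda t: 1.0               # flat prior density for the toy
    print(pmc_weight(np.array([0.8]), flat_prior, prev, w, cov=np.eye(1)))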
+ n_samples = None + n_samples_per_param = None - """ + observations_bds = None + accepted_parameters_bds = None + accepted_weights_bds = None + accepted_cov_mat_bds = None + + def __init__(self, model, likfun, kernel, backend, seed=None): self.model = model self.likfun = likfun self.kernel = kernel @@ -359,15 +650,11 @@ def __init__(self, model, likfun, kernel, backend, seed=None): # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task + self.observations_bds = None self.accepted_parameters_bds = None self.accepted_weights_bds = None self.accepted_cov_mat_bds = None - def __getstate__(self): - #tells cloudpickle to not include the backend when pickling - state = self.__dict__.copy() - del state['backend'] - return state def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 100, covFactor = None, iniPoints = None, full_output=0): """Samples from the posterior distribution of the model parameter given the observed @@ -395,15 +682,12 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. - - Parameters - ---------- - """ self.observations_bds = self.backend.broadcast(observations) self.n_samples = n_samples self.n_samples_per_param = n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_lhd_func"] = type(self.likfun) @@ -417,6 +701,7 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 accepted_weights = None accepted_cov_mat = None new_theta = None + dim = len(self.model.get_parameters()) # Initialize particles: When not supplied, randomly draw them from prior distribution @@ -460,6 +745,7 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: new_parameters[ind, :] = new_theta break + # 2: calculate approximate lieklihood for new parameters # print("INFO: Calculate approximate likelihood.") new_parameters_pds = self.backend.parallelize(new_parameters) @@ -515,25 +801,33 @@ def _approx_lik_calc(self, theta): """ Compute likelihood for new parameters using approximate likelihood function - :seed = aTuple[0] - :theta = aTuple[1] - :type seed: int - :rtype: np.arraybg + Parameters + ---------- + theta: numpy.ndarray + 1xp matrix containing the model parameters, where p is the number of parameters - :return: likelihood values of new parameter + Returns + ------- + float + The approximated likelihood function """ # Assign theta to model self.model.set_parameters(theta) + # Simulate the fake data from the model given the parameter value theta # print("DEBUG: Simulate model for parameter " + str(theta)) y_sim = self.model.simulate(self.n_samples_per_param) + # print("DEBUG: Extracting observation.") obs = self.observations_bds.value() + # print("DEBUG: Computing likelihood...") lhd = self.likfun.likelihood(obs, y_sim) + # print("DEBUG: Likelihood is :" + str(lhd)) pdf_at_theta = self.model.prior.pdf(theta) + # print("DEBUG: prior pdf evaluated at theta is :" + str(pdf_at_theta)) return pdf_at_theta * lhd @@ -542,10 +836,15 @@ def _calculate_weight(self, theta): Calculates the weight for the given parameter using accepted_parameters, accepted_cov_mat - :type theta: np.array - :param theta: 1xp matrix containing model parameters, where p is the number of 
parameters - :rtype: float - :return: the new weight for theta + Parameters + ---------- + theta: np.ndarray + 1xp matrix containing the model parameters, where p is the number of parameters + + Returns + ------- + float + The new weight for theta """ if self.accepted_weights_bds is None: @@ -559,11 +858,12 @@ def _calculate_weight(self, theta): [self.accepted_parameters_bds.value()[i, :], self.accepted_cov_mat_bds.value()]) pdf_value = self.kernel.pdf(theta) denominator += self.accepted_weights_bds.value()[i, 0] * pdf_value - return 1.0 * prior_prob / denominator + return 1.0 * prior_prob / denominator -class SABC: - """This base class implements a modified version of Simulated Annealing Approximate Bayesian Computation (SABC) of [1] when the prior is non-informative. +class SABC(BaseAnnealing, InferenceMethod): + """ + This base class implements a modified version of Simulated Annealing Approximate Bayesian Computation (SABC) of [1] when the prior is non-informative. [1] C. Albert, H. R. Kuensch and A. Scheidegger. A Simulated Annealing Approach to Approximate Bayes Computations. Statistics and Computing, (2014). @@ -571,17 +871,32 @@ class SABC: Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance measure used to compare simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ + model = None + distance = None + kernel = None + rng = None + + n_samples = None + n_samples_per_param = None + epsilon = None + + observations_bds = None + accepted_parameters_bds = None + accepted_cov_mat_bds = None + smooth_distances_bds = None + all_distances_bds = None + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance @@ -591,16 +906,12 @@ def __init__(self, model, distance, kernel, backend, seed=None): # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task + self.observations_bds = None self.accepted_parameters_bds = None self.accepted_cov_mat_bds = None self.smooth_distances_bds = None self.all_distances_bds = None - def __getstate__(self): - #tells cloudpickle to not include the backend when pickling - state = self.__dict__.copy() - del state['backend'] - return state def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_param = 1, beta = 2, delta = 0.2, v = 0.3, ar_cutoff = 0.5, resample = None, n_update = None, adaptcov = 1, full_output=0): """Samples from the posterior distribution of the model parameter given the observed @@ -641,10 +952,12 @@ def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_ abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. 
""" + self.observations_bds = self.backend.broadcast(observations) self.epsilon = epsilon self.n_samples = n_samples self.n_samples_per_param = n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) @@ -674,6 +987,7 @@ def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_ sample_array = np.ones(shape=(steps,)) sample_array[0] = n_samples sample_array[1:] = n_update + ## Acceptance counter to determine the resampling step accept = 0 samples_until = 0 @@ -732,7 +1046,7 @@ def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_ 0.0001 * (np.var(np.transpose(accepted_parameters))) * np.eye( accepted_parameters.shape[1]) - ## 4: Show progress and if acceptance rate smaller than a value break the iteration + # 4: Show progress and if acceptance rate smaller than a value break the iteration # print("INFO: Saving intermediate configuration to output journal.") if full_output == 1: @@ -783,6 +1097,19 @@ def _smoother_distance(self, distance, old_distance): [1] C. Albert, H. R. Kuensch and A. Scheidegger. A Simulated Annealing Approach to Approximate Bayes Computations. Statistics and Computing 0960-3174 (2014). + + Parameters + ---------- + distance: numpy.ndarray + Current distance between the simulated and observed data + old_distance: numpy.ndarray + Last distance between the simulated and observed data + + Returns + ------- + numpy.ndarray + Smoothed distance + """ smoothed_distance = np.zeros(shape=(len(distance),)) @@ -796,6 +1123,20 @@ def _smoother_distance(self, distance, old_distance): return smoothed_distance def _average_redefined_distance(self, distance, epsilon): + """ + Function to calculate the weighted average of the distance + Parameters + ---------- + distance: numpy.ndarray + Distance between simulated and observed data set + epsilon: float + threshold + + Returns + ------- + numpy.ndarray + Weighted average of the distance + """ if epsilon == 0: U = 0 else: @@ -827,33 +1168,23 @@ def destroy(bc): if not all_distances is None: self.all_distances_bds = self.backend.broadcast(all_distances) - def _smoother_distance_remote(self, distance, old_distance): - """Smooths the distance using the Equation 14 of [1]. - - [1] C. Albert, H. R. Kuensch and A. Scheidegger. A Simulated Annealing Approach to - Approximate Bayes Computations. Statistics and Computing 0960-3174 (2014). - """ - - smoothed_distance = np.zeros(shape=(len(distance),)) - - for ind in range(0, len(distance)): - if distance[ind] < np.min(old_distance): - smoothed_distance[ind] = (distance[ind] / np.min(old_distance)) / len(old_distance) - else: - smoothed_distance[ind] = np.mean(np.array(old_distance) < distance[ind]) - - return smoothed_distance - # define helper functions for map step def _accept_parameter(self, seed): """ Samples a single model parameter and simulate from it until accepted with probabilty exp[-rho(x,y)/epsilon]. - :type seed: int - :rtype: np.array - :return: accepted parameter + Parameters + ---------- + seed: integer + Initial seed for the random number generator. 
+ + Returns + ------- + numpy.ndarray + accepted parameter """ + rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) @@ -889,7 +1220,7 @@ def _accept_parameter(self, seed): break y_sim = self.model.simulate(self.n_samples_per_param) distance = self.distance.distance(self.observations_bds.value(), y_sim) - smooth_distance = self._smoother_distance_remote([distance], self.all_distances_bds.value()) + smooth_distance = self._smoother_distance([distance], self.all_distances_bds.value()) ## Calculate acceptance probability: ratio_prior_prob = self.model.prior.pdf(new_theta) / self.model.prior.pdf( @@ -905,8 +1236,7 @@ def _accept_parameter(self, seed): return (new_theta, distance, all_parameters, all_distances, index, acceptance) - -class ABCsubsim: +class ABCsubsim(BaseAnnealing, InferenceMethod): """This base class implements Approximate Bayesian Computation by subset simulation (ABCsubsim) algorithm of [1]. [1] M. Chiachio, J. L. Beck, J. Chiachio, and G. Rus., Approximate Bayesian computation by subset @@ -915,17 +1245,31 @@ class ABCsubsim: Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance used to compare the simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ + model = None + distance = None + kernel = None + rng = None + anneal_parameter = None + + n_samples = None + n_samples_per_param = None + chain_length = None + + observations_bds = None + accepted_parameters_bds = None + accepted_cov_mat_bds = None + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance @@ -937,14 +1281,10 @@ def __init__(self, model, distance, kernel, backend, seed=None): # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task + self.observations_bds = None self.accepted_parameters_bds = None self.accepted_cov_mat_bds = None - def __getstate__(self): - #tells cloudpickle to not include the backend when pickling - state = self.__dict__.copy() - del state['backend'] - return state def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, chain_length = 10, ap_change_cutoff = 10, full_output=0): """Samples from the posterior distribution of the model parameter given the observed @@ -968,6 +1308,7 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. 
""" + self.observations_bds = self.backend.broadcast(observations) self.chain_length = chain_length self.n_samples = n_samples @@ -998,8 +1339,8 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 dtype=np.uint32) index_arr = np.linspace(0, n_samples / temp_chain_length - 1, n_samples / temp_chain_length).astype( int).reshape(int(n_samples / temp_chain_length), ) - seed_index_arr = np.column_stack((seed_arr, index_arr)) - seed_index_pds = self.backend.parallelize(seed_index_arr) + seed_and_index_arr = np.column_stack((seed_arr, index_arr)) + seed_and_index_pds = self.backend.parallelize(seed_and_index_arr) # 0: update remotely required variables # print("INFO: Broadcasting parameters.") @@ -1007,7 +1348,7 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 # 1: Calculate parameters # print("INFO: Initial accepted parameter parameters") - params_and_dists_pds = self.backend.map(self._accept_parameter, seed_index_pds) + params_and_dists_pds = self.backend.map(self._accept_parameter, seed_and_index_pds) params_and_dists = self.backend.collect(params_and_dists_pds) new_parameters, new_distances = [list(t) for t in zip(*params_and_dists)] accepted_parameters = np.concatenate(new_parameters) @@ -1035,10 +1376,10 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=10, dtype=np.uint32) index_arr = np.linspace(0, 10 - 1, 10).astype(int).reshape(10, ) - seed_index_arr = np.column_stack((seed_arr, index_arr)) - seed_index_pds = self.backend.parallelize(seed_index_arr) + seed_and_index_arr = np.column_stack((seed_arr, index_arr)) + seed_and_index_pds = self.backend.parallelize(seed_and_index_arr) - cov_mat_index_pds = self.backend.map(self._update_cov_mat, seed_index_pds) + cov_mat_index_pds = self.backend.map(self._update_cov_mat, seed_and_index_pds) cov_mat_index = self.backend.collect(cov_mat_index_pds) cov_mat, T, accept_index = [list(t) for t in zip(*cov_mat_index)] @@ -1081,18 +1422,26 @@ def destroy(bc): self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) # define helper functions for map step - def _accept_parameter(self, seed_index): + def _accept_parameter(self, seed_and_index): """ Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is - smaller than eplison. + smaller than epsilon. - :type seed: int - :rtype: np.array - :return: accepted parameter + Parameters + ---------- + seed: numpy.ndarray + 2 dimensional array. The first entry defines the initial seed of therandom number generator. + The second entry defines the index in the data set. + + Returns + ------- + numpy.ndarray + accepted parameter """ - seed = seed_index[0] - index = seed_index[1] + + seed = seed_and_index[0] + index = seed_and_index[1] rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) @@ -1146,16 +1495,36 @@ def _accept_parameter(self, seed_index): return (result_theta, result_distance) def _update_cov_mat(self, seed_t): + """ + Updates the covariance matrix. + + Parameters + ---------- + seed_t: numpy.ndarray + 2 dimensional array. The first entry defines the initial seed of the random number generator. + The second entry defines the way in which the accepted covariance matrix is transformed. 
+ + Returns + ------- + numpy.ndarray + accepted covariance matrix + """ seed = seed_t[0] t = seed_t[1] rng = np.random.RandomState(seed) + self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) + acceptance = 0 + accepted_cov_mat_transformed = self.accepted_cov_mat_bds.value() * pow(2.0, -2.0 * t) + theta = self.accepted_parameters_bds.value()[0] + self.model.set_parameters(theta) + for ind in range(0, self.chain_length): while True: self.kernel.set_parameters([theta, accepted_cov_mat_transformed]) @@ -1184,8 +1553,7 @@ def _update_cov_mat(self, seed_t): else: return (accepted_cov_mat_transformed, t, 0) - -class RSMCABC: +class RSMCABC(BaseAdaptivePopulationMC, InferenceMethod): """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of Drovandi and Pettitt [1]. @@ -1195,17 +1563,34 @@ class RSMCABC: Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance measure used to compare simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ + model = None + distance = None + kernel = None + + R = None + rng = None + + n_samples = None + n_samples_per_param = None + alpha = None + + observations_bds = None + accepted_parameters_bds = None + accepted_dist_bds = None + accepted_cov_mat_bds = None + + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance @@ -1217,15 +1602,11 @@ def __init__(self, model, distance, kernel, backend, seed=None): # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task + self.observations_bds = None self.accepted_parameters_bds = None self.accepted_dist_bds = None self.accepted_cov_mat_bds = None - def __getstate__(self): - #tells cloudpickle to not include the backend when pickling - state = self.__dict__.copy() - del state['backend'] - return state def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, alpha = 0.1, epsilon_init = 100, epsilon_final = 0.1, const = 1, covFactor = 2.0, full_output=0): """Samples from the posterior distribution of the model parameter given the observed @@ -1260,6 +1641,7 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ + self.observations_bds = self.backend.broadcast(observations) self.alpha = alpha self.n_samples = n_samples @@ -1370,11 +1752,17 @@ def _accept_parameter(self, seed): """ Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is - smaller than eplison. + smaller than epsilon. + + Parameters + ---------- + seed: integer + Initial seed for the random number generator. 
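The pow(2.0, -2.0 * t) factor inside _update_cov_mat scales the proposal covariance so that each increment of t halves the proposal standard deviation, which is how the algorithm probes for a workable acceptance rate. Numerically:

    import numpy as np

    cov = np.array([[1.0, 0.2],
                    [0.2, 0.5]])
    for t in range(4):
        scaled = cov * 2.0 ** (-2.0 * t)
        # proposal standard deviations halve at every step
        print(t, np.sqrt(np.diag(scaled)))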
- :type seed: int - :rtype: np.array - :return: accepted parameter + Returns + ------- + numpy.ndarray + accepted parameter """ rng = np.random.RandomState(seed) @@ -1416,8 +1804,7 @@ def _accept_parameter(self, seed): return (self.model.get_parameters(), distance, index_accept) - -class APMCABC: +class APMCABC(BaseAdaptivePopulationMC, InferenceMethod): """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of M. Lenormand et al. [1]. @@ -1427,17 +1814,34 @@ class APMCABC: Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance measure used to compare simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ + model = None + distance = None + kernel = None + + epsilon = None + rng = None + + n_samples = None + n_samples_per_param = None + alpha = None + + observations_bds = None + accepted_parameters_bds = None + accepted_weights_bds = None + accepted_dist = None + accepted_cov_mat_bds = None + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance @@ -1449,16 +1853,12 @@ def __init__(self, model, distance, kernel, backend, seed=None): # these are usually big tables, so we broadcast them to have them once # per executor instead of once per task - self.alpha_accepted_parameters_bds = None - self.alpha_accepted_weights_bds = None - self.alpha_accepted_dist = None + self.observations_bds = None + self.accepted_parameters_bds = None + self.accepted_weights_bds = None + self.accepted_dist = None self.accepted_cov_mat_bds = None - def __getstate__(self): - #tells cloudpickle to not include the backend when pickling - state = self.__dict__.copy() - del state['backend'] - return state def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, alpha = 0.9, acceptance_cutoff = 0.2, covFactor = 2.0, full_output=0): """Samples from the posterior distribution of the model parameter given the observed @@ -1489,12 +1889,12 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. 
""" + self.observations_bds = self.backend.broadcast(observations) self.alpha = alpha self.n_samples = n_samples self.n_samples_per_param = n_samples_per_param - journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) @@ -1582,19 +1982,19 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 return journal - def _update_broadcasts(self, alpha_accepted_parameters, alpha_accepted_weights, alpha_accepted_dist, + def _update_broadcasts(self, accepted_parameters, accepted_weights, accepted_dist, accepted_cov_mat): def destroy(bc): if bc != None: bc.unpersist # bc.destroy - if not alpha_accepted_parameters is None: - self.alpha_accepted_parameters_bds = self.backend.broadcast(alpha_accepted_parameters) - if not alpha_accepted_weights is None: - self.alpha_accepted_weights_bds = self.backend.broadcast(alpha_accepted_weights) - if not alpha_accepted_dist is None: - self.alpha_accepted_dist_bds = self.backend.broadcast(alpha_accepted_dist) + if not accepted_parameters is None: + self.accepted_parameters_bds = self.backend.broadcast(accepted_parameters) + if not accepted_weights is None: + self.accepted_weights_bds = self.backend.broadcast(accepted_weights) + if not accepted_dist is None: + self.accepted_dist_bds = self.backend.broadcast(accepted_dist) if not accepted_cov_mat is None: self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) @@ -1603,26 +2003,32 @@ def _accept_parameter(self, seed): """ Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is - smaller than eplison. + smaller than epsilon. - :type seed: int - :rtype: np.array - :return: accepted parameter + Parameters + ---------- + seed: integer + Initial seed for the random number generator. 
+ + Returns + ------- + numpy.ndarray + accepted parameter """ rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - if self.alpha_accepted_parameters_bds == None: + if self.accepted_parameters_bds == None: self.model.sample_from_prior() y_sim = self.model.simulate(self.n_samples_per_param) dist = self.distance.distance(self.observations_bds.value(), y_sim) weight = 1.0 else: - index = rng.choice(len(self.alpha_accepted_weights_bds.value()), size=1, - p=self.alpha_accepted_weights_bds.value().reshape(-1)) - theta = self.alpha_accepted_parameters_bds.value()[index[0]] + index = rng.choice(len(self.accepted_weights_bds.value()), size=1, + p=self.accepted_weights_bds.value().reshape(-1)) + theta = self.accepted_parameters_bds.value()[index[0]] # trucate the normal to the bounds of parameter space of the model # truncating the normal like this is fine: https://arxiv.org/pdf/0907.4010v1.pdf while True: @@ -1637,17 +2043,16 @@ def _accept_parameter(self, seed): prior_prob = self.model.prior.pdf(new_theta) denominator = 0.0 - for i in range(0, len(self.alpha_accepted_weights_bds.value())): + for i in range(0, len(self.accepted_weights_bds.value())): self.kernel.set_parameters( - [self.alpha_accepted_parameters_bds.value()[i, :], self.accepted_cov_mat_bds.value()]) + [self.accepted_parameters_bds.value()[i, :], self.accepted_cov_mat_bds.value()]) pdf_value = self.kernel.pdf(new_theta) - denominator += self.alpha_accepted_weights_bds.value()[i, 0] * pdf_value + denominator += self.accepted_weights_bds.value()[i, 0] * pdf_value weight = 1.0 * prior_prob / denominator return (self.model.get_parameters(), dist, weight) - -class SMCABC: +class SMCABC(BaseAdaptivePopulationMC, InferenceMethod): """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of Del Moral et al. [1]. @@ -1657,17 +2062,33 @@ class SMCABC: Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance measure used to compare simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. 
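The while True loop above realizes the truncation mentioned in the comment by plain rejection: redraw from the Gaussian kernel until the proposal has nonzero prior density, which is equivalent to sampling the kernel truncated to the prior's support. A stand-alone sketch with an invented box prior:

    import numpy as np

    def perturb_within_support(theta, cov, prior_pdf, rng):
        while True:
            proposal = rng.multivariate_normal(theta, cov)
            if prior_pdf(proposal) != 0:   # inside the support: keep it
                return proposal

    rng = np.random.RandomState(0)
    box_prior = lambda th: float(np.all(np.abs(th) <= 1.0))  # uniform on [-1,1]^2
    print(perturb_within_support(np.zeros(2), 0.25 * np.eye(2), box_prior, rng))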
""" + model = None + distance = None + kernel = None + + epsilon = None + rng = None + + n_samples = None + n_samples_per_param = None + + observations_bds = None + accepted_parameters_bds = None + accepted_weights_bds = None + accepted_cov_mat_bds = None + accepted_y_sim_bds = None + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance @@ -1685,11 +2106,6 @@ def __init__(self, model, distance, kernel, backend, seed=None): self.accepted_cov_mat_bds = None self.accepted_y_sim_bds = None - def __getstate__(self): - #tells cloudpickle to not include the backend when pickling - state = self.__dict__.copy() - del state['backend'] - return state def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, epsilon_final = 0.1, alpha = 0.95, covFactor = 2, resample = None, full_output=0): """Samples from the posterior distribution of the model parameter given the observed @@ -1721,9 +2137,11 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ + self.observations_bds= self.backend.broadcast(observations) self.n_samples = n_samples self.n_samples_per_param = n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) @@ -1740,7 +2158,7 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 if resample == None: resample = n_samples * 0.5 - # Define epsilon_init + # Define epsilon_init epsilon = [10000] # main SMC ABC algorithm @@ -1796,17 +2214,16 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 # print("DEBUG: Iteration " + str(aStep) + " of SMCABC algorithm.") seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_samples, dtype=np.uint32) index_arr = np.arange(n_samples) - seed_index_arr = np.column_stack((seed_arr, index_arr)) - seed_index_pds = self.backend.parallelize(seed_index_arr) + seed_and_index_arr = np.column_stack((seed_arr, index_arr)) + seed_and_index_pds = self.backend.parallelize(seed_and_index_arr) - # update remotely required variables # print("INFO: Broadcasting parameters.") self.epsilon = epsilon self._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat, accepted_y_sim) # calculate resample parameters # print("INFO: Resampling parameters") - params_and_ysim_pds = self.backend.map(self._accept_parameter, seed_index_pds) + params_and_ysim_pds = self.backend.map(self._accept_parameter, seed_and_index_pds) params_and_ysim = self.backend.collect(params_and_ysim_pds) new_parameters, new_y_sim = [list(t) for t in zip(*params_and_ysim)] new_parameters = np.array(new_parameters) @@ -1831,6 +2248,30 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 def _compute_epsilon(self, epsilon_new, epsilon, observations, accepted_y_sim, accepted_weights, n_samples, n_samples_per_param, alpha): + """ + Parameters + ---------- + epsilon_new: float + New value for epsilon. + epsilon: float + Current threshold. + observations: numpy.ndarray + Observed data. + accepted_y_sim: numpy.ndarray + Accepted simulated data. + accepted_weights: numpy.ndarray + Accepted weights. + n_samples: integer + Number of samples to generate. + n_samples_per_param: integer + Number of data points in each simulated data set. 
+ alpha: float + + Returns + ------- + float + Newly computed value for threshold. + """ RHS = alpha * pow(sum(pow(accepted_weights, 2)), -1) LHS = np.zeros(shape=(n_samples), ) @@ -1879,18 +2320,26 @@ def destroy(bc): # define helper functions for map step - def _accept_parameter(self, seed_index): + def _accept_parameter(self, seed_and_index): """ Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is - smaller than eplison. + smaller than epsilon. + + Parameters + ---------- + seed_and_index: numpy.ndarray + 2 dimensional array. The first entry specifies the initial seed for the random number generator. + The second entry defines the index in the data set. - :type seed: int - :rtype: np.array - :return: accepted parameter + Returns + ------- + Tuple + The first entry of the tuple is the accepted parameters. The second entry is the simulated data set. """ - seed = seed_index[0] - index = seed_index[1] + + seed = seed_and_index[0] + index = seed_and_index[1] rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) @@ -1936,5 +2385,3 @@ def _accept_parameter(self, seed_index): y_sim = self.accepted_y_sim_bds.value()[index] return (self.model.get_parameters(), y_sim) - - diff --git a/abcpy/models.py b/abcpy/models.py index 9cd6179b..6ec34593 100644 --- a/abcpy/models.py +++ b/abcpy/models.py @@ -28,7 +28,7 @@ def __init__(self, prior, seed = None): Optional initial seed for the random number generator that can be used in the model. The default value is generated randomly. """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -54,7 +54,7 @@ def set_parameters(self, theta): TRUE if model accepts the provided parameters, FALSE otherwise """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -62,7 +62,7 @@ def sample_from_prior(): """To be overwritten by any sub-class: should resample the model parameters from the prior distribution. """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -81,7 +81,7 @@ def simulate(self, k): An array containing k realizations of the model """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -94,7 +94,7 @@ def get_parameters(self): An array containing the p parameters of the model """ - raise NotImplemented + raise NotImplementedError class Gaussian(Model): @@ -167,13 +167,10 @@ def simulate(self, k): class Student_t(Model): """This class implements the Student_t distribution with unknown mean :math:`\mu` and unknown degrees of freedom. - """ - def __init__(self, prior, mu=None, df=None, seed=None): - """ Parameters ---------- prior: abcpy.distributions.Distribution - Prior distribution + Prior distribution mu: float, optional Mean of the Stundent_t distribution. If the parameters is omitted, sampled from the prior. @@ -181,8 +178,10 @@ def __init__(self, prior, mu=None, df=None, seed=None): The degrees of freedom of the Student_t distribution. If the parameters is omitted, sampled from the prior. seed: int, optional - Initial seed. The default value is generated randomly. - """ + Initial seed. The default value is generated randomly. 
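The line RHS = alpha * pow(sum(pow(accepted_weights, 2)), -1) above is an effective sample size (ESS) style target in the spirit of Del Moral et al.: the new threshold is chosen so that a quality measure of the reweighted particles falls to alpha times its current value. The sketch below solves such an equation with scipy.optimize.bisect under a deliberately simplified indicator reweighting; it illustrates the root-finding idea and is not ABCpy's exact objective:

    import numpy as np
    from scipy.optimize import bisect

    def ess(weights):
        w = weights / weights.sum()
        return 1.0 / np.sum(w ** 2)

    def next_epsilon(distances, weights, epsilon, alpha):
        target = alpha * ess(weights)
        def gap(eps_new):
            w_new = weights * (distances <= eps_new)   # simplified reweighting
            if w_new.sum() == 0:
                return -target
            return ess(w_new) - target
        return bisect(gap, 0.0, epsilon)               # sign change is bracketed

    rng = np.random.RandomState(3)
    distances = rng.uniform(0.0, 1.0, size=500)
    weights = np.ones(500)
    print(next_epsilon(distances, weights, epsilon=1.0, alpha=0.9))   # about 0.9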
+ """ + + def __init__(self, prior, mu=None, df=None, seed=None): assert(not (mu == None) != (df == None)) self.prior = prior if mu == None and df == None: @@ -224,19 +223,20 @@ class MixtureNormal(Model): """This class implements the Mixture of multivariate normal ditribution with unknown mean \ :math:`\mu` described as following, :math:`x|\mu \sim 0.5\mathcal{N}(\mu,I_p)+0.5\mathcal{N}(\mu,0.01I_p)`, where :math:`x=(x_1,x_2,\ldots,x_p)` is the - dataset simulated from the model and mean is :math:`\mu=(\mu_1,\mu_2,\ldots,\mu_p)`. """ + dataset simulated from the model and mean is :math:`\mu=(\mu_1,\mu_2,\ldots,\mu_p)`. + + Parameters + ---------- + prior: abcpy.distributions.Distribution + Prior distribution + mu: numpy.ndarray or list, optional + Mean of the mixture normal. If the parameter is omitted, sampled + from the prior. + seed: int, optional + Initial seed. The default value is generated randomly. + """ def __init__(self, prior, mu, seed = None): - """ - Parameters - ---------- - prior: abcpy.distributions.Distribution - Prior distribution - mu: numpy.ndarray or list, optional - Mean of the mixture normal. If the parameter is omitted, sampled - from the prior. - seed: int, optional - Initial seed. The default value is generated randomly. - """ + # Assign prior if not isinstance(prior, Distribution): raise TypeError('Prior is not of our defined Prior class type') @@ -291,26 +291,25 @@ class StochLorenz95(Model): [2] Lorenz, E. (1995). Predictability: a problem partly solved. In Proceedings of the Seminar on Predictability, volume 1, pages 1–18. European Center on Medium Range - Weather Forecasting, Europe + Weather Forecasting, Europe + + Parameters + ---------- + prior: abcpy.distributions.Distribution + Prior distribution + theta: list or numpy.ndarray, optional + Closure parameters. If the parameter is omitted, sampled + from the prior. + initial_state: numpy.ndarray, optional + Initial state value of the time-series, The default value is None, which assumes a previously computed + value from a full Lorenz model as the Initial value. + n_timestep: int, optional + Number of timesteps between [0,4], where 4 corresponds to 20 days. The default value is 160. + seed: int, optional + Initial seed. The default value is generated randomly. """ def __init__(self, prior, theta, initial_state = None, n_timestep = 160, seed = None): - """ - Parameters - ---------- - prior: abcpy.distributions.Distribution - Prior distribution - theta: list or numpy.ndarray, optional - Closure parameters. If the parameter is omitted, sampled - from the prior. - initial_state: numpy.ndarray, optional - Initial state value of the time-series, The default value is None, which assumes a previously computed - value from a full Lorenz model as the Initial value. - n_timestep: int, optional - Number of timesteps between [0,4], where 4 corresponds to 20 days. The default value is 160. - seed: int, optional - Initial seed. The default value is generated randomly. - """ # Assign prior if not isinstance(prior, Distribution): @@ -464,29 +463,29 @@ def _rk4ode(self, ode, timespan, timeseries_initial, parameter): timeseries[:,ind+1] = timeseries_initial # Return the solved timeseries at the values in timespan return timeseries + class Ricker(Model): """Ecological model that describes the observed size of animal population over time described in [1]. [1] S. N. Wood. Statistical inference for noisy nonlinear ecological dynamic systems. Nature, 466(7310):1102–1104, Aug. 2010. 
+ + Parameters + ---------- + prior: abcpy.distributions.Distribution + Prior distribution + theta: list or numpy.ndarray, optional + The parameter is a vector consisting of three numbers \ + :math:`\log r` (real number), :math:`\sigma` (positive real number, > 0), :math:`\phi` (positive real number > 0) + If the parameter is omitted, sampled from the prior. + n_timestep: int, optional + Number of timesteps. The default value is 100. + seed: int, optional + Initial seed. The default value is generated randomly. """ def __init__(self, prior, theta=None, n_timestep = 100, seed = None): - """ - Parameters - ---------- - prior: abcpy.distributions.Distribution - Prior distribution - theta: list or numpy.ndarray, optional - The parameter is a vector consisting of three numbers \ - :math:`\log r` (real number), :math:`\sigma` (positive real number, > 0), :math:`\phi` (positive real number > 0) - If the parameter is ommitted, sampled from the prior. - n_timestep: int, optional - Number of timesteps. The default value is 100. - seed: int, optional - Initial seed. The default value is generated randomly. - """ # Assign prior if not isinstance(prior, Distribution): raise TypeError('Prior is not of our defined Prior class type') diff --git a/abcpy/modelselections.py b/abcpy/modelselections.py index df0e1346..5fc61e9e 100644 --- a/abcpy/modelselections.py +++ b/abcpy/modelselections.py @@ -38,7 +38,12 @@ def __init__(self, model_array, statistics_calc, backend, seed = None): self.reference_table_calculated = None - raise NotImplemented + raise NotImplementedError + + def __getstate__(self): + state = self.__dict__.copy() + del state['backend'] + return state @@ -65,7 +70,7 @@ def select_model(self, observations, n_samples = 1000, n_samples_per_param = 100 """ - raise NotImplemented + raise NotImplementedError @abstractmethod def posterior_probability(self, observations): @@ -83,7 +88,7 @@ def posterior_probability(self, observations): A vector containing the approximate posterior probability of the model chosen.
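Before the RandomForest hunks that follow, the reference-table idea is compact enough to state in a few lines: simulate summary statistics under each candidate model, train a classifier on them, and classify the observed statistics. A toy sketch with scikit-learn on made-up data, assuming a random-forest classifier as the class name suggests (these hunks do not show the selector's actual classifier):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.RandomState(0)
    stats_m0 = rng.normal(0.0, 1.0, size=(500, 2))   # summaries under model 0
    stats_m1 = rng.normal(1.0, 1.0, size=(500, 2))   # summaries under model 1
    X = np.vstack([stats_m0, stats_m1])              # the reference table
    y = np.repeat([0, 1], 500)                       # which model produced each row

    clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
    s_obs = np.array([[0.9, 1.1]])                   # observed summary statistics
    print(clf.predict(s_obs), clf.predict_proba(s_obs))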
""" - raise NotImplemented + raise NotImplementedError class RandomForest(ModelSelections): """ @@ -109,11 +114,16 @@ def __init__(self, model_array, statistics_calc, backend, N_tree = 100, n_try_fr self.statistics_calc = statistics_calc self.backend = backend self.rng = np.random.RandomState(seed) - self.seed = seed + self.seed = seed + + self.n_samples_per_param = None + self.reference_table_calculated = 0 self.N_tree = N_tree self.n_try_fraction = n_try_fraction + self.observations_bds = None + def select_model(self, observations, n_samples = 1000, n_samples_per_param = 1): """ Parameters @@ -130,15 +140,16 @@ def select_model(self, observations, n_samples = 1000, n_samples_per_param = 1): A model which are of type abcpy.models.Model """ + self.n_samples_per_param = n_samples_per_param + self.observations_bds = self.backend.broadcast(observations) + # Creation of reference table if self.reference_table_calculated is 0: - rc = _RemoteContextForReferenceTable(self.backend, self.model_array, self.statistics_calc, observations, n_samples_per_param) - - # Simulating the data, distance and statistics + # Simulating the data, distance and statistics seed_arr = self.rng.randint(1, n_samples*n_samples, size=n_samples, dtype=np.int32) seed_pds = self.backend.parallelize(seed_arr) - model_data_pds = self.backend.map(rc._simulate_model_data, seed_pds) + model_data_pds = self.backend.map(self._simulate_model_data, seed_pds) model_data = self.backend.collect(model_data_pds) models, data, statistics = [list(t) for t in zip(*model_data)] self.reference_table_models = models @@ -160,6 +171,36 @@ def select_model(self, observations, n_samples = 1000, n_samples_per_param = 1): return(self.model_array[int(classifier.predict(self.statistics_calc.statistics(observations)))]) + def _simulate_model_data(self, seed): + """ + Samples a single model parameter and simulates from it until + distance between simulated outcome and the observation is + smaller than eplison. 
+
+        Parameters
+        ----------
+        seed: int
+            Value of the seed used to reseed the random number generator
+        Returns
+        -------
+        tuple
+            The sampled model, the simulated data set and the corresponding summary statistics
+        """
+        # reseed random number generator
+        rng = np.random.RandomState(seed)
+        len_model_array = len(self.model_array)
+        model = self.model_array[int(sum(np.linspace(0, len_model_array - 1, len_model_array) \
+                                         * rng.multinomial(1, (1 / len_model_array) * np.ones(len_model_array))))]
+        # reseed prior
+        model.prior.reseed(seed)
+        # sample from prior
+        model.sample_from_prior()
+        # sample data set, extract statistics and compute distance from y_obs
+        y_sim = model.simulate(self.n_samples_per_param)
+        statistics = self.statistics_calc.statistics(y_sim)
+
+        return (model, y_sim, statistics)
+
     def posterior_probability(self, observations, n_samples = 1000, n_samples_per_param = 1):
         """
         Parameters
@@ -175,16 +216,16 @@ def posterior_probability(self, observations, n_samples = 1000, n_samples_per_pa
         abcpy.models.Model
             A model which is of type abcpy.models.Model
 
-        """
+        """
+        self.n_samples_per_param = n_samples_per_param
+        self.observations_bds = self.backend.broadcast(observations)
 
         # Creation of reference table
         if self.reference_table_calculated is 0:
-            rc = _RemoteContextForReferenceTable(self.backend, self.model_array, self.statistics_calc, observations, n_samples_per_param)
-
-            # Simulating the data, distance and statistics
+            # Simulating the data, distance and statistics
             seed_arr = self.rng.randint(1, n_samples*n_samples, size=n_samples, dtype=np.int32)
             seed_pds = self.backend.parallelize(seed_arr)
 
-            model_data_pds = self.backend.map(rc._simulate_model_data, seed_pds)
+            model_data_pds = self.backend.map(self._simulate_model_data, seed_pds)
             model_data = self.backend.collect(model_data_pds)
             models, data, statistics = [list(t) for t in zip(*model_data)]
             self.reference_table_models = models
@@ -214,47 +255,4 @@ def posterior_probability(self, observations, n_samples = 1000, n_samples_per_pa
         regressor.fit(self.reference_table_statistics,pred_error)
 
         return(1-regressor.predict(self.statistics_calc.statistics(observations)))
-
-class _RemoteContextForReferenceTable:
-    """
-    Contains everything that is sent over the network like broadcast vars and map functions
-    """
-
-    def __init__(self, backend, model_array, stat_calc, observations, n_samples_per_param):
-        self.model_array = model_array
-        self.stat_calc = stat_calc
-        self.n_samples_per_param = n_samples_per_param
-
-        # these are usually big tables, so we broadcast them to have them once
-        # per executor instead of once per task
-        self.observations_bds = backend.broadcast(observations)
-
-    def _simulate_model_data(self, seed):
-        """
-        Samples a single model parameter and simulates from it until
-        distance between simulated outcome and the observation is
-        smaller than eplison.
- - Parameters - ---------- - seed: int - value of a seed to be used for reseeding - Returns - ------- - np.array - accepted parameter - """ - #reseed random number genearator - rng = np.random.RandomState(seed) - len_model_array = len(self.model_array) - model = self.model_array[int(sum(np.linspace(0,len_model_array-1,len_model_array)\ - *rng.multinomial(1,(1/len_model_array)*np.ones(len_model_array))))] - #reseed prior - model.prior.reseed(seed) - #sample from prior - model.sample_from_prior() - #sample data set, extract statistics and compute distance from y_obs - y_sim = model.simulate(self.n_samples_per_param) - statistics = self.stat_calc.statistics(y_sim) - - return (model, y_sim, statistics) \ No newline at end of file + diff --git a/abcpy/statistics.py b/abcpy/statistics.py index 97b80592..d2e65c38 100644 --- a/abcpy/statistics.py +++ b/abcpy/statistics.py @@ -27,7 +27,7 @@ def __init__(self, degree = 2, cross = True): Defines whether to include the cross-product terms. The default value is TRUE, meaning the cross product term is included. """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -47,7 +47,7 @@ def statistics(self, data): """ - raise NotImplemented + raise NotImplementedError def _polynomial_expansion(self, summary_statistics): """Helper function that does the polynomial expansion and includes cross-product diff --git a/abcpy/summaryselections.py b/abcpy/summaryselections.py index f6f9901a..12f04ae5 100644 --- a/abcpy/summaryselections.py +++ b/abcpy/summaryselections.py @@ -28,12 +28,17 @@ def __init__(self, model, statistics_calc, backend, n_samples = 1000, seed = Non seed: integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ - raise NotImplemented + raise NotImplementedError + + def __getstate__(self): + state = self.__dict__.copy() + del state['backend'] + return state @abstractmethod def transformation(self, statistics): - raise NotImplemented + raise NotImplementedError class Semiautomatic(Summaryselections): @@ -49,14 +54,12 @@ def __init__(self, model, statistics_calc, backend, n_samples = 1000, seed = Non self.backend = backend self.rng = np.random.RandomState(seed) self.model.prior.reseed(self.rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - - rc = _RemoteContextSemiautomatic(self.backend, self.model, self.statistics_calc) - + # main algorithm seed_arr = self.rng.randint(1, n_samples*n_samples, size=n_samples, dtype=np.int32) seed_pds = self.backend.parallelize(seed_arr) - sample_parameters_statistics_pds = self.backend.map(rc._sample_parameter_statistics, seed_pds) + sample_parameters_statistics_pds = self.backend.map(self._sample_parameter_statistics, seed_pds) sample_parameters_and_statistics = self.backend.collect(sample_parameters_statistics_pds) sample_parameters, sample_statistics = [list(t) for t in zip(*sample_parameters_and_statistics)] sample_parameters = np.array(sample_parameters) @@ -72,26 +75,15 @@ def transformation(self, statistics): if not statistics.shape[1] == self.coefficients_learnt.shape[1]: raise ValueError('Mismatch in dimension of summary statistics') return np.dot(statistics,np.transpose(self.coefficients_learnt)) - - -class _RemoteContextSemiautomatic: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, stat_calc): - self.model = model - self.stat_calc = stat_calc - def _sample_parameter_statistics(self, seed): """ Samples a single model parameter and 
simulates from it until distance
         between simulated outcome and the observation is
         smaller than epsilon.
-
+
         Parameters
-        ----------
+        ----------
         seed: int
             value of a seed to be used for reseeding
         Returns
@@ -101,8 +93,11 @@
         """
         self.model.prior.reseed(seed)
         self.model.sample_from_prior()
+
+        return (self.model.get_parameters(), self.statistics_calc.statistics(self.model.simulate(1)))
+
 
-        return (self.model.get_parameters(), self.stat_calc.statistics(self.model.simulate(1)))
+
diff --git a/tests/pickle_tests.py b/tests/pickle_tests.py
index 8c777758..63c4ef5c 100644
--- a/tests/pickle_tests.py
+++ b/tests/pickle_tests.py
@@ -1,6 +1,7 @@
 import unittest
 import cloudpickle
 import numpy as np
+import pickle
 
 '''We use pickle in our MPI backend to send a method from the master to the workers. The object with which this
 method is associated contains the backend as an attribute, while the backend itself contains the data on which the
 workers should work. Pickling the method results in pickling the backend, which results in the whole data being
 pickled and sent, which is undesirable.
@@ -28,7 +29,7 @@ def __getstate__(self):
 
 class PickleTests(unittest.TestCase):
     def test_exclusion(self,A,string):
         """Tests whether, after pickling and unpickling the object, the attribute that should be excluded is indeed absent"""
-        pickled_object = cloudpickle.dumps(A())
+        pickled_object = cloudpickle.dumps(A(), pickle.HIGHEST_PROTOCOL)
         unpickled_object = cloudpickle.loads(pickled_object)
         assert(not(hasattr(pickled_object,string)))
 
From 383413a68406f8de8851983b16c67c0d16ce0e8a Mon Sep 17 00:00:00 2001
From: Nicole
Date: Mon, 2 Oct 2017 17:12:04 +0200
Subject: [PATCH 11/14] Fixed small typo

---
 abcpy/inferences.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/abcpy/inferences.py b/abcpy/inferences.py
index 9a5c8923..0e6b63f2 100644
--- a/abcpy/inferences.py
+++ b/abcpy/inferences.py
@@ -448,7 +448,7 @@ def sample(self, observations, steps, epsilon_init, n_samples = 10000, n_samples
             epsilon_arr = [None] * steps
             epsilon_arr[0] = epsilon_init
         else:
-            raise ValueError("The length of epsilon_init can only be of 1 or steps.")
+            raise ValueError("The length of epsilon_init can only be equal to 1 or steps.")
 
         # main PMCABC algorithm
         # print("INFO: Starting PMCABC iterations.")
From bdbab5e59e26b2a9a8fefff182088dd64f87044c Mon Sep 17 00:00:00 2001
From: Nicole
Date: Fri, 6 Oct 2017 11:14:49 +0200
Subject: [PATCH 12/14] Some more typos, created new branch for all things related to RemoteContext refactoring

---
 abcpy/approx_lhd.py | 12 ++++++------
 abcpy/models.py     |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/abcpy/approx_lhd.py b/abcpy/approx_lhd.py
index 5412acfd..27d3fec6 100644
--- a/abcpy/approx_lhd.py
+++ b/abcpy/approx_lhd.py
@@ -27,7 +27,7 @@ def __init__(self, statistics_calc):
     @abstractmethod
     def likelihood(y_obs, y_sim):
         """To be overwritten by any sub-class: should compute the approximate likelihood
-        value given the observed dataset y_obs and dataset y_sim simulated from
+        value given the observed data set y_obs and the data set y_sim simulated from
         model set at the parameter value.
 
         Parameters
@@ -47,7 +47,7 @@ def likelihood(y_obs, y_sim):
 
 class SynLiklihood(Approx_likelihood):
-    """This class implements the aproximate likelihood function which computes the pproximate
+    """This class implements the approximate likelihood function which computes the approximate
     likelihood using the synthetic likelihood approach described in Wood [1].
For synthetic likelihood approximation, we compute the robust precision matrix using Ledoit and Wolf's [2]
     method.
@@ -93,8 +93,8 @@ def likelihood(self, y_obs, y_sim):
 
 class PenLogReg(Approx_likelihood):
-    """This class implements the aproximate likelihood function which computes the pproximate
-    likelihood upto a constant using penalized logistic regression described in
+    """This class implements the approximate likelihood function which computes the approximate
+    likelihood up to a constant using penalized logistic regression described in
     Dutta et al. [1]. It takes one additional function handler defining the true model and two
     additional parameters n_folds and n_simulate, respectively defining the number of folds used to estimate
     prediction error using cross-validation and the number
@@ -134,7 +134,7 @@ def __init__(self, statistics_calc, model, n_simulate, n_folds=10, max_iter = 10
         self.n_simulate = n_simulate
         self.seed = seed
         self.max_iter = max_iter
-        # Simulate reference data and extract summary statistics from the reffernce data
+        # Simulate reference data and extract summary statistics from the reference data
         self.ref_data_stat = self._simulate_ref_data()
 
 
@@ -164,7 +164,7 @@ def likelihood(self, y_obs, y_sim):
 
     def _simulate_ref_data(self):
         """
-        Simulate the reference dataset. This code is run at the initializtion of
+        Simulate the reference data set. This code is run at the initialization of
         PenLogReg
         """
 
diff --git a/abcpy/models.py b/abcpy/models.py
index 6ec34593..b2d85d28 100644
--- a/abcpy/models.py
+++ b/abcpy/models.py
@@ -58,7 +58,7 @@ def set_parameters(self, theta):
 
 
     @abstractmethod
-    def sample_from_prior():
+    def sample_from_prior(self):
        """To be overwritten by any sub-class: should resample the model parameters
        from the prior distribution.
""" From 181601cd805fb3d57f08eb398b54263dc534bc27 Mon Sep 17 00:00:00 2001 From: Nicole Date: Fri, 6 Oct 2017 15:58:25 +0200 Subject: [PATCH 13/14] Fixed gitignore and test for pickling --- .gitignore | 12 ------------ tests/pickle_tests.py | 9 +++------ 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index dedc2fec..eba075aa 100644 --- a/.gitignore +++ b/.gitignore @@ -87,17 +87,5 @@ ENV/ # Rope project settings .ropeproject -abcpy/.idea/ -abcpy/inf_no_abs.py -abcpy/inferences_old.py -abcpy/modelselection_old.py -abcpy/pymc_dist.py -abcpy/summaryselection_old.py -abcpy/test.py -abcpy/uniform.py -tests/backend.py -tests/save.p -tests/save2.p -tests/save3.p .idea/ diff --git a/tests/pickle_tests.py b/tests/pickle_tests.py index 63c4ef5c..ecbb39c8 100644 --- a/tests/pickle_tests.py +++ b/tests/pickle_tests.py @@ -27,11 +27,8 @@ def __getstate__(self): return state class PickleTests(unittest.TestCase): - def test_exclusion(self,A,string): + def test_exclusion(self): """Tests whether after pickling and unpickling the object, the attribute which should not be included exists""" - pickled_object = cloudpickle.dumps(A(), pickle.HIGHEST_PROTOCOL) + pickled_object = cloudpickle.dumps(ToBePickled(), pickle.HIGHEST_PROTOCOL) unpickled_object = cloudpickle.loads(pickled_object) - assert(not(hasattr(pickled_object,string))) - -A=PickleTests() -A.test_exclusion(ToBePickled,'notIncluded') \ No newline at end of file + assert(not(hasattr(pickled_object,'notIncluded'))) From 41efad540dc4105ed020efbda444a6753781450f Mon Sep 17 00:00:00 2001 From: Avinash Ummadisingu Date: Fri, 6 Oct 2017 16:22:37 +0200 Subject: [PATCH 14/14] Update .gitignore --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index eba075aa..72364f99 100644 --- a/.gitignore +++ b/.gitignore @@ -87,5 +87,3 @@ ENV/ # Rope project settings .ropeproject -.idea/ -