diff --git a/abcpy/approx_lhd.py b/abcpy/approx_lhd.py index a7801361..27d3fec6 100644 --- a/abcpy/approx_lhd.py +++ b/abcpy/approx_lhd.py @@ -6,19 +6,20 @@ class Approx_likelihood(metaclass = ABCMeta): """This abstract base class defines the approximate likelihood - function. To approximate the likelihood function at a parameter value given observed dataset, - we need to pass a dataset simulated from model set at the parameter value and the observed dataset. + function. To approximate the likelihood function at a parameter value given observed data set, + we need to pass a data set simulated from model set at the parameter value and the observed data set. """ @abstractmethod def __init__(self, statistics_calc): - """ The constructor of a sub-class must accept a non-optional statistics - calculator, which is stored to self.statistics_calc. + """ + The constructor of a sub-class must accept a non-optional statistics + calculator, which is stored to self.statistics_calc. - Parameters - ---------- - statistics_calc : abcpy.stasistics.Statistics - Statistics extractor object that conforms to the Statistics class. + Parameters + ---------- + statistics_calc : abcpy.stasistics.Statistics + Statistics extractor object that conforms to the Statistics class. """ raise NotImplemented @@ -26,7 +27,7 @@ def __init__(self, statistics_calc): @abstractmethod def likelihood(y_obs, y_sim): """To be overwritten by any sub-class: should compute the approximate likelihood - value given the observed dataset y_obs and dataset y_sim simulated from + value given the observed data set y_obs and the data set y_sim simulated from model set at the parameter value. Parameters @@ -46,7 +47,7 @@ def likelihood(y_obs, y_sim): class SynLiklihood(Approx_likelihood): - """This class implements the aproximate likelihood function which computes the pproximate + """This class implements the approximate likelihood function which computes the approximate likelihood using the synthetic likelihood approach described in Wood [1]. For synthetic likelihood approximation, we compute the robust precision matrix using Ledoit and Wolf's [2] method. @@ -92,8 +93,8 @@ def likelihood(self, y_obs, y_sim): class PenLogReg(Approx_likelihood): - """This class implements the aproximate likelihood function which computes the pproximate - likelihood upto a constant using penalized logistic regression described in + """This class implements the approximate likelihood function which computes the approximate + likelihood up to a constant using penalized logistic regression described in Dutta et. al. [1]. It takes one additional function handler defining the true model and two additional parameters n_folds and n_simulate correspondingly defining number of folds used to estimate prediction error using cross-validation and the number @@ -106,10 +107,8 @@ class PenLogReg(Approx_likelihood): [2] Friedman, J., Hastie, T., and Tibshirani, R. (2010). Regularization paths for generalized linear models via coordinate descent. Journal of Statistical - Software, 33(1), 1–22. - """ - def __init__(self, statistics_calc, model, n_simulate, n_folds=10, max_iter = 100000, seed = None): - """ + Software, 33(1), 1–22. + Parameters ---------- statistics_calc : abcpy.stasistics.Statistics @@ -124,17 +123,18 @@ def __init__(self, statistics_calc, model, n_simulate, n_folds=10, max_iter = 10 Maximum passes over the data. The default is 100000. seed: int, optional Seed for the random number generator. The used glmnet solver is not - deterministic, this seed is used for determining the cv folds. The default value is + deterministic, this seed is used for determining the cv folds. The default value is None. - """ - + """ + def __init__(self, statistics_calc, model, n_simulate, n_folds=10, max_iter = 100000, seed = None): + self.model = model self.statistics_calc = statistics_calc self.n_folds = n_folds self.n_simulate = n_simulate self.seed = seed self.max_iter = max_iter - # Simulate reference data and extract summary statistics from the reffernce data + # Simulate reference data and extract summary statistics from the reference data self.ref_data_stat = self._simulate_ref_data() @@ -164,7 +164,7 @@ def likelihood(self, y_obs, y_sim): def _simulate_ref_data(self): """ - Simulate the reference dataset. This code is run at the initializtion of + Simulate the reference data set. This code is run at the initialization of Penlogreg """ diff --git a/abcpy/backends/base.py b/abcpy/backends/base.py index c525e34b..13e6d760 100644 --- a/abcpy/backends/base.py +++ b/abcpy/backends/base.py @@ -30,7 +30,7 @@ def parallelize(self, list): A reference object that represents the parallelized list """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -49,7 +49,7 @@ def broadcast(self, object): A reference to the broadcasted object """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -72,7 +72,7 @@ def map(self, func, pds): a new parallel data set that contains the result of the map """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -91,7 +91,7 @@ def collect(self, pds): all elements of pds as a list """ - raise NotImplemented + raise NotImplementedError class PDS: @@ -101,7 +101,7 @@ class PDS: @abstractmethod def __init__(self): - raise NotImplemented + raise NotImplementedError class BDS: @@ -111,7 +111,7 @@ class BDS: @abstractmethod def __init__(self): - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -119,7 +119,7 @@ def value(self): """ This method should return the actual object that the broadcast data set represents. """ - raise NotImplemented + raise NotImplementedError class BackendDummy(Backend): diff --git a/abcpy/backends/mpi.py b/abcpy/backends/mpi.py index 52d04acc..479d1443 100644 --- a/abcpy/backends/mpi.py +++ b/abcpy/backends/mpi.py @@ -1,5 +1,6 @@ import numpy as np import cloudpickle +import pickle from mpi4py import MPI from abcpy.backends import Backend, PDS, BDS @@ -54,7 +55,7 @@ def __command_slaves(self, command, data): elif command == self.OP_MAP: #In map we receive data as (pds_id,pds_id_new,func) #Use cloudpickle to dump the function into a string. - function_packed = self.__sanitize_and_pack_func(data[2]) + function_packed = cloudpickle.dumps(data[2], pickle.HIGHEST_PROTOCOL) data_packet = (command, data[0], data[1], function_packed) elif command == self.OP_BROADCAST: @@ -74,35 +75,6 @@ def __command_slaves(self, command, data): _ = self.comm.bcast(data_packet, root=0) - def __sanitize_and_pack_func(self, func): - """ - Prevents the function from packing the backend by temporarily - setting it to another variable and then uses cloudpickle - to pack it into a string to be sent. - - Parameters - ---------- - func: Python Function - The function we are supposed to pack while sending it along to the slaves - during the map function - - Returns - ------- - Returns a string of the function packed by cloudpickle - - """ - - #Set the backend to None to prevent it from being packed - globals()['backend'] = {} - - function_packed = cloudpickle.dumps(func) - - #Reset the backend to self after it's been packed - globals()['backend'] = self - - return function_packed - - def __generate_new_pds_id(self): """ This method generates a new pds_id to associate a PDS with it's remote counterpart diff --git a/abcpy/distances.py b/abcpy/distances.py index a2897078..7ecf5a51 100644 --- a/abcpy/distances.py +++ b/abcpy/distances.py @@ -21,7 +21,7 @@ def __init__(self, statistics_calc): Statistics extractor object that conforms to the Statistics class. """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -58,11 +58,11 @@ def distance(d1, d2): The distance between the two input data sets. """ - raise NotImplemented + raise NotImplementedError @abstractmethod - def dist_max(): + def dist_max(self): """To be overwritten by sub-class: should return maximum possible value of the desired distance function. @@ -76,7 +76,7 @@ def dist_max(): The maximal possible value of the desired distance function. """ - raise NotImplemented + raise NotImplementedError def _calculate_summary_stat(self,d1,d2): diff --git a/abcpy/distributions.py b/abcpy/distributions.py index ff3f7d77..44d54d74 100644 --- a/abcpy/distributions.py +++ b/abcpy/distributions.py @@ -19,12 +19,12 @@ def set_parameters(self, params): Parameters ---------- - theta: list + params: list Contains all the distributions parameters. """ - raise NotImplemented + raise NotImplementedError @@ -39,7 +39,7 @@ def reseed(self, seed): """ - raise NotImplemented + raise NotImplementedError @@ -58,7 +58,7 @@ def sample(self, k): """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -77,7 +77,7 @@ def pdf(self, x): """ - raise NotImplemented + raise NotImplementedError class MultiNormal(Distribution): @@ -130,7 +130,7 @@ class Uniform(Distribution): """ This class implements a p-dimensional uniform Prior distribution in a closed interval. """ - def __init__(self, lb : np.ndarray, ub : np.ndarray, seed=None): + def __init__(self, lb, ub, seed=None): """ Defines the upper and lower bounds of a p-dimensional uniform Prior distribution in a closed interval. diff --git a/abcpy/inferences.py b/abcpy/inferences.py index eeae277a..0e6b63f2 100644 --- a/abcpy/inferences.py +++ b/abcpy/inferences.py @@ -1,108 +1,321 @@ +from abc import ABCMeta, abstractmethod, abstractproperty + import numpy as np from abcpy.output import Journal from scipy import optimize -class RejectionABC: - """This base class implements the rejection algorithm based inference scheme [1] for - Approximate Bayesian Computation. - [1] Tavaré, S., Balding, D., Griffith, R., Donnelly, P.: Inferring coalescence - times from DNA sequence data. Genetics 145(2), 505–518 (1997). +class InferenceMethod(metaclass = ABCMeta): + """ + This abstract base class represents an inference method. + + """ + + def __getstate__(self): + """Cloudpickle is used with the MPIBackend. This function ensures that the backend itself + is not pickled + """ + state = self.__dict__.copy() + del state['backend'] + return state + + @abstractmethod + def sample(self): + """To be overwritten by any sub-class: + Samples from the posterior distribution of the model parameter given the observed + data observations. + """ + raise NotImplementedError + + @abstractproperty + def model(self): + """To be overwritten by any sub-class: an attribute specifying the model to be used + """ + raise NotImplementedError + + @abstractproperty + def rng(self): + """To be overwritten by any sub-class: an attribute specifying the random number generator to be used + """ + raise NotImplementedError + + @abstractproperty + def n_samples(self): + """To be overwritten by any sub-class: an attribute specifying the number of samples to be generated + """ + raise NotImplementedError + + @abstractproperty + def n_samples_per_param(self): + """To be overwritten by any sub-class: an attribute specifying the number of data points in each simulated data set.""" + raise NotImplementedError + + @abstractproperty + def observations_bds(self): + """To be overwritten by any sub-class: an attribute saving the observations as bds + """ + raise NotImplementedError + + +class BasePMC(InferenceMethod, metaclass = ABCMeta): + """ + This abstract base class represents inference methods that use Population Monte Carlo. + + """ + @abstractmethod + def _update_broadcasts(self, accepted_parameters, accepted_weights, accepted_cov_mat): + """ + To be overwritten by any sub-class: broadcasts updated values + + Parameters + ---------- + accepted_parameters: numpy.array + Contains all new accepted parameters. + accepted_weights: numpy.array + Contains all the new accepted weights. + accepted_cov_mat: numpy.ndarray + Contains the new accepted covariance matrix + + Returns + ------- + None + """ + raise NotImplementedError + + @abstractmethod + def _calculate_weight(self, theta): + """ + To be overwritten by any sub-class: + Calculates the weight for the given parameter using + accepted_parameters, accepted_cov_mat + + Parameters + ---------- + theta: np.array + 1xp matrix containing the model parameters, where p is the dimension of parameters + + Returns + ------- + float + the new weight for theta + """ + raise NotImplementedError + + @abstractproperty + def kernel(self): + """To be overwritten by any sub-class: an attribute specifying the kernel to be used + """ + raise NotImplementedError + + @abstractproperty + def accepted_parameters_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted parameters as bds + """ + raise NotImplementedError + + @abstractproperty + def accepted_weights_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted weights as bds + """ + raise NotImplementedError + + @abstractproperty + def accepted_cov_mat_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted covariance matrix as bds + """ + raise NotImplementedError + + + +class BaseAnnealing(InferenceMethod, metaclass = ABCMeta): + """ + This abstract base class represents inference methods that use annealing. + + """ + + @abstractmethod + def _update_broadcasts(self): + raise NotImplementedError + + @abstractmethod + def _accept_parameter(self): + raise NotImplementedError + + @abstractproperty + def distance(self): + """To be overwritten by any sub-class: an attribute specifying the distance measure to be used + """ + raise NotImplementedError + + @abstractproperty + def kernel(self): + """To be overwritten by any sub-class: an attribute specifying the kernel to be used + """ + raise NotImplementedError + + @abstractproperty + def accepted_parameters_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted parameters as bds + """ + raise NotImplementedError + + @abstractproperty + def accepted_cov_mat_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted covariance matrix as bds + """ + raise NotImplementedError + +class BaseAdaptivePopulationMC(InferenceMethod, metaclass = ABCMeta): + """ + This abstract base class represents inference methods that use Adaptive Population Monte Carlo. - Parameters - ---------- - model: abcpy.models.Model - Model object that conforms to the Model class. - distance: abcpy.distances.Distance - Distance object that conforms to the Distance class. - backend: abcpy.backends.Backend - Backend object that conforms to the Backend class. - seed: integer, optional - Optional initial seed for the random number generator. The default value is generated randomly. """ + + @abstractmethod + def _update_broadcasts(self): + """ + To be overwritten by any sub-class: broadcasts updated values + + Parameters + ---------- + accepted_parameters: numpy.array + Contains all new accepted parameters. + accepted_weights: numpy.array + Contains all the new accepted weights. + accepted_cov_mat: numpy.ndarray + Contains the new accepted covariance matrix + + Returns + ------- + None + """ + raise NotImplementedError + + @abstractmethod + def _accept_parameter(self): + """ + To be overwritten by any sub-class: + Samples a single model parameter and simulate from it until + accepted with some probability. + + """ + raise NotImplementedError + + @abstractproperty + def distance(self): + """To be overwritten by any sub-class: an attribute specifying the distance measure to be used + """ + raise NotImplementedError + + @abstractproperty + def kernel(self): + """To be overwritten by any sub-class: an attribute specifying the kernel to be used + """ + raise NotImplementedError + + @abstractproperty + def accepted_parameters_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted parameters as bds + """ + raise NotImplementedError + + @abstractproperty + def accepted_cov_mat_bds(self): + """To be overwritten by any sub-class: an attribute saving the accepted covariance matrix as bds + """ + raise NotImplementedError + +class RejectionABC(InferenceMethod): + """This base class implements the rejection algorithm based inference scheme [1] for + Approximate Bayesian Computation. + + [1] Tavaré, S., Balding, D., Griffith, R., Donnelly, P.: Inferring coalescence + times from DNA sequence data. Genetics 145(2), 505–518 (1997). + + Parameters + ---------- + model: abcpy.models.Model + Model object defining the model to be used. + distance: abcpy.distances.Distance + Distance object defining the distance measure to compare simulated and observed data sets. + backend: abcpy.backends.Backend + Backend object defining the backend to be used. + seed: integer, optional + Optional initial seed for the random number generator. The default value is generated randomly. + """ + + model = None + distance = None + rng = None + + n_samples = None + n_samples_per_param = None + epsilon = None + + observations_bds = None + def __init__(self, model, distance, backend, seed=None): self.model = model - self.dist_calc = distance + self.distance = distance self.backend = backend self.rng = np.random.RandomState(seed) - def sample(self, observations, n_samples, n_samples_per_param, epsilon, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + """ + Samples from the posterior distribution of the model parameter given the observed data observations. - Parameters ---------- - observations : python list - The observed data set. - n_samples : integer - Number of samples to generate. - n_samples_per_param : integer - Number of data points in each simulated dataset. - epsilon: float - Value of threshold. + observations: numpy.ndarray + Observed data. + n_samples: integer + Number of samples to generate + n_samples_per_param: integer + Number of data points in each simulated data set. + epsilon: float + Value of threshold full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. - Returns ------- abcpy.output.Journal a journal containing simulation results, metadata and optionally intermediate results. """ + self.observations_bds = self.backend.broadcast(observations) + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + self.epsilon = epsilon + journal = Journal(full_output) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param - journal.configuration["epsilon"] = epsilon - - accepted_parameters = None - - # Initialize variables that need to be available remotely - rc = _RemoteContextRejectionABC(self.backend, self.model, self.dist_calc, observations, n_samples, n_samples_per_param, epsilon) - - # main Rejection ABC algorithm - seed_arr = self.rng.randint(1, n_samples*n_samples, size=n_samples, dtype=np.int32) - seed_pds = self.backend.parallelize(seed_arr) - - accepted_parameters_pds = self.backend.map(rc._sample_parameter, seed_pds) + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param + journal.configuration["epsilon"] = self.epsilon + + accepted_parameters = None + + # main Rejection ABC algorithm + seed_arr = self.rng.randint(1, n_samples * n_samples, size=n_samples, dtype=np.int32) + seed_pds = self.backend.parallelize(seed_arr) + + accepted_parameters_pds = self.backend.map(self._sample_parameter, seed_pds) accepted_parameters = self.backend.collect(accepted_parameters_pds) accepted_parameters = np.array(accepted_parameters) journal.add_parameters(accepted_parameters) journal.add_weights(np.ones((n_samples, 1))) - - return journal - - - -class _RemoteContextRejectionABC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, dist_calc, observations, n_samples, n_samples_per_param, epsilon): - self.model = model - self.dist_calc = dist_calc - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - - self.epsilon = epsilon - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) + return journal - def _sample_parameter(self, seed): """ Samples a single model parameter and simulates from it until distance between simulated outcome and the observation is - smaller than eplison. - + smaller than epsilon. + Parameters - ---------- + ---------- seed: int value of a seed to be used for reseeding Returns @@ -110,109 +323,132 @@ def _sample_parameter(self, seed): np.array accepted parameter """ - - distance = self.dist_calc.dist_max() + + distance = self.distance.dist_max() self.model.prior.reseed(seed) - + while distance > self.epsilon: # Accept new parameter value if the distance is less than epsilon self.model.sample_from_prior() y_sim = self.model.simulate(self.n_samples_per_param) - distance = self.dist_calc.distance(self.observations_bds.value(), y_sim) + distance = self.distance.distance(self.observations_bds.value(), y_sim) return self.model.get_parameters() - - - -class PMCABC: - """This base class implements a modified version of Population Monte Carlo based inference scheme - for Approximate Bayesian computation of Beaumont et. al. [1]. Here the threshold value at `t`-th generation are adaptively chosen - by taking the maximum between the epsilon_percentile-th value of discrepancies of the accepted - parameters at `t-1`-th generation and the threshold value provided for this generation by the user. If we take the +class PMCABC(BasePMC, InferenceMethod): + """ + This base class implements a modified version of Population Monte Carlo based inference scheme + for Approximate Bayesian computation of Beaumont et. al. [1]. Here the threshold value at `t`-th generation are adaptively chosen + by taking the maximum between the epsilon_percentile-th value of discrepancies of the accepted + parameters at `t-1`-th generation and the threshold value provided for this generation by the user. If we take the value of epsilon_percentile to be zero (default), this method becomes the inference scheme described in [1], where - the threshold values considered at each generation are the ones provided by the user. - + the threshold values considered at each generation are the ones provided by the user. + [1] M. A. Beaumont. Approximate Bayesian computation in evolution and ecology. Annual Review of Ecology, Evolution, and Systematics, 41(1):379–406, Nov. 2010. - + Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance measure to compare simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ - def __init__(self, model, distance, kernel, backend, seed=None): + + model = None + distance = None + kernel = None + rng = None + + #default value, set so that testing works + n_samples = 2 + n_samples_per_param = None + + observations_bds = None + accepted_parameters_bds = None + accepted_weights_bds = None + accepted_cov_mat_bds = None + + + def __init__(self, model, distance, kernel, backend, seed=None): + self.model = model self.distance = distance self.kernel = kernel self.backend = backend self.rng = np.random.RandomState(seed) - + + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = None + self.accepted_parameters_bds = None + self.accepted_weights_bds = None + self.accepted_cov_mat_bds = None + def sample(self, observations, steps, epsilon_init, n_samples = 10000, n_samples_per_param = 1, epsilon_percentile = 0, covFactor = 2, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : numpy.ndarray + observations : numpy.ndarray Observed data. - steps : integer - Number of iterations in the sequential algoritm ("generations") - epsilon_init : numpy.ndarray + steps : integer + Number of iterations in the sequential algoritm ("generations") + epsilon_init : numpy.ndarray An array of proposed values of epsilon to be used at each steps. Can be supplied - A single value to be used as the threshold in Step 1 or a `steps`-dimensional array of values to be + A single value to be used as the threshold in Step 1 or a `steps`-dimensional array of values to be used as the threshold in evry steps. n_samples : integer, optional Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional + n_samples_per_param : integer, optional Number of data points in each simulated data set. The default value is 1. - epsilon_percentile : float, optional + epsilon_percentile : float, optional A value between [0, 100]. The default value is 0, meaning the threshold value provided by the user being used. - covFactor : float, optional + covFactor : float, optional scaling parameter of the covariance matrix. The default value is 2 as considered in [1]. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ - + + self.observations_bds = self.backend.broadcast(observations) + self.n_samples = n_samples + self.n_samples_per_param=n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param journal.configuration["steps"] = steps journal.configuration["epsilon_percentile"] = epsilon_percentile - + accepted_parameters = None accepted_weights = None accepted_cov_mat = None - #Define epsilon_arr + + # Define epsilon_arr if len(epsilon_init) == steps: epsilon_arr = epsilon_init else: if len(epsilon_init) == 1: - epsilon_arr = [None]*steps - epsilon_arr[0] = epsilon_init + epsilon_arr = [None] * steps + epsilon_arr[0] = epsilon_init else: - raise ValueError("The length of epsilon_init can only be of 1 or steps.") - - - # Initialize variables that need to be available remotely - rc = _RemoteContextPMCABC(self.backend, self.model, self.distance, self.kernel, observations, n_samples, n_samples_per_param) + raise ValueError("The length of epsilon_init can only be equal to 1 or steps.") # main PMCABC algorithm # print("INFO: Starting PMCABC iterations.") @@ -223,30 +459,31 @@ def sample(self, observations, steps, epsilon_init, n_samples = 10000, n_samples # 0: update remotely required variables # print("INFO: Broadcasting parameters.") - rc.epsilon = epsilon_arr[aStep] - rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat) + self.epsilon = epsilon_arr[aStep] + self._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat) # 1: calculate resample parameters # print("INFO: Resampling parameters") - params_and_dists_and_ysim_pds = self.backend.map(rc._resample_parameter, seed_pds) + params_and_dists_and_ysim_pds = self.backend.map(self._resample_parameter, seed_pds) params_and_dists_and_ysim = self.backend.collect(params_and_dists_and_ysim_pds) new_parameters, distances = [list(t) for t in zip(*params_and_dists_and_ysim)] new_parameters = np.array(new_parameters) - rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat) - + self._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat) + # Compute epsilon for next step # print("INFO: Calculating acceptance threshold (epsilon).") - if aStep < steps-1: + if aStep < steps - 1: if epsilon_arr[aStep + 1] == None: epsilon_arr[aStep + 1] = np.percentile(distances, epsilon_percentile) else: - epsilon_arr[aStep + 1] = np.max([np.percentile(distances, epsilon_percentile), epsilon_arr[aStep+1]]) + epsilon_arr[aStep + 1] = np.max( + [np.percentile(distances, epsilon_percentile), epsilon_arr[aStep + 1]]) - # 2: calculate weights for new parameters + # 2: calculate weights for new parameters # print("INFO: Calculating weights.") new_parameters_pds = self.backend.parallelize(new_parameters) - new_weights_pds = self.backend.map(rc._calculate_weight, new_parameters_pds) - new_weights = np.array(self.backend.collect(new_weights_pds)).reshape(-1,1) + new_weights_pds = self.backend.map(self._calculate_weight, new_parameters_pds) + new_weights = np.array(self.backend.collect(new_weights_pds)).reshape(-1, 1) sum_of_weights = 0.0 for w in new_weights: sum_of_weights += w @@ -254,77 +491,61 @@ def sample(self, observations, steps, epsilon_init, n_samples = 10000, n_samples # 3: calculate covariance # print("INFO: Calculating covariance matrix.") - new_cov_mat = covFactor * np.cov(new_parameters, aweights = new_weights.reshape(-1), rowvar=False) - + new_cov_mat = covFactor * np.cov(new_parameters, aweights=new_weights.reshape(-1), rowvar=False) + # 4: Update the newly computed values accepted_parameters = new_parameters accepted_weights = new_weights accepted_cov_mat = new_cov_mat # print("INFO: Saving configuration to output journal.") - if (full_output == 1 and aStep <= steps-1) or (full_output == 0 and aStep == steps-1): + if (full_output == 1 and aStep <= steps - 1) or (full_output == 0 and aStep == steps - 1): journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) - - #Add epsilon_arr to the journal + + # Add epsilon_arr to the journal journal.configuration["epsilon_arr"] = epsilon_arr return journal -class _RemoteContextPMCABC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, distance, kernel, observations, n_samples, n_samples_per_param): - self.model = model - self.distance = distance - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - - self.epsilon = None - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.accepted_parameters_bds = None - self.accepted_weights_bds = None - self.accepted_cov_mat_bds = None - - - def _update_broadcasts(self, backend, accepted_parameters, accepted_weights, accepted_cov_mat): + def _update_broadcasts(self, accepted_parameters, accepted_weights, accepted_cov_mat): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - + # bc.destroy + if not accepted_parameters is None: - self.accepted_parameters_bds = backend.broadcast(accepted_parameters) + self.accepted_parameters_bds = self.backend.broadcast(accepted_parameters) if not accepted_weights is None: - self.accepted_weights_bds = backend.broadcast(accepted_weights) + self.accepted_weights_bds = self.backend.broadcast(accepted_weights) if not accepted_cov_mat is None: - self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat) - + self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) # define helper functions for map step def _resample_parameter(self, seed): """ Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is - smaller than eplison. - - :type seed: int - :rtype: np.array - :return: accepted parameter + smaller than epsilon. + + Parameters + ---------- + seed: integer + initial seed for the random number generator. + + Returns + ------- + np.array + accepted parameter """ + rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - + distance = self.distance.dist_max() while distance > self.epsilon: - #print("on seed " + str(seed) + " distance: " + str(distance) + " epsilon: " + str(self.epsilon)) + # print("on seed " + str(seed) + " distance: " + str(distance) + " epsilon: " + str(self.epsilon)) if self.accepted_parameters_bds == None: self.model.sample_from_prior() else: @@ -334,7 +555,7 @@ def _resample_parameter(self, seed): # truncating the normal like this is fine: https://arxiv.org/pdf/0907.4010v1.pdf while True: self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: break @@ -342,8 +563,9 @@ def _resample_parameter(self, seed): y_sim = self.model.simulate(self.n_samples_per_param) distance = self.distance.distance(self.observations_bds.value(), y_sim) + return (self.model.get_parameters(), distance) - + def _calculate_weight(self, theta): """ Calculates the weight for the given parameter using @@ -352,14 +574,14 @@ def _calculate_weight(self, theta): Parameters ---------- theta: np.array - 1xp matrix containing model parameter, where p is the dimension of parameter - + 1xp matrix containing model parameter, where p is the number of parameters + Returns ------- float the new weight for theta """ - + if self.accepted_weights_bds is None: return 1.0 / self.n_samples else: @@ -367,14 +589,16 @@ def _calculate_weight(self, theta): denominator = 0.0 for i in range(0, self.n_samples): - self.kernel.set_parameters([self.accepted_parameters_bds.value()[i,:], self.accepted_cov_mat_bds.value()]) + self.kernel.set_parameters( + [self.accepted_parameters_bds.value()[i, :], self.accepted_cov_mat_bds.value()]) pdf_value = self.kernel.pdf(theta) - denominator += self.accepted_weights_bds.value()[i,0] * pdf_value - return 1.0 * prior_prob / denominator + denominator += self.accepted_weights_bds.value()[i, 0] * pdf_value + return 1.0 * prior_prob / denominator -class PMC: - """Population Monte Carlo based inference scheme of Cappé et. al. [1]. +class PMC(BasePMC, InferenceMethod): + """ + Population Monte Carlo based inference scheme of Cappé et. al. [1]. This algorithm assumes a likelihood function is available and can be evaluated at any parameter value given the oberved dataset. In absence of the @@ -384,233 +608,245 @@ class PMC: inference schemes are based on Andrieu and Roberts [2]. [1] Cappé, O., Guillin, A., Marin, J.-M., and Robert, C. P. (2004). Population Monte Carlo. - Journal of Computational and Graphical Statistics, 13(4), 907–929. - + Journal of Computational and Graphical Statistics, 13(4), 907–929. + [2] C. Andrieu and G. O. Roberts. The pseudo-marginal approach for efficient Monte Carlo computations. Annals of Statistics, 37(2):697–725, 04 2009. - - """ - - def __init__(self, model, likfun, kernel, backend, seed=None): - """Constructor of PMC inference schemes. - Parameters - ---------- - model : abcpy.models.Model - Model object that conforms to the Model class - likfun : abcpy.approx_lhd.Approx_likelihood - Approx_likelihood object that conforms to the Approx_likelihood class - kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling - backend : abcpy.backends.Backend - Backend object that conforms to the Backend class - seed : integer, optional - Optional initial seed for the random number generator. The default value is generated randomly. + Parameters + ---------- + model : abcpy.models.Model + Model object defining the model to be used. + likfun : abcpy.approx_lhd.Approx_likelihood + Approx_likelihood object defining the approximated likelihood to be used. + kernel : abcpy.distributions.Distribution + Distribution object defining the perturbation kernel needed for the sampling. + backend : abcpy.backends.Backend + Backend object defining the backend to be used. + seed : integer, optional + Optional initial seed for the random number generator. The default value is generated randomly. - """ + """ + + model = None + likfun = None + kernel = None + rng = None + + n_samples = None + n_samples_per_param = None + + observations_bds = None + accepted_parameters_bds = None + accepted_weights_bds = None + accepted_cov_mat_bds = None + + def __init__(self, model, likfun, kernel, backend, seed=None): self.model = model self.likfun = likfun self.kernel = kernel self.backend = backend self.rng = np.random.RandomState(seed) - + + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = None + self.accepted_parameters_bds = None + self.accepted_weights_bds = None + self.accepted_cov_mat_bds = None + + def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 100, covFactor = None, iniPoints = None, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : python list + observations : python list Observed data - steps : integer - number of iterations in the sequential algoritm ("generations") - n_sample : integer, optional + steps : integer + number of iterations in the sequential algoritm ("generations") + n_samples : integer, optional number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional + n_samples_per_param : integer, optional number of data points in each simulated data set. The default value is 100. - covFactor : float, optional + covFactor : float, optional scaling parameter of the covariance matrix. The default is a p dimensional array of 1 when p is the dimension of the parameter. inipoints : numpy.ndarray, optional parameter vaulues from where the sampling starts. By default sampled from the prior. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal - A journal containing simulation results, metadata and optionally intermediate results. - - Parameters - ---------- - + A journal containing simulation results, metadata and optionally intermediate results. """ + self.observations_bds = self.backend.broadcast(observations) + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_lhd_func"] = type(self.likfun) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param journal.configuration["steps"] = steps journal.configuration["covFactor"] = covFactor journal.configuration["iniPoints"] = iniPoints - + accepted_parameters = None accepted_weights = None accepted_cov_mat = None new_theta = None + dim = len(self.model.get_parameters()) - - # Initialize variables that need to be available remotely - rc = _RemoteContextPMC(self.backend, self.model, self.likfun, self.kernel, observations, n_samples, n_samples_per_param) - + # Initialize particles: When not supplied, randomly draw them from prior distribution # Weights of particles: Assign equal weights for each of the particles if iniPoints == None: - accepted_parameters = np.zeros(shape = (n_samples,dim)) - for ind in range(0,n_samples): + accepted_parameters = np.zeros(shape=(n_samples, dim)) + for ind in range(0, n_samples): self.model.sample_from_prior() - accepted_parameters[ind,:] = self.model.get_parameters() - accepted_weights = np.ones((n_samples,1), dtype=np.float)/n_samples + accepted_parameters[ind, :] = self.model.get_parameters() + accepted_weights = np.ones((n_samples, 1), dtype=np.float) / n_samples else: accepted_parameters = iniPoints - accepted_weights = np.ones((iniPoints.shape[0],1), dtype=np.float)/iniPoints.shape[0] - + accepted_weights = np.ones((iniPoints.shape[0], 1), dtype=np.float) / iniPoints.shape[0] + if covFactor is None: covFactor = np.ones(shape=(dim,)) # Calculate initial covariance matrix - accepted_cov_mat = covFactor * np.cov(accepted_parameters, aweights = accepted_weights.reshape(-1), rowvar=False) - - # main SMC algorithm + accepted_cov_mat = covFactor * np.cov(accepted_parameters, aweights=accepted_weights.reshape(-1), rowvar=False) + + # main SMC algorithm # print("INFO: Starting PMC iterations.") for aStep in range(0, steps): # print("DEBUG: Iteration " + str(aStep) + " of PMC algorithm.") - + # 0: update remotely required variables # print("INFO: Broadcasting parameters.") - rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat) + self._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat) # 1: calculate resample parameters # print("INFO: Resample parameters.") - index = self.rng.choice(accepted_parameters.shape[0], size = n_samples, p = accepted_weights.reshape(-1)) - #Choose a new particle using the resampled particle (make the boundary proper) - #Initialize new_parameters - new_parameters = np.zeros((n_samples,dim), dtype=np.float) - for ind in range(0,n_samples): - self.kernel.set_parameters([accepted_parameters[index[ind],:], accepted_cov_mat]) + index = self.rng.choice(accepted_parameters.shape[0], size=n_samples, p=accepted_weights.reshape(-1)) + # Choose a new particle using the resampled particle (make the boundary proper) + # Initialize new_parameters + new_parameters = np.zeros((n_samples, dim), dtype=np.float) + for ind in range(0, self.n_samples): + self.kernel.set_parameters([accepted_parameters[index[ind], :], accepted_cov_mat]) while True: - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: - new_parameters[ind,:] = new_theta + new_parameters[ind, :] = new_theta break + # 2: calculate approximate lieklihood for new parameters # print("INFO: Calculate approximate likelihood.") new_parameters_pds = self.backend.parallelize(new_parameters) - approx_likelihood_new_parameters_pds = self.backend.map(rc._approx_lik_calc, new_parameters_pds) + approx_likelihood_new_parameters_pds = self.backend.map(self._approx_lik_calc, new_parameters_pds) # print("DEBUG: Collect approximate likelihood from pds.") - approx_likelihood_new_parameters = np.array(self.backend.collect(approx_likelihood_new_parameters_pds)).reshape(-1,1) - - # 3: calculate new weights for new parameters + approx_likelihood_new_parameters = np.array( + self.backend.collect(approx_likelihood_new_parameters_pds)).reshape(-1, 1) + + # 3: calculate new weights for new parameters # print("INFO: Calculating weights.") - new_weights_pds = self.backend.map(rc._calculate_weight, new_parameters_pds) - new_weights = np.array(self.backend.collect(new_weights_pds)).reshape(-1,1) - + new_weights_pds = self.backend.map(self._calculate_weight, new_parameters_pds) + new_weights = np.array(self.backend.collect(new_weights_pds)).reshape(-1, 1) + sum_of_weights = 0.0 - for i in range(0, n_samples): - new_weights[i] = new_weights[i]*approx_likelihood_new_parameters[i] + for i in range(0, self.n_samples): + new_weights[i] = new_weights[i] * approx_likelihood_new_parameters[i] sum_of_weights += new_weights[i] new_weights = new_weights / sum_of_weights accepted_parameters = new_parameters # 4: calculate covariance # print("INFO: Calculating covariance matrix.") - new_cov_mat = covFactor * np.cov(accepted_parameters, aweights = accepted_weights.reshape(-1), rowvar=False) - + new_cov_mat = covFactor * np.cov(accepted_parameters, aweights=accepted_weights.reshape(-1), rowvar=False) + # 5: Update the newly computed values accepted_parameters = new_parameters accepted_weights = new_weights accepted_cov_mat = new_cov_mat - + # print("INFO: Saving configuration to output journal.") - if (full_output == 1 and aStep <= steps-1) or (full_output == 0 and aStep == steps-1): + if (full_output == 1 and aStep <= steps - 1) or (full_output == 0 and aStep == steps - 1): journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) journal.add_opt_values(approx_likelihood_new_parameters) return journal -class _RemoteContextPMC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, likfun, kernel, observations, n_samples, n_samples_per_param,): - self.model = model - self.likfun = likfun - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.accepted_parameters_bds = None - self.accepted_weights_bds = None - self.accepted_cov_mat_bds = None - - def _update_broadcasts(self, backend, accepted_parameters, accepted_weights, accepted_cov_mat): + def _update_broadcasts(self, accepted_parameters, accepted_weights, accepted_cov_mat): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - + # bc.destroy + if not accepted_parameters is None: - self.accepted_parameters_bds = backend.broadcast(accepted_parameters) + self.accepted_parameters_bds = self.backend.broadcast(accepted_parameters) if not accepted_weights is None: - self.accepted_weights_bds = backend.broadcast(accepted_weights) + self.accepted_weights_bds = self.backend.broadcast(accepted_weights) if not accepted_cov_mat is None: - self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat) + self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) # define helper functions for map step def _approx_lik_calc(self, theta): """ Compute likelihood for new parameters using approximate likelihood function - - :seed = aTuple[0] - :theta = aTuple[1] - :type seed: int - :rtype: np.arraybg - :return: likelihood values of new parameter - """ + Parameters + ---------- + theta: numpy.ndarray + 1xp matrix containing the model parameters, where p is the number of parameters + + Returns + ------- + float + The approximated likelihood function + """ # Assign theta to model self.model.set_parameters(theta) + # Simulate the fake data from the model given the parameter value theta # print("DEBUG: Simulate model for parameter " + str(theta)) y_sim = self.model.simulate(self.n_samples_per_param) + # print("DEBUG: Extracting observation.") obs = self.observations_bds.value() + # print("DEBUG: Computing likelihood...") lhd = self.likfun.likelihood(obs, y_sim) + # print("DEBUG: Likelihood is :" + str(lhd)) pdf_at_theta = self.model.prior.pdf(theta) + # print("DEBUG: prior pdf evaluated at theta is :" + str(pdf_at_theta)) return pdf_at_theta * lhd - + def _calculate_weight(self, theta): """ Calculates the weight for the given parameter using accepted_parameters, accepted_cov_mat - :type theta: np.array - :param theta: 1xp matrix containing model parameters, where p is the number of parameters - :rtype: float - :return: the new weight for theta + Parameters + ---------- + theta: np.ndarray + 1xp matrix containing the model parameters, where p is the number of parameters + + Returns + ------- + float + The new weight for theta """ - + if self.accepted_weights_bds is None: return 1.0 / self.n_samples else: @@ -618,54 +854,80 @@ def _calculate_weight(self, theta): denominator = 0.0 for i in range(0, self.n_samples): - self.kernel.set_parameters([self.accepted_parameters_bds.value()[i,:], self.accepted_cov_mat_bds.value()]) + self.kernel.set_parameters( + [self.accepted_parameters_bds.value()[i, :], self.accepted_cov_mat_bds.value()]) pdf_value = self.kernel.pdf(theta) - denominator += self.accepted_weights_bds.value()[i,0] * pdf_value - return 1.0 * prior_prob / denominator + denominator += self.accepted_weights_bds.value()[i, 0] * pdf_value + return 1.0 * prior_prob / denominator + +class SABC(BaseAnnealing, InferenceMethod): + """ + This base class implements a modified version of Simulated Annealing Approximate Bayesian Computation (SABC) of [1] when the prior is non-informative. + + [1] C. Albert, H. R. Kuensch and A. Scheidegger. A Simulated Annealing Approach to + Approximate Bayes Computations. Statistics and Computing, (2014). -class SABC: - """This base class implements a modified version of Simulated Annealing Approximate Bayesian Computation (SABC) of [1] when the prior is non-informative. - - [1] C. Albert, H. R. Kuensch and A. Scheidegger. A Simulated Annealing Approach to - Approximate Bayes Computations. Statistics and Computing, (2014). - Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance measure used to compare simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ - def __init__(self, model, distance, kernel, backend, seed=None): + + model = None + distance = None + kernel = None + rng = None + + n_samples = None + n_samples_per_param = None + epsilon = None + + observations_bds = None + accepted_parameters_bds = None + accepted_cov_mat_bds = None + smooth_distances_bds = None + all_distances_bds = None + + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend self.rng = np.random.RandomState(seed) - + + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = None + self.accepted_parameters_bds = None + self.accepted_cov_mat_bds = None + self.smooth_distances_bds = None + self.all_distances_bds = None + def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_param = 1, beta = 2, delta = 0.2, v = 0.3, ar_cutoff = 0.5, resample = None, n_update = None, adaptcov = 1, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : numpy.ndarray + observations : numpy.ndarray Observed data. - steps : integer - Number of maximum iterations in the sequential algoritm ("generations") - epsilon : numpy.float - An array of proposed values of epsilon to be used at each steps. + steps : integer + Number of maximum iterations in the sequential algoritm ("generations") + epsilon : numpy.float + An array of proposed values of epsilon to be used at each steps. n_samples : integer, optional Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional + n_samples_per_param : integer, optional Number of data points in each simulated data set. The default value is 1. beta : numpy.float Tuning parameter of SABC @@ -679,23 +941,29 @@ def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_ Resample after this many acceptance, The default value if n_samples n_update: int, optional Number of perturbed parameters at each step, The default value if n_samples - adaptcov : boolean, optional + adaptcov : boolean, optional Whether we adapt the covariance matrix in iteration stage. The default value TRUE. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ + + self.observations_bds = self.backend.broadcast(observations) + self.epsilon = epsilon + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) journal.configuration["type_kernel_func"] = type(self.kernel) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param journal.configuration["beta"] = beta journal.configuration["delta"] = delta journal.configuration["v"] = v @@ -704,13 +972,13 @@ def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_ journal.configuration["n_update"] = n_update journal.configuration["adaptcov"] = adaptcov journal.configuration["full_output"] = full_output - - accepted_parameters = np.zeros(shape=(n_samples,len(self.model.get_parameters()))) + + accepted_parameters = np.zeros(shape=(n_samples, len(self.model.get_parameters()))) distances = np.zeros(shape=(n_samples,)) smooth_distances = np.zeros(shape=(n_samples,)) - accepted_weights = np.ones(shape=(n_samples,1)) + accepted_weights = np.ones(shape=(n_samples, 1)) all_distances = None - accepted_cov_mat = None + accepted_cov_mat = None if resample == None: resample = n_samples @@ -719,223 +987,213 @@ def sample(self, observations, steps, epsilon, n_samples = 10000, n_samples_per_ sample_array = np.ones(shape=(steps,)) sample_array[0] = n_samples sample_array[1:] = n_update + ## Acceptance counter to determine the resampling step accept = 0 samples_until = 0 - - # Initialize variables that need to be available remotely - rc = _RemoteContextSABC(self.backend, self.model, self.distance, self.kernel, \ - observations, n_samples, n_samples_per_param) - - for aStep in range(0,steps): - # main SABC algorithm + + for aStep in range(0, steps): + # main SABC algorithm # print("INFO: Initialization of SABC") seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=int(sample_array[aStep]), dtype=np.uint32) seed_pds = self.backend.parallelize(seed_arr) # 0: update remotely required variables # print("INFO: Broadcasting parameters.") - rc.epsilon = epsilon - rc._update_broadcasts(self.backend, accepted_parameters, accepted_cov_mat, smooth_distances, all_distances) - + self.epsilon = epsilon + self._update_broadcasts(accepted_parameters, accepted_cov_mat, smooth_distances, all_distances) # 1: Calculate parameters # print("INFO: Initial accepted parameter parameters") - params_and_dists_pds = self.backend.map(rc._accept_parameter, seed_pds) + params_and_dists_pds = self.backend.map(self._accept_parameter, seed_pds) params_and_dists = self.backend.collect(params_and_dists_pds) - new_parameters, new_distances, new_all_parameters, new_all_distances, index, acceptance = [list(t) for t in zip(*params_and_dists)] - new_parameters = np.array(new_parameters) + new_parameters, new_distances, new_all_parameters, new_all_distances, index, acceptance = [list(t) for t in + zip( + *params_and_dists)] + new_parameters = np.array(new_parameters) new_distances = np.array(new_distances) new_all_distances = np.concatenate(new_all_distances) index = np.array(index) acceptance = np.array(acceptance) - - # Reading all_distances at Initial step + + # Reading all_distances at Initial step if aStep == 0: - index = np.linspace(0,n_samples-1,n_samples).astype(int).reshape(n_samples,) + index = np.linspace(0, n_samples - 1, n_samples).astype(int).reshape(n_samples, ) accept = 0 all_distances = new_all_distances - #print(index[acceptance == 1]) - # Initialize/Update the accepted parameters and their corresponding distances - accepted_parameters[index[acceptance==1],:] = new_parameters[acceptance==1,:] - distances[index[acceptance==1]] = new_distances[acceptance==1] - - + # print(index[acceptance == 1]) + # Initialize/Update the accepted parameters and their corresponding distances + accepted_parameters[index[acceptance == 1], :] = new_parameters[acceptance == 1, :] + distances[index[acceptance == 1]] = new_distances[acceptance == 1] + # 2: Smoothing of the distances - smooth_distances[index[acceptance==1]] = self._smoother_distance(distances[index[acceptance==1]],all_distances) - + smooth_distances[index[acceptance == 1]] = self._smoother_distance(distances[index[acceptance == 1]], + all_distances) + # 3: Initialize/Update U, epsilon and covariance of perturbation kernel if aStep == 0: - U = self._avergae_redefined_distance(self._smoother_distance(all_distances,all_distances), epsilon) - else: + U = self._average_redefined_distance(self._smoother_distance(all_distances, all_distances), epsilon) + else: U = np.mean(smooth_distances) - epsilon = self._schedule(U,v) + epsilon = self._schedule(U, v) if accepted_parameters.shape[1] > 1: - accepted_cov_mat = beta*np.cov(np.transpose(accepted_parameters)) + \ - 0.0001*np.trace(np.cov(np.transpose(accepted_parameters)))*np.eye(accepted_parameters.shape[1]) + accepted_cov_mat = beta * np.cov(np.transpose(accepted_parameters)) + \ + 0.0001 * np.trace(np.cov(np.transpose(accepted_parameters))) * np.eye( + accepted_parameters.shape[1]) else: - accepted_cov_mat = beta*np.var(np.transpose(accepted_parameters)) + \ - 0.0001*(np.var(np.transpose(accepted_parameters)))*np.eye(accepted_parameters.shape[1]) - - ## 4: Show progress and if acceptance rate smaller than a value break the iteration - + accepted_cov_mat = beta * np.var(np.transpose(accepted_parameters)) + \ + 0.0001 * (np.var(np.transpose(accepted_parameters))) * np.eye( + accepted_parameters.shape[1]) + + # 4: Show progress and if acceptance rate smaller than a value break the iteration + # print("INFO: Saving intermediate configuration to output journal.") if full_output == 1: journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) - + if aStep > 0: accept = accept + np.sum(acceptance) samples_until = samples_until + sample_array[aStep] - acceptance_rate = accept/samples_until - print('updates: ',np.sum(sample_array[1:aStep+1])/np.sum(sample_array[1:])*100,' epsilon: ' ,epsilon,\ + acceptance_rate = accept / samples_until + print( + 'updates: ', np.sum(sample_array[1:aStep + 1]) / np.sum(sample_array[1:]) * 100, ' epsilon: ', epsilon, \ 'u.mean: ', U, 'acceptance rate: ', acceptance_rate) if acceptance_rate < ar_cutoff: break - + # 5: Resampling if number of accepted particles greater than resample if accept >= resample and U > 1e-100: ## Weighted resampling: - weight = np.exp(-smooth_distances*delta/U) - weight = weight/sum(weight) - index_resampled = self.rng.choice(np.arange(n_samples), n_samples, replace = 1, p = weight) - accepted_parameters = accepted_parameters[index_resampled,:] + weight = np.exp(-smooth_distances * delta / U) + weight = weight / sum(weight) + index_resampled = self.rng.choice(np.arange(n_samples), n_samples, replace=1, p=weight) + accepted_parameters = accepted_parameters[index_resampled, :] smooth_distances = smooth_distances[index_resampled] - + ## Update U and epsilon: - epsilon = epsilon*(1-delta) + epsilon = epsilon * (1 - delta) U = np.mean(smooth_distances) - epsilon = self._schedule(U,v) - + epsilon = self._schedule(U, v) + ## Print effective sampling size - print('Resampling: Effective sampling size: ', 1/sum(pow(weight/sum(weight),2))) - accept = 0 + print('Resampling: Effective sampling size: ', 1 / sum(pow(weight / sum(weight), 2))) + accept = 0 samples_until = 0 - - #Add epsilon_arr, number of final steps and final output to the journal + + # Add epsilon_arr, number of final steps and final output to the journal # print("INFO: Saving final configuration to output journal.") if full_output == 0: journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) - journal.configuration["steps"] = aStep + 1 + journal.configuration["steps"] = aStep + 1 journal.configuration["epsilon"] = epsilon return journal - - + def _smoother_distance(self, distance, old_distance): """Smooths the distance using the Equation 14 of [1]. - - [1] C. Albert, H. R. Kuensch and A. Scheidegger. A Simulated Annealing Approach to - Approximate Bayes Computations. Statistics and Computing 0960-3174 (2014). + + [1] C. Albert, H. R. Kuensch and A. Scheidegger. A Simulated Annealing Approach to + Approximate Bayes Computations. Statistics and Computing 0960-3174 (2014). + + Parameters + ---------- + distance: numpy.ndarray + Current distance between the simulated and observed data + old_distance: numpy.ndarray + Last distance between the simulated and observed data + + Returns + ------- + numpy.ndarray + Smoothed distance + """ - - smoothed_distance = np.zeros(shape=(len(distance),)) - - for ind in range(0,len(distance)): + + smoothed_distance = np.zeros(shape=(len(distance),)) + + for ind in range(0, len(distance)): if distance[ind] < np.min(old_distance): - smoothed_distance[ind] = (distance[ind]/np.min(old_distance))/len(old_distance) + smoothed_distance[ind] = (distance[ind] / np.min(old_distance)) / len(old_distance) else: - smoothed_distance[ind] = np.mean(np.array(old_distance) 1: - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] else: new_theta = self.kernel.sample(1) theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: break y_sim = self.model.simulate(self.n_samples_per_param) - distance = self.distance.distance(self.observations_bds.value(), y_sim) - smooth_distance = self._smoother_distance_remote([distance],self.all_distances_bds.value()) - + distance = self.distance.distance(self.observations_bds.value(), y_sim) + smooth_distance = self._smoother_distance([distance], self.all_distances_bds.value()) + ## Calculate acceptance probability: - ratio_prior_prob = self.model.prior.pdf(new_theta)/self.model.prior.pdf(self.accepted_parameters_bds.value()[index,:]) + ratio_prior_prob = self.model.prior.pdf(new_theta) / self.model.prior.pdf( + self.accepted_parameters_bds.value()[index, :]) ratio_likelihood_prob = np.exp((self.smooth_distances_bds.value()[index] - smooth_distance) / self.epsilon) - acceptance_prob = ratio_prior_prob*ratio_likelihood_prob - + acceptance_prob = ratio_prior_prob * ratio_likelihood_prob + ## If accepted if rng.rand(1) < acceptance_prob: acceptance = 1 else: distance = np.inf - + return (new_theta, distance, all_parameters, all_distances, index, acceptance) -class ABCsubsim: +class ABCsubsim(BaseAnnealing, InferenceMethod): """This base class implements Approximate Bayesian Computation by subset simulation (ABCsubsim) algorithm of [1]. - + [1] M. Chiachio, J. L. Beck, J. Chiachio, and G. Rus., Approximate Bayesian computation by subset simulation. SIAM J. Sci. Comput., 36(3):A1339–A1358, 2014/10/03 2014. - + Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance used to compare the simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ - def __init__(self, model, distance, kernel, backend, seed=None): + + model = None + distance = None + kernel = None + rng = None + anneal_parameter = None + + n_samples = None + n_samples_per_param = None + chain_length = None + + observations_bds = None + accepted_parameters_bds = None + accepted_cov_mat_bds = None + + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend self.rng = np.random.RandomState(seed) - + self.anneal_parameter = None + + + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = None + self.accepted_parameters_bds = None + self.accepted_cov_mat_bds = None + def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, chain_length = 10, ap_change_cutoff = 10, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : numpy.ndarray + observations : numpy.ndarray Observed data. - steps : integer - Number of iterations in the sequential algoritm ("generations") - n_samples : integer, optional - Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional - Number of data points in each simulated data set. The default value is 1. - chain_length : integer, optional - Chain length of the MCMC. n_samples should be divisable by chain_length. The default value is 10. + steps : integer + Number of iterations in the sequential algoritm ("generations") ap_change_cutoff : float, optional - The cutoff value for the percentage change in the anneal parameter. If the change is less than + The cutoff value for the percentage change in the anneal parameter. If the change is less than ap_change_cutoff the iterations are stopped. The default value is 10. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ - + + self.observations_bds = self.backend.broadcast(observations) + self.chain_length = chain_length + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) journal.configuration["type_kernel_func"] = type(self.kernel) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param - journal.configuration["chain_length"] = chain_length + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param + journal.configuration["chain_length"] = self.chain_length journal.configuration["ap_change_cutoff"] = ap_change_cutoff journal.configuration["full_output"] = full_output - + accepted_parameters = None - accepted_weights = np.ones(shape=(n_samples,1)) - accepted_cov_mat = None + accepted_weights = np.ones(shape=(n_samples, 1)) + accepted_cov_mat = None anneal_parameter = 0 anneal_parameter_old = 0 temp_chain_length = 1 - - # Initialize variables that need to be available remotely - rc = _RemoteContextABCsubsim(self.backend, self.model, self.distance, self.kernel, observations, n_samples, n_samples_per_param, chain_length) - - for aStep in range(0,steps): - # main ABCsubsim algorithm + + + for aStep in range(0, steps): + # main ABCsubsim algorithm # print("INFO: Initialization of ABCsubsim") - seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=int(n_samples/temp_chain_length), dtype=np.uint32) - index_arr = np.linspace(0,n_samples/temp_chain_length-1,n_samples/temp_chain_length).astype(int).reshape(int(n_samples/temp_chain_length),) - seed_index_arr = np.column_stack((seed_arr,index_arr)) - seed_index_pds = self.backend.parallelize(seed_index_arr) - + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=int(n_samples / temp_chain_length), + dtype=np.uint32) + index_arr = np.linspace(0, n_samples / temp_chain_length - 1, n_samples / temp_chain_length).astype( + int).reshape(int(n_samples / temp_chain_length), ) + seed_and_index_arr = np.column_stack((seed_arr, index_arr)) + seed_and_index_pds = self.backend.parallelize(seed_and_index_arr) # 0: update remotely required variables # print("INFO: Broadcasting parameters.") - rc._update_broadcasts(self.backend, accepted_parameters, accepted_cov_mat) + self._update_broadcasts(accepted_parameters, accepted_cov_mat) # 1: Calculate parameters # print("INFO: Initial accepted parameter parameters") - params_and_dists_pds = self.backend.map(rc._accept_parameter, seed_index_pds) + params_and_dists_pds = self.backend.map(self._accept_parameter, seed_and_index_pds) params_and_dists = self.backend.collect(params_and_dists_pds) new_parameters, new_distances = [list(t) for t in zip(*params_and_dists)] - accepted_parameters = np.concatenate(new_parameters) + accepted_parameters = np.concatenate(new_parameters) distances = np.concatenate(new_distances) - # 2: Sort and renumber samples + # 2: Sort and renumber samples SortIndex = sorted(range(len(distances)), key=lambda k: distances[k]) distances = distances[SortIndex] - accepted_parameters = accepted_parameters[SortIndex,:] - + accepted_parameters = accepted_parameters[SortIndex, :] + # 3: Calculate and broadcast annealling parameters temp_chain_length = chain_length if aStep > 0: anneal_parameter_old = anneal_parameter - anneal_parameter = 0.5*(distances[int(n_samples/temp_chain_length)]+distances[int(n_samples/temp_chain_length)+1]) - rc.anneal_parameter = anneal_parameter - + anneal_parameter = 0.5 * ( + distances[int(n_samples / temp_chain_length)] + distances[int(n_samples / temp_chain_length) + 1]) + self.anneal_parameter = anneal_parameter + # 4: Update proposal covariance matrix (Parallelized) if aStep == 0: accepted_cov_mat = np.cov(accepted_parameters, rowvar=False) else: - accepted_cov_mat = pow(2,1)*accepted_cov_mat - rc._update_broadcasts(self.backend, accepted_parameters, accepted_cov_mat) - + accepted_cov_mat = pow(2, 1) * accepted_cov_mat + self._update_broadcasts(accepted_parameters, accepted_cov_mat) + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=10, dtype=np.uint32) - index_arr = np.linspace(0,10-1,10).astype(int).reshape(10,) - seed_index_arr = np.column_stack((seed_arr,index_arr)) - seed_index_pds = self.backend.parallelize(seed_index_arr) - - cov_mat_index_pds = self.backend.map(rc._update_cov_mat, seed_index_pds) + index_arr = np.linspace(0, 10 - 1, 10).astype(int).reshape(10, ) + seed_and_index_arr = np.column_stack((seed_arr, index_arr)) + seed_and_index_pds = self.backend.parallelize(seed_and_index_arr) + + cov_mat_index_pds = self.backend.map(self._update_cov_mat, seed_and_index_pds) cov_mat_index = self.backend.collect(cov_mat_index_pds) cov_mat, T, accept_index = [list(t) for t in zip(*cov_mat_index)] - + for ind in range(10): if accept_index[ind] == 1: accepted_cov_mat = cov_mat[ind] break - + # print("INFO: Saving intermediate configuration to output journal.") if full_output == 1: journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) # Show progress - anneal_parameter_change_percentage = 100*abs(anneal_parameter_old-anneal_parameter)/anneal_parameter - print('Steps: ', aStep, 'annealing parameter: ', anneal_parameter, 'change (%) in annealing parameter: ', anneal_parameter_change_percentage ) + anneal_parameter_change_percentage = 100 * abs(anneal_parameter_old - anneal_parameter) / anneal_parameter + print('Steps: ', aStep, 'annealing parameter: ', anneal_parameter, 'change (%) in annealing parameter: ', + anneal_parameter_change_percentage) if anneal_parameter_change_percentage < ap_change_cutoff: break - - #Add anneal_parameter, number of final steps and final output to the journal + # Add anneal_parameter, number of final steps and final output to the journal # print("INFO: Saving final configuration to output journal.") if full_output == 0: journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) - journal.configuration["steps"] = aStep+1 + journal.configuration["steps"] = aStep + 1 journal.configuration["anneal_parameter"] = anneal_parameter return journal -class _RemoteContextABCsubsim: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, distance, kernel, observations, n_samples, n_samples_per_param, chain_length): - self.model = model - self.distance = distance - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - self.chain_length = chain_length - - self.anneal_parameter = None - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.accepted_parameters_bds = None - self.accepted_cov_mat_bds = None - - - def _update_broadcasts(self, backend, accepted_parameters, accepted_cov_mat): + def _update_broadcasts(self, accepted_parameters, accepted_cov_mat): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - + # bc.destroy + if not accepted_parameters is None: - self.accepted_parameters_bds = backend.broadcast(accepted_parameters) + self.accepted_parameters_bds = self.backend.broadcast(accepted_parameters) if not accepted_cov_mat is None: - self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat) - + self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) # define helper functions for map step - def _accept_parameter(self, seed_index): + def _accept_parameter(self, seed_and_index): """ Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is - smaller than eplison. - - :type seed: int - :rtype: np.array - :return: accepted parameter + smaller than epsilon. + + Parameters + ---------- + seed: numpy.ndarray + 2 dimensional array. The first entry defines the initial seed of therandom number generator. + The second entry defines the index in the data set. + + Returns + ------- + numpy.ndarray + accepted parameter """ - seed = seed_index[0] - index = seed_index[1] + + seed = seed_and_index[0] + index = seed_and_index[1] rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - result_theta = [] - result_distance = [] - + result_theta = [] + result_distance = [] + if self.accepted_parameters_bds == None: self.model.sample_from_prior() y_sim = self.model.simulate(self.n_samples_per_param) @@ -1196,10 +1462,10 @@ def _accept_parameter(self, seed_index): distance = self.distance.distance(self.observations_bds.value(), y_sim) result_theta.append(theta) result_distance.append(distance) - for ind in range(0,self.chain_length-1): + for ind in range(0, self.chain_length - 1): while True: self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: break @@ -1207,16 +1473,17 @@ def _accept_parameter(self, seed_index): new_distance = self.distance.distance(self.observations_bds.value(), y_sim) ## Calculate acceptance probability: - ratio_prior_prob = self.model.prior.pdf(new_theta)/self.model.prior.pdf(theta) + ratio_prior_prob = self.model.prior.pdf(new_theta) / self.model.prior.pdf(theta) self.kernel.set_parameters([new_theta, self.accepted_cov_mat_bds.value()]) kernel_numerator = self.kernel.pdf(theta) - self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) + self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) kernel_denominator = self.kernel.pdf(new_theta) - ratio_likelihood_prob = kernel_numerator/kernel_denominator - acceptance_prob = min(1,ratio_prior_prob*ratio_likelihood_prob)*(new_distance=0.3: - return(accepted_cov_mat_transformed, t, 1) + if acceptance / 10 <= 0.5 and acceptance / 10 >= 0.3: + return (accepted_cov_mat_transformed, t, 1) else: - return(accepted_cov_mat_transformed, t, 0) - -class RSMCABC: - """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of - Drovandi and Pettitt [1]. - + return (accepted_cov_mat_transformed, t, 0) + +class RSMCABC(BaseAdaptivePopulationMC, InferenceMethod): + """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of + Drovandi and Pettitt [1]. + [1] CC. Drovandi CC and AN. Pettitt, Estimation of parameters for macroparasite population evolution using approximate Bayesian computation. Biometrics 67(1):225–233, 2011. - + Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance measure used to compare simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ - def __init__(self, model, distance, kernel, backend, seed=None): + + model = None + distance = None + kernel = None + + R = None + rng = None + + n_samples = None + n_samples_per_param = None + alpha = None + + observations_bds = None + accepted_parameters_bds = None + accepted_dist_bds = None + accepted_cov_mat_bds = None + + + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend + + self.R=None self.rng = np.random.RandomState(seed) - + + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = None + self.accepted_parameters_bds = None + self.accepted_dist_bds = None + self.accepted_cov_mat_bds = None + def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, alpha = 0.1, epsilon_init = 100, epsilon_final = 0.1, const = 1, covFactor = 2.0, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : numpy.ndarray + observations : numpy.ndarray Observed data. - steps : integer - Number of iterations in the sequential algoritm ("generations") + steps : integer + Number of iterations in the sequential algoritm ("generations") n_samples : integer, optional Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional + n_samples_per_param : integer, optional Number of data points in each simulated data set. The default value is 1. alpha : float, optional A parameter taking values between [0,1], the default value is 0.1. @@ -1313,169 +1628,153 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 Initial value of threshold, the default is 100 epsilon_final : float, optional Terminal value of threshold, the default is 0.1 - const : float, optional + const : float, optional A constant to compute acceptance probabilty - covFactor : float, optional + covFactor : float, optional scaling parameter of the covariance matrix. The default value is 2. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ - + + self.observations_bds = self.backend.broadcast(observations) + self.alpha = alpha + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param journal.configuration["steps"] = steps - + accepted_parameters = None accepted_cov_mat = None accepted_dist = None - - # Initialize variables that need to be available remotely - rc = _RemoteContextRSMCABC(self.backend, self.model, self.distance, self.kernel, observations, n_samples, n_samples_per_param, alpha) # main RSMCABC algorithm # print("INFO: Starting RSMCABC iterations.") for aStep in range(steps): - - # 0: Compute epsilon, compute new covariance matrix for Kernel, + + # 0: Compute epsilon, compute new covariance matrix for Kernel, # and finally Drawing new new/perturbed samples using prior or MCMC Kernel # print("DEBUG: Iteration " + str(aStep) + " of RSMCABC algorithm.") - if aStep == 0: + if aStep == 0: n_replenish = n_samples - # Compute epsilon + # Compute epsilon epsilon = [epsilon_init] R = int(1) else: - n_replenish = round(n_samples*alpha) - # Throw away N_alpha particles with largest dist - accepted_parameters = np.delete(accepted_parameters, np.arange(round(n_samples*alpha))+(n_samples-round(n_samples*alpha)), 0) - accepted_dist = np.delete(accepted_dist, np.arange(round(n_samples*alpha))+(n_samples-round(n_samples*alpha)), 0) + n_replenish = round(n_samples * alpha) + # Throw away N_alpha particles with largest dist + accepted_parameters = np.delete(accepted_parameters, np.arange(round(n_samples * alpha)) + ( + self.n_samples - round(n_samples * alpha)), 0) + accepted_dist = np.delete(accepted_dist, + np.arange(round(n_samples * alpha)) + (n_samples - round(n_samples * alpha)), + 0) # Compute epsilon epsilon.append(accepted_dist[-1]) # Calculate covariance # print("INFO: Calculating covariance matrix.") - new_cov_mat = covFactor * np.cov(accepted_parameters, rowvar=False) - accepted_cov_mat = new_cov_mat - - + new_cov_mat = covFactor * np.cov(accepted_parameters, rowvar=False) + accepted_cov_mat = new_cov_mat + if epsilon[-1] < epsilon_final: break - - seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size = n_replenish, dtype=np.uint32) + + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_replenish, dtype=np.uint32) seed_pds = self.backend.parallelize(seed_arr) - #update remotely required variables + # update remotely required variables # print("INFO: Broadcasting parameters.") - rc.epsilon = epsilon - rc.R = R + self.epsilon = epsilon + self.R = R # Broadcast updated variable - rc._update_broadcasts(self.backend, accepted_parameters, accepted_dist, accepted_cov_mat) + self._update_broadcasts(accepted_parameters, accepted_dist, accepted_cov_mat) - #calculate resample parameters - #print("INFO: Resampling parameters") - params_and_dist_index_pds = self.backend.map(rc._accept_parameter, seed_pds) + # calculate resample parameters + # print("INFO: Resampling parameters") + params_and_dist_index_pds = self.backend.map(self._accept_parameter, seed_pds) params_and_dist_index = self.backend.collect(params_and_dist_index_pds) new_parameters, new_dist, new_index = [list(t) for t in zip(*params_and_dist_index)] new_parameters = np.array(new_parameters) new_dist = np.array(new_dist) new_index = np.array(new_index) - + # 1: Update all parameters, compute acceptance probability, compute epsilon - if len(new_dist) == n_samples: + if len(new_dist) == self.n_samples: accepted_parameters = new_parameters - accepted_dist = new_dist + accepted_dist = new_dist else: - accepted_parameters = np.concatenate((accepted_parameters,new_parameters)) + accepted_parameters = np.concatenate((accepted_parameters, new_parameters)) accepted_dist = np.concatenate((accepted_dist, new_dist)) - # 2: Compute acceptance probabilty and set R - #print(aStep) - #print(new_index) - prob_acceptance = sum(new_index)/(R*n_replenish) + # 2: Compute acceptance probabilty and set R + # print(aStep) + # print(new_index) + prob_acceptance = sum(new_index) / (R * n_replenish) if prob_acceptance == 1 or prob_acceptance == 0: R = 1 else: - R = int(np.log(const)/np.log(1-prob_acceptance)) - - + R = int(np.log(const) / np.log(1 - prob_acceptance)) + # print("INFO: Saving configuration to output journal.") - if (full_output == 1 and aStep <= steps-1) or (full_output == 0 and aStep == steps-1): + if (full_output == 1 and aStep <= steps - 1) or (full_output == 0 and aStep == steps - 1): journal.add_parameters(accepted_parameters) - journal.add_weights(np.ones(shape=(n_samples,1))*(1/n_samples)) + journal.add_weights(np.ones(shape=(n_samples, 1)) * (1 / n_samples)) - #Add epsilon_arr to the journal + # Add epsilon_arr to the journal journal.configuration["epsilon_arr"] = epsilon return journal - - - -class _RemoteContextRSMCABC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, distance, kernel, observations, n_samples, n_samples_per_param, alpha): - self.model = model - self.distance = distance - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - self.alpha = alpha - - self.epsilon = None - self.R = None - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.accepted_parameters_bds = None - self.accepted_dist_bds = None - self.accepted_cov_mat_bds = None - def _update_broadcasts(self, backend, accepted_parameters, accepted_dist, accepted_cov_mat): + def _update_broadcasts(self, accepted_parameters, accepted_dist, accepted_cov_mat): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - + # bc.destroy + if not accepted_parameters is None: - self.accepted_parameters_bds = backend.broadcast(accepted_parameters) + self.accepted_parameters_bds = self.backend.broadcast(accepted_parameters) if not accepted_dist is None: - self.accepted_dist_bds = backend.broadcast(accepted_dist) + self.accepted_dist_bds = self.backend.broadcast(accepted_dist) if not accepted_cov_mat is None: - self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat) + self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) # define helper functions for map step def _accept_parameter(self, seed): """ Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is - smaller than eplison. - - :type seed: int - :rtype: np.array - :return: accepted parameter + smaller than epsilon. + + Parameters + ---------- + seed: integer + Initial seed for the random number generator. + + Returns + ------- + numpy.ndarray + accepted parameter """ rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - + distance = self.distance.dist_max() if self.accepted_parameters_bds == None: while distance > self.epsilon[-1]: self.model.sample_from_prior() y_sim = self.model.simulate(self.n_samples_per_param) - distance = self.distance.distance(self.observations_bds.value(), y_sim) + distance = self.distance.distance(self.observations_bds.value(), y_sim) index_accept = 1 else: index = rng.choice(len(self.accepted_parameters_bds.value()), size=1) @@ -1484,92 +1783,125 @@ def _accept_parameter(self, seed): for ind in range(self.R): while True: self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: - break + break y_sim = self.model.simulate(self.n_samples_per_param) distance = self.distance.distance(self.observations_bds.value(), y_sim) - ratio_prior_prob = self.model.prior.pdf(new_theta)/self.model.prior.pdf(theta) + ratio_prior_prob = self.model.prior.pdf(new_theta) / self.model.prior.pdf(theta) self.kernel.set_parameters([new_theta, self.accepted_cov_mat_bds.value()]) kernel_numerator = self.kernel.pdf(theta) - self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) + self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) kernel_denominator = self.kernel.pdf(new_theta) - ratio_kernel_prob = kernel_numerator/kernel_denominator - probability_acceptance = min(1,ratio_prior_prob*ratio_kernel_prob) - if distance < self.epsilon[-1] and rng.binomial(1,probability_acceptance) == 1: + ratio_kernel_prob = kernel_numerator / kernel_denominator + probability_acceptance = min(1, ratio_prior_prob * ratio_kernel_prob) + if distance < self.epsilon[-1] and rng.binomial(1, probability_acceptance) == 1: index_accept += 1 else: self.model.set_parameters(theta) distance = self.accepted_dist_bds.value()[index[0]] - + return (self.model.get_parameters(), distance, index_accept) -class APMCABC: - """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of - M. Lenormand et al. [1]. - +class APMCABC(BaseAdaptivePopulationMC, InferenceMethod): + """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of + M. Lenormand et al. [1]. + [1] M. Lenormand, F. Jabot and G. Deffuant, Adaptive approximate Bayesian computation for complex models. Computational Statistics, 28:2777–2796, 2013. - + Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance measure used to compare simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ - def __init__(self, model, distance, kernel, backend, seed=None): + + model = None + distance = None + kernel = None + + epsilon = None + rng = None + + n_samples = None + n_samples_per_param = None + alpha = None + + observations_bds = None + accepted_parameters_bds = None + accepted_weights_bds = None + accepted_dist = None + accepted_cov_mat_bds = None + + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend + + self.epsilon= None self.rng = np.random.RandomState(seed) - + + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = None + self.accepted_parameters_bds = None + self.accepted_weights_bds = None + self.accepted_dist = None + self.accepted_cov_mat_bds = None + def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, alpha = 0.9, acceptance_cutoff = 0.2, covFactor = 2.0, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : numpy.ndarray + observations : numpy.ndarray Observed data. - steps : integer - Number of iterations in the sequential algoritm ("generations") + steps : integer + Number of iterations in the sequential algoritm ("generations") n_samples : integer, optional Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional + n_samples_per_param : integer, optional Number of data points in each simulated data set. The default value is 1. alpha : float, optional A parameter taking values between [0,1], the default value is 0.1. acceptance_cutoff : float, optional Acceptance ratio cutoff, The default value is 0.2 - covFactor : float, optional + covFactor : float, optional scaling parameter of the covariance matrix. The default value is 2. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ - + + self.observations_bds = self.backend.broadcast(observations) + self.alpha = alpha + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param journal.configuration["steps"] = steps - + accepted_parameters = None accepted_weights = None accepted_cov_mat = None @@ -1577,37 +1909,35 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 alpha_accepted_parameters = None alpha_accepted_weights = None alpha_accepted_dist = None - - # Initialize variables that need to be available remotely - rc = _RemoteContextAPMCABC(self.backend, self.model, self.distance, self.kernel, observations, n_samples, n_samples_per_param, alpha) # main APMCABC algorithm # print("INFO: Starting APMCABC iterations.") for aStep in range(steps): - + # 0: Drawing new new/perturbed samples using prior or MCMC Kernel # print("DEBUG: Iteration " + str(aStep) + " of APMCABC algorithm.") if aStep > 0: - n_additional_samples = n_samples - round(n_samples*alpha) + n_additional_samples = n_samples - round(n_samples * alpha) else: n_additional_samples = n_samples - - seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size = n_additional_samples, dtype=np.uint32) + + seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_additional_samples, dtype=np.uint32) seed_pds = self.backend.parallelize(seed_arr) - #update remotely required variables + # update remotely required variables # print("INFO: Broadcasting parameters.") - rc._update_broadcasts(self.backend, alpha_accepted_parameters, alpha_accepted_weights, alpha_accepted_dist, accepted_cov_mat) + self._update_broadcasts(alpha_accepted_parameters, alpha_accepted_weights, alpha_accepted_dist, + accepted_cov_mat) - #calculate resample parameters - #print("INFO: Resampling parameters") - params_and_dist_weights_pds = self.backend.map(rc._accept_parameter, seed_pds) + # calculate resample parameters + # print("INFO: Resampling parameters") + params_and_dist_weights_pds = self.backend.map(self._accept_parameter, seed_pds) params_and_dist_weights = self.backend.collect(params_and_dist_weights_pds) new_parameters, new_dist, new_weights = [list(t) for t in zip(*params_and_dist_weights)] new_parameters = np.array(new_parameters) new_dist = np.array(new_dist) - new_weights = np.array(new_weights).reshape(n_additional_samples,1) - + new_weights = np.array(new_weights).reshape(n_additional_samples, 1) + # 1: Update all parameters, compute acceptance probability, compute epsilon if len(new_weights) == n_samples: accepted_parameters = new_parameters @@ -1616,29 +1946,30 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 # Compute acceptance probability prob_acceptance = 1 # Compute epsilon - epsilon = [np.percentile(accepted_dist, alpha*100)] + epsilon = [np.percentile(accepted_dist, alpha * 100)] else: - accepted_parameters = np.concatenate((alpha_accepted_parameters,new_parameters)) + accepted_parameters = np.concatenate((alpha_accepted_parameters, new_parameters)) accepted_dist = np.concatenate((alpha_accepted_dist, new_dist)) accepted_weights = np.concatenate((alpha_accepted_weights, new_weights)) # Compute acceptance probability - prob_acceptance = sum(new_dist < epsilon[-1])/len(new_dist) + prob_acceptance = sum(new_dist < epsilon[-1]) / len(new_dist) # Compute epsilon - epsilon.append(np.percentile(accepted_dist, alpha*100)) - + epsilon.append(np.percentile(accepted_dist, alpha * 100)) + # 2: Update alpha_parameters, alpha_dist and alpha_weights index_alpha = accepted_dist < epsilon[-1] - alpha_accepted_parameters = accepted_parameters[index_alpha,:] - alpha_accepted_weights = accepted_weights[index_alpha]/sum(accepted_weights[index_alpha]) + alpha_accepted_parameters = accepted_parameters[index_alpha, :] + alpha_accepted_weights = accepted_weights[index_alpha] / sum(accepted_weights[index_alpha]) alpha_accepted_dist = accepted_dist[index_alpha] - + # 3: calculate covariance # print("INFO: Calculating covariance matrix.") - new_cov_mat = covFactor * np.cov(alpha_accepted_parameters, aweights = alpha_accepted_weights.reshape(-1), rowvar=False) - accepted_cov_mat = new_cov_mat + new_cov_mat = covFactor * np.cov(alpha_accepted_parameters, aweights=alpha_accepted_weights.reshape(-1), + rowvar=False) + accepted_cov_mat = new_cov_mat # print("INFO: Saving configuration to output journal.") - if (full_output == 1 and aStep <= steps-1) or (full_output == 0 and aStep == steps-1): + if (full_output == 1 and aStep <= steps - 1) or (full_output == 0 and aStep == steps - 1): journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) @@ -1646,352 +1977,374 @@ def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1 if prob_acceptance < acceptance_cutoff: break - #Add epsilon_arr to the journal + # Add epsilon_arr to the journal journal.configuration["epsilon_arr"] = epsilon return journal - - - -class _RemoteContextAPMCABC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, distance, kernel, observations, n_samples, n_samples_per_param, alpha): - self.model = model - self.distance = distance - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - self.alpha = alpha - - self.epsilon = None - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.alpha_accepted_parameters_bds = None - self.alpha_accepted_weights_bds = None - self.alpha_accepted_dist = None - self.accepted_cov_mat_bds = None - - def _update_broadcasts(self, backend, alpha_accepted_parameters, alpha_accepted_weights, alpha_accepted_dist, accepted_cov_mat): + def _update_broadcasts(self, accepted_parameters, accepted_weights, accepted_dist, + accepted_cov_mat): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - - if not alpha_accepted_parameters is None: - self.alpha_accepted_parameters_bds = backend.broadcast(alpha_accepted_parameters) - if not alpha_accepted_weights is None: - self.alpha_accepted_weights_bds = backend.broadcast(alpha_accepted_weights) - if not alpha_accepted_dist is None: - self.alpha_accepted_dist_bds = backend.broadcast(alpha_accepted_dist) + # bc.destroy + + if not accepted_parameters is None: + self.accepted_parameters_bds = self.backend.broadcast(accepted_parameters) + if not accepted_weights is None: + self.accepted_weights_bds = self.backend.broadcast(accepted_weights) + if not accepted_dist is None: + self.accepted_dist_bds = self.backend.broadcast(accepted_dist) if not accepted_cov_mat is None: - self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat) + self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) # define helper functions for map step def _accept_parameter(self, seed): """ Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is - smaller than eplison. - - :type seed: int - :rtype: np.array - :return: accepted parameter + smaller than epsilon. + + Parameters + ---------- + seed: integer + Initial seed for the random number generator. + + Returns + ------- + numpy.ndarray + accepted parameter """ rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - - if self.alpha_accepted_parameters_bds == None: + + if self.accepted_parameters_bds == None: self.model.sample_from_prior() y_sim = self.model.simulate(self.n_samples_per_param) dist = self.distance.distance(self.observations_bds.value(), y_sim) weight = 1.0 else: - index = rng.choice(len(self.alpha_accepted_weights_bds.value()), size=1, p=self.alpha_accepted_weights_bds.value().reshape(-1)) - theta = self.alpha_accepted_parameters_bds.value()[index[0]] + index = rng.choice(len(self.accepted_weights_bds.value()), size=1, + p=self.accepted_weights_bds.value().reshape(-1)) + theta = self.accepted_parameters_bds.value()[index[0]] # trucate the normal to the bounds of parameter space of the model # truncating the normal like this is fine: https://arxiv.org/pdf/0907.4010v1.pdf while True: self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: break y_sim = self.model.simulate(self.n_samples_per_param) dist = self.distance.distance(self.observations_bds.value(), y_sim) - + prior_prob = self.model.prior.pdf(new_theta) denominator = 0.0 - for i in range(0, len(self.alpha_accepted_weights_bds.value())): - self.kernel.set_parameters([self.alpha_accepted_parameters_bds.value()[i,:], self.accepted_cov_mat_bds.value()]) + for i in range(0, len(self.accepted_weights_bds.value())): + self.kernel.set_parameters( + [self.accepted_parameters_bds.value()[i, :], self.accepted_cov_mat_bds.value()]) pdf_value = self.kernel.pdf(new_theta) - denominator += self.alpha_accepted_weights_bds.value()[i,0] * pdf_value - weight = 1.0 * prior_prob / denominator - - return (self.model.get_parameters(), dist, weight) + denominator += self.accepted_weights_bds.value()[i, 0] * pdf_value + weight = 1.0 * prior_prob / denominator + + return (self.model.get_parameters(), dist, weight) +class SMCABC(BaseAdaptivePopulationMC, InferenceMethod): + """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of + Del Moral et al. [1]. -class SMCABC: - """This base class implements Adaptive Population Monte Carlo Approximate Bayesian computation of - Del Moral et al. [1]. - [1] P. Del Moral, A. Doucet, A. Jasra, An adaptive sequential Monte Carlo method for approximate Bayesian computation. Statistics and Computing, 22(5):1009–1020, 2012. - + Parameters ---------- model : abcpy.models.Model - Model object that conforms to the Model class. + Model object defining the model to be used. distance : abcpy.distances.Distance - Distance object that conforms to the Distance class. + Distance object defining the distance measure used to compare simulated and observed data sets. kernel : abcpy.distributions.Distribution - Distribution object defining the perturbation kernel needed for the sampling + Distribution object defining the perturbation kernel needed for the sampling. backend : abcpy.backends.Backend - Backend object that conforms to the Backend class. + Backend object defining the backend to be used. seed : integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ - def __init__(self, model, distance, kernel, backend, seed=None): + + model = None + distance = None + kernel = None + + epsilon = None + rng = None + + n_samples = None + n_samples_per_param = None + + observations_bds = None + accepted_parameters_bds = None + accepted_weights_bds = None + accepted_cov_mat_bds = None + accepted_y_sim_bds = None + + def __init__(self, model, distance, kernel, backend, seed=None): self.model = model self.distance = distance self.kernel = kernel self.backend = backend + + self.epsilon = None self.rng = np.random.RandomState(seed) - + + # these are usually big tables, so we broadcast them to have them once + # per executor instead of once per task + self.observations_bds = None + self.accepted_parameters_bds = None + self.accepted_weights_bds = None + self.accepted_cov_mat_bds = None + self.accepted_y_sim_bds = None + def sample(self, observations, steps, n_samples = 10000, n_samples_per_param = 1, epsilon_final = 0.1, alpha = 0.95, covFactor = 2, resample = None, full_output=0): - """Samples from the posterior distribution of the model parameter given the observed + """Samples from the posterior distribution of the model parameter given the observed data observations. - + Parameters ---------- - observations : numpy.ndarray + observations : numpy.ndarray Observed data. - steps : integer - Number of iterations in the sequential algoritm ("generations") + steps : integer + Number of iterations in the sequential algoritm ("generations") epsilon_final : float, optional The final threshold value of epsilon to be reached. The default value is 0.1. n_samples : integer, optional Number of samples to generate. The default value is 10000. - n_samples_per_param : integer, optional + n_samples_per_param : integer, optional Number of data points in each simulated data set. The default value is 1. alpha : float, optional A parameter taking values between [0,1], determinining the rate of change of the threshold epsilon. The default value is 0.5. - covFactor : float, optional + covFactor : float, optional scaling parameter of the covariance matrix. The default value is 2. full_output: integer, optional - If full_output==1, intermediate results are included in output journal. + If full_output==1, intermediate results are included in output journal. The default value is 0, meaning the intermediate results are not saved. Returns - ------- + ------- abcpy.output.Journal A journal containing simulation results, metadata and optionally intermediate results. """ - + + self.observations_bds= self.backend.broadcast(observations) + self.n_samples = n_samples + self.n_samples_per_param = n_samples_per_param + journal = Journal(full_output) journal.configuration["type_model"] = type(self.model) journal.configuration["type_dist_func"] = type(self.distance) - journal.configuration["n_samples"] = n_samples - journal.configuration["n_samples_per_param"] = n_samples_per_param + journal.configuration["n_samples"] = self.n_samples + journal.configuration["n_samples_per_param"] = self.n_samples_per_param journal.configuration["steps"] = steps - + accepted_parameters = None accepted_weights = None accepted_cov_mat = None accepted_y_sim = None - + # Define the resmaple parameter if resample == None: - resample = n_samples*0.5 - - #Define epsilon_init + resample = n_samples * 0.5 + + # Define epsilon_init epsilon = [10000] - - - # Initialize variables that need to be available remotely - rc = _RemoteContextSMCABC(self.backend, self.model, self.distance, self.kernel, observations, n_samples, n_samples_per_param) # main SMC ABC algorithm # print("INFO: Starting SMCABC iterations.") for aStep in range(0, steps): - + # Break if epsilon in previous step is less than epsilon_final if epsilon[-1] == epsilon_final: break - + # 0: Compute the Epsilon if accepted_y_sim != None: # Compute epsilon for next step fun = lambda epsilon_var: self._compute_epsilon(epsilon_var, \ - epsilon, observations, accepted_y_sim, accepted_weights, n_samples, n_samples_per_param, alpha) - epsilon_new = self._bisection(fun, epsilon_final, epsilon[-1], 0.001) + epsilon, observations, accepted_y_sim, accepted_weights, + n_samples, n_samples_per_param, alpha) + epsilon_new = self._bisection(fun, epsilon_final, epsilon[-1], 0.001) if epsilon_new < epsilon_final: epsilon_new = epsilon_final epsilon.append(epsilon_new) - # 1: calculate weights for new parameters + # 1: calculate weights for new parameters # print("INFO: Calculating weights.") if accepted_y_sim != None: - new_weights = np.zeros(shape=(n_samples),) + new_weights = np.zeros(shape=(n_samples), ) for ind1 in range(n_samples): numerator = 0.0 denominator = 0.0 for ind2 in range(n_samples_per_param): numerator += (self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon[-1]) - denominator += (self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon[-2]) + denominator += ( + self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon[-2]) if denominator != 0.0: - new_weights[ind1] = accepted_weights[ind1]*(numerator/denominator) + new_weights[ind1] = accepted_weights[ind1] * (numerator / denominator) else: new_weights[ind1] = 0 new_weights = new_weights / sum(new_weights) else: - new_weights = np.ones(shape=(n_samples),)*(1.0 /n_samples) - - # 2: Resample - if accepted_y_sim != None and pow(sum(pow(new_weights,2)),-1) < resample: + new_weights = np.ones(shape=(n_samples), ) * (1.0 / n_samples) + + # 2: Resample + if accepted_y_sim != None and pow(sum(pow(new_weights, 2)), -1) < resample: print('Resampling') # Weighted resampling: - index_resampled = self.rng.choice(np.arange(n_samples), n_samples, replace = 1, p = new_weights) - accepted_parameters = accepted_parameters[index_resampled,:] - new_weights = np.ones(shape=(n_samples),)*(1.0 /n_samples) - + index_resampled = self.rng.choice(np.arange(n_samples), n_samples, replace=1, p=new_weights) + accepted_parameters = accepted_parameters[index_resampled, :] + new_weights = np.ones(shape=(n_samples), ) * (1.0 / n_samples) + # Update the weights - accepted_weights = new_weights.reshape(len(new_weights),1) + accepted_weights = new_weights.reshape(len(new_weights), 1) # 3: Drawing new perturbed samples using MCMC Kernel # print("DEBUG: Iteration " + str(aStep) + " of SMCABC algorithm.") seed_arr = self.rng.randint(0, np.iinfo(np.uint32).max, size=n_samples, dtype=np.uint32) index_arr = np.arange(n_samples) - seed_index_arr = np.column_stack((seed_arr,index_arr)) - seed_index_pds = self.backend.parallelize(seed_index_arr) + seed_and_index_arr = np.column_stack((seed_arr, index_arr)) + seed_and_index_pds = self.backend.parallelize(seed_and_index_arr) - #update remotely required variables # print("INFO: Broadcasting parameters.") - rc.epsilon = epsilon - rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat, accepted_y_sim) + self.epsilon = epsilon + self._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat, accepted_y_sim) - #calculate resample parameters - #print("INFO: Resampling parameters") - params_and_ysim_pds = self.backend.map(rc._accept_parameter, seed_index_pds) + # calculate resample parameters + # print("INFO: Resampling parameters") + params_and_ysim_pds = self.backend.map(self._accept_parameter, seed_and_index_pds) params_and_ysim = self.backend.collect(params_and_ysim_pds) new_parameters, new_y_sim = [list(t) for t in zip(*params_and_ysim)] new_parameters = np.array(new_parameters) - #Update the parameters + # Update the parameters accepted_parameters = new_parameters accepted_y_sim = new_y_sim - + # 4: calculate covariance # print("INFO: Calculating covariance matrix.") - new_cov_mat = covFactor * np.cov(accepted_parameters, aweights = accepted_weights.reshape(-1), rowvar=False) - accepted_cov_mat = new_cov_mat + new_cov_mat = covFactor * np.cov(accepted_parameters, aweights=accepted_weights.reshape(-1), rowvar=False) + accepted_cov_mat = new_cov_mat # print("INFO: Saving configuration to output journal.") - if (full_output == 1 and aStep <= steps-1) or (full_output == 0 and aStep == steps-1): + if (full_output == 1 and aStep <= steps - 1) or (full_output == 0 and aStep == steps - 1): journal.add_parameters(accepted_parameters) journal.add_weights(accepted_weights) - #Add epsilon_arr to the journal + # Add epsilon_arr to the journal journal.configuration["epsilon_arr"] = epsilon return journal - - def _compute_epsilon(self, epsilon_new, epsilon, observations, accepted_y_sim, accepted_weights, n_samples, n_samples_per_param, alpha): - RHS = alpha*pow(sum(pow(accepted_weights,2)),-1) - LHS = np.zeros(shape=(n_samples),) + def _compute_epsilon(self, epsilon_new, epsilon, observations, accepted_y_sim, accepted_weights, n_samples, + n_samples_per_param, alpha): + """ + Parameters + ---------- + epsilon_new: float + New value for epsilon. + epsilon: float + Current threshold. + observations: numpy.ndarray + Observed data. + accepted_y_sim: numpy.ndarray + Accepted simulated data. + accepted_weights: numpy.ndarray + Accepted weights. + n_samples: integer + Number of samples to generate. + n_samples_per_param: integer + Number of data points in each simulated data set. + alpha: float + + Returns + ------- + float + Newly computed value for threshold. + """ + + RHS = alpha * pow(sum(pow(accepted_weights, 2)), -1) + LHS = np.zeros(shape=(n_samples), ) for ind1 in range(n_samples): numerator = 0.0 denominator = 0.0 for ind2 in range(n_samples_per_param): numerator += (self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon_new) denominator += (self.distance.distance(observations, [accepted_y_sim[ind1][ind2]]) < epsilon[-1]) - LHS[ind1] = accepted_weights[ind1]*(numerator/denominator) + LHS[ind1] = accepted_weights[ind1] * (numerator / denominator) if sum(LHS) == 0: result = RHS else: - LHS = LHS/sum(LHS) - LHS = pow(sum(pow(LHS,2)),-1) - result = RHS-LHS - return(result) - + LHS = LHS / sum(LHS) + LHS = pow(sum(pow(LHS, 2)), -1) + result = RHS - LHS + return (result) + def _bisection(self, func, low, high, tol): - midpoint = (low+high)/2.0 - while (high-low)/2.0 > tol: + midpoint = (low + high) / 2.0 + while (high - low) / 2.0 > tol: if func(midpoint) == 0: return midpoint - elif func(low)*func(midpoint) < 0: + elif func(low) * func(midpoint) < 0: high = midpoint - else : + else: low = midpoint - midpoint = (low+high)/2.0 - - return midpoint - - -class _RemoteContextSMCABC: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, distance, kernel, observations, n_samples, n_samples_per_param): - self.model = model - self.distance = distance - self.n_samples = n_samples - self.n_samples_per_param = n_samples_per_param - self.kernel = kernel - - self.epsilon = None + midpoint = (low + high) / 2.0 - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - self.accepted_parameters_bds = None - self.accepted_weights_bds = None - self.accepted_cov_mat_bds = None - self.accepted_y_sim_bds = None + return midpoint - def _update_broadcasts(self, backend, accepted_parameters, accepted_weights, accepted_cov_mat, accepted_y_sim): + def _update_broadcasts(self, accepted_parameters, accepted_weights, accepted_cov_mat, accepted_y_sim): def destroy(bc): if bc != None: bc.unpersist - #bc.destroy - + # bc.destroy + if not accepted_parameters is None: - self.accepted_parameters_bds = backend.broadcast(accepted_parameters) + self.accepted_parameters_bds = self.backend.broadcast(accepted_parameters) if not accepted_weights is None: - self.accepted_weights_bds = backend.broadcast(accepted_weights) + self.accepted_weights_bds = self.backend.broadcast(accepted_weights) if not accepted_cov_mat is None: - self.accepted_cov_mat_bds = backend.broadcast(accepted_cov_mat) + self.accepted_cov_mat_bds = self.backend.broadcast(accepted_cov_mat) if not accepted_y_sim is None: - self.accepted_y_sim_bds = backend.broadcast(accepted_y_sim) + self.accepted_y_sim_bds = self.backend.broadcast(accepted_y_sim) - # define helper functions for map step - def _accept_parameter(self, seed_index): + # define helper functions for map step + + def _accept_parameter(self, seed_and_index): """ Samples a single model parameter and simulate from it until distance between simulated outcome and the observation is - smaller than eplison. - - :type seed: int - :rtype: np.array - :return: accepted parameter + smaller than epsilon. + + Parameters + ---------- + seed_and_index: numpy.ndarray + 2 dimensional array. The first entry specifies the initial seed for the random number generator. + The second entry defines the index in the data set. + + Returns + ------- + Tuple + The first entry of the tuple is the accepted parameters. The second entry is the simulated data set. """ - seed = seed_index[0] - index = seed_index[1] + + seed = seed_and_index[0] + index = seed_and_index[1] rng = np.random.RandomState(seed) self.model.prior.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) self.kernel.reseed(rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - - #print("on seed " + str(seed) + " distance: " + str(distance) + " epsilon: " + str(self.epsilon)) + + # print("on seed " + str(seed) + " distance: " + str(distance) + " epsilon: " + str(self.epsilon)) if self.accepted_parameters_bds == None: self.model.sample_from_prior() y_sim = self.model.simulate(self.n_samples_per_param) @@ -2000,7 +2353,7 @@ def _accept_parameter(self, seed_index): theta = self.accepted_parameters_bds.value()[index] while True: self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) - new_theta = self.kernel.sample(1)[0,:] + new_theta = self.kernel.sample(1)[0, :] theta_is_accepted = self.model.set_parameters(new_theta) if theta_is_accepted and self.model.prior.pdf(self.model.get_parameters()) != 0: break @@ -2010,17 +2363,19 @@ def _accept_parameter(self, seed_index): numerator = 0.0 denominator = 0.0 for ind in range(self.n_samples_per_param): - numerator += (self.distance.distance(self.observations_bds.value(), [y_sim[ind]]) < self.epsilon[-1]) - denominator += (self.distance.distance(self.observations_bds.value(), [y_sim_old[ind]]) < self.epsilon[-1]) - ratio_data_epsilon = numerator/denominator - ratio_prior_prob = self.model.prior.pdf(new_theta)/self.model.prior.pdf(theta) + numerator += ( + self.distance.distance(self.observations_bds.value(), [y_sim[ind]]) < self.epsilon[-1]) + denominator += ( + self.distance.distance(self.observations_bds.value(), [y_sim_old[ind]]) < self.epsilon[-1]) + ratio_data_epsilon = numerator / denominator + ratio_prior_prob = self.model.prior.pdf(new_theta) / self.model.prior.pdf(theta) self.kernel.set_parameters([new_theta, self.accepted_cov_mat_bds.value()]) kernel_numerator = self.kernel.pdf(theta) - self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) + self.kernel.set_parameters([theta, self.accepted_cov_mat_bds.value()]) kernel_denominator = self.kernel.pdf(new_theta) - ratio_likelihood_prob = kernel_numerator/kernel_denominator - acceptance_prob = min(1,ratio_data_epsilon*ratio_prior_prob*ratio_likelihood_prob) - if rng.binomial(1,acceptance_prob) == 1: + ratio_likelihood_prob = kernel_numerator / kernel_denominator + acceptance_prob = min(1, ratio_data_epsilon * ratio_prior_prob * ratio_likelihood_prob) + if rng.binomial(1, acceptance_prob) == 1: self.model.set_parameters(new_theta) else: self.model.set_parameters(theta) @@ -2028,5 +2383,5 @@ def _accept_parameter(self, seed_index): else: self.model.set_parameters(self.accepted_parameters_bds.value()[index]) y_sim = self.accepted_y_sim_bds.value()[index] - - return (self.model.get_parameters(), y_sim) \ No newline at end of file + + return (self.model.get_parameters(), y_sim) diff --git a/abcpy/models.py b/abcpy/models.py index 9cd6179b..b2d85d28 100644 --- a/abcpy/models.py +++ b/abcpy/models.py @@ -28,7 +28,7 @@ def __init__(self, prior, seed = None): Optional initial seed for the random number generator that can be used in the model. The default value is generated randomly. """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -54,15 +54,15 @@ def set_parameters(self, theta): TRUE if model accepts the provided parameters, FALSE otherwise """ - raise NotImplemented + raise NotImplementedError @abstractmethod - def sample_from_prior(): + def sample_from_prior(self): """To be overwritten by any sub-class: should resample the model parameters from the prior distribution. """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -81,7 +81,7 @@ def simulate(self, k): An array containing k realizations of the model """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -94,7 +94,7 @@ def get_parameters(self): An array containing the p parameters of the model """ - raise NotImplemented + raise NotImplementedError class Gaussian(Model): @@ -167,13 +167,10 @@ def simulate(self, k): class Student_t(Model): """This class implements the Student_t distribution with unknown mean :math:`\mu` and unknown degrees of freedom. - """ - def __init__(self, prior, mu=None, df=None, seed=None): - """ Parameters ---------- prior: abcpy.distributions.Distribution - Prior distribution + Prior distribution mu: float, optional Mean of the Stundent_t distribution. If the parameters is omitted, sampled from the prior. @@ -181,8 +178,10 @@ def __init__(self, prior, mu=None, df=None, seed=None): The degrees of freedom of the Student_t distribution. If the parameters is omitted, sampled from the prior. seed: int, optional - Initial seed. The default value is generated randomly. - """ + Initial seed. The default value is generated randomly. + """ + + def __init__(self, prior, mu=None, df=None, seed=None): assert(not (mu == None) != (df == None)) self.prior = prior if mu == None and df == None: @@ -224,19 +223,20 @@ class MixtureNormal(Model): """This class implements the Mixture of multivariate normal ditribution with unknown mean \ :math:`\mu` described as following, :math:`x|\mu \sim 0.5\mathcal{N}(\mu,I_p)+0.5\mathcal{N}(\mu,0.01I_p)`, where :math:`x=(x_1,x_2,\ldots,x_p)` is the - dataset simulated from the model and mean is :math:`\mu=(\mu_1,\mu_2,\ldots,\mu_p)`. """ + dataset simulated from the model and mean is :math:`\mu=(\mu_1,\mu_2,\ldots,\mu_p)`. + + Parameters + ---------- + prior: abcpy.distributions.Distribution + Prior distribution + mu: numpy.ndarray or list, optional + Mean of the mixture normal. If the parameter is omitted, sampled + from the prior. + seed: int, optional + Initial seed. The default value is generated randomly. + """ def __init__(self, prior, mu, seed = None): - """ - Parameters - ---------- - prior: abcpy.distributions.Distribution - Prior distribution - mu: numpy.ndarray or list, optional - Mean of the mixture normal. If the parameter is omitted, sampled - from the prior. - seed: int, optional - Initial seed. The default value is generated randomly. - """ + # Assign prior if not isinstance(prior, Distribution): raise TypeError('Prior is not of our defined Prior class type') @@ -291,26 +291,25 @@ class StochLorenz95(Model): [2] Lorenz, E. (1995). Predictability: a problem partly solved. In Proceedings of the Seminar on Predictability, volume 1, pages 1–18. European Center on Medium Range - Weather Forecasting, Europe + Weather Forecasting, Europe + + Parameters + ---------- + prior: abcpy.distributions.Distribution + Prior distribution + theta: list or numpy.ndarray, optional + Closure parameters. If the parameter is omitted, sampled + from the prior. + initial_state: numpy.ndarray, optional + Initial state value of the time-series, The default value is None, which assumes a previously computed + value from a full Lorenz model as the Initial value. + n_timestep: int, optional + Number of timesteps between [0,4], where 4 corresponds to 20 days. The default value is 160. + seed: int, optional + Initial seed. The default value is generated randomly. """ def __init__(self, prior, theta, initial_state = None, n_timestep = 160, seed = None): - """ - Parameters - ---------- - prior: abcpy.distributions.Distribution - Prior distribution - theta: list or numpy.ndarray, optional - Closure parameters. If the parameter is omitted, sampled - from the prior. - initial_state: numpy.ndarray, optional - Initial state value of the time-series, The default value is None, which assumes a previously computed - value from a full Lorenz model as the Initial value. - n_timestep: int, optional - Number of timesteps between [0,4], where 4 corresponds to 20 days. The default value is 160. - seed: int, optional - Initial seed. The default value is generated randomly. - """ # Assign prior if not isinstance(prior, Distribution): @@ -464,29 +463,29 @@ def _rk4ode(self, ode, timespan, timeseries_initial, parameter): timeseries[:,ind+1] = timeseries_initial # Return the solved timeseries at the values in timespan return timeseries + class Ricker(Model): """Ecological model that describes the observed size of animal population over time described in [1]. [1] S. N. Wood. Statistical inference for noisy nonlinear ecological dynamic systems. Nature, 466(7310):1102–1104, Aug. 2010. + + Parameters + ---------- + prior: abcpy.distributions.Distribution + Prior distribution + theta: list or numpy.ndarray, optional + The parameter is a vector consisting of three numbers \ + :math:`\log r` (real number), :math:`\sigma` (positive real number, > 0), :math:`\phi` (positive real number > 0) + If the parameter is ommitted, sampled from the prior. + n_timestep: int, optional + Number of timesteps. The default value is 100. + seed: int, optional + Initial seed. The default value is generated randomly. """ def __init__(self, prior, theta=None, n_timestep = 100, seed = None): - """ - Parameters - ---------- - prior: abcpy.distributions.Distribution - Prior distribution - theta: list or numpy.ndarray, optional - The parameter is a vector consisting of three numbers \ - :math:`\log r` (real number), :math:`\sigma` (positive real number, > 0), :math:`\phi` (positive real number > 0) - If the parameter is ommitted, sampled from the prior. - n_timestep: int, optional - Number of timesteps. The default value is 100. - seed: int, optional - Initial seed. The default value is generated randomly. - """ # Assign prior if not isinstance(prior, Distribution): raise TypeError('Prior is not of our defined Prior class type') diff --git a/abcpy/modelselections.py b/abcpy/modelselections.py index df0e1346..5fc61e9e 100644 --- a/abcpy/modelselections.py +++ b/abcpy/modelselections.py @@ -38,7 +38,12 @@ def __init__(self, model_array, statistics_calc, backend, seed = None): self.reference_table_calculated = None - raise NotImplemented + raise NotImplementedError + + def __getstate__(self): + state = self.__dict__.copy() + del state['backend'] + return state @abstractmethod @@ -65,7 +70,7 @@ def select_model(self, observations, n_samples = 1000, n_samples_per_param = 100 """ - raise NotImplemented + raise NotImplementedError @abstractmethod def posterior_probability(self, observations): @@ -83,7 +88,7 @@ def posterior_probability(self, observations): A vector containing the approximate posterior probability of the model chosen. """ - raise NotImplemented + raise NotImplementedError class RandomForest(ModelSelections): """ @@ -109,11 +114,16 @@ def __init__(self, model_array, statistics_calc, backend, N_tree = 100, n_try_fr self.statistics_calc = statistics_calc self.backend = backend self.rng = np.random.RandomState(seed) - self.seed = seed + self.seed = seed + + self.n_samples_per_param = None + self.reference_table_calculated = 0 self.N_tree = N_tree self.n_try_fraction = n_try_fraction + self.observations_bds = None + def select_model(self, observations, n_samples = 1000, n_samples_per_param = 1): """ Parameters @@ -130,15 +140,16 @@ def select_model(self, observations, n_samples = 1000, n_samples_per_param = 1): A model which are of type abcpy.models.Model """ + self.n_samples_per_param = n_samples_per_param + self.observations_bds = self.backend.broadcast(observations) + # Creation of reference table if self.reference_table_calculated is 0: - rc = _RemoteContextForReferenceTable(self.backend, self.model_array, self.statistics_calc, observations, n_samples_per_param) - - # Simulating the data, distance and statistics + # Simulating the data, distance and statistics seed_arr = self.rng.randint(1, n_samples*n_samples, size=n_samples, dtype=np.int32) seed_pds = self.backend.parallelize(seed_arr) - model_data_pds = self.backend.map(rc._simulate_model_data, seed_pds) + model_data_pds = self.backend.map(self._simulate_model_data, seed_pds) model_data = self.backend.collect(model_data_pds) models, data, statistics = [list(t) for t in zip(*model_data)] self.reference_table_models = models @@ -160,6 +171,36 @@ def select_model(self, observations, n_samples = 1000, n_samples_per_param = 1): return(self.model_array[int(classifier.predict(self.statistics_calc.statistics(observations)))]) + def _simulate_model_data(self, seed): + """ + Samples a single model parameter and simulates from it until + distance between simulated outcome and the observation is + smaller than eplison. + + Parameters + ---------- + seed: int + value of a seed to be used for reseeding + Returns + ------- + np.array + accepted parameter + """ + # reseed random number genearator + rng = np.random.RandomState(seed) + len_model_array = len(self.model_array) + model = self.model_array[int(sum(np.linspace(0, len_model_array - 1, len_model_array) \ + * rng.multinomial(1, (1 / len_model_array) * np.ones(len_model_array))))] + # reseed prior + model.prior.reseed(seed) + # sample from prior + model.sample_from_prior() + # sample data set, extract statistics and compute distance from y_obs + y_sim = model.simulate(self.n_samples_per_param) + statistics = self.statistics_calc.statistics(y_sim) + + return (model, y_sim, statistics) + def posterior_probability(self, observations, n_samples = 1000, n_samples_per_param = 1): """ Parameters @@ -175,16 +216,16 @@ def posterior_probability(self, observations, n_samples = 1000, n_samples_per_pa abcpy.models.Model A model which are of type abcpy.models.Model - """ + """ + self.n_samples_per_param = 1 + self.observations_bds = self.backend.broadcast(observations) # Creation of reference table if self.reference_table_calculated is 0: - rc = _RemoteContextForReferenceTable(self.backend, self.model_array, self.statistics_calc, observations, n_samples_per_param) - - # Simulating the data, distance and statistics + # Simulating the data, distance and statistics seed_arr = self.rng.randint(1, n_samples*n_samples, size=n_samples, dtype=np.int32) seed_pds = self.backend.parallelize(seed_arr) - model_data_pds = self.backend.map(rc._simulate_model_data, seed_pds) + model_data_pds = self.backend.map(self._simulate_model_data, seed_pds) model_data = self.backend.collect(model_data_pds) models, data, statistics = [list(t) for t in zip(*model_data)] self.reference_table_models = models @@ -214,47 +255,4 @@ def posterior_probability(self, observations, n_samples = 1000, n_samples_per_pa regressor.fit(self.reference_table_statistics,pred_error) return(1-regressor.predict(self.statistics_calc.statistics(observations))) - -class _RemoteContextForReferenceTable: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model_array, stat_calc, observations, n_samples_per_param): - self.model_array = model_array - self.stat_calc = stat_calc - self.n_samples_per_param = n_samples_per_param - - # these are usually big tables, so we broadcast them to have them once - # per executor instead of once per task - self.observations_bds = backend.broadcast(observations) - - def _simulate_model_data(self, seed): - """ - Samples a single model parameter and simulates from it until - distance between simulated outcome and the observation is - smaller than eplison. - - Parameters - ---------- - seed: int - value of a seed to be used for reseeding - Returns - ------- - np.array - accepted parameter - """ - #reseed random number genearator - rng = np.random.RandomState(seed) - len_model_array = len(self.model_array) - model = self.model_array[int(sum(np.linspace(0,len_model_array-1,len_model_array)\ - *rng.multinomial(1,(1/len_model_array)*np.ones(len_model_array))))] - #reseed prior - model.prior.reseed(seed) - #sample from prior - model.sample_from_prior() - #sample data set, extract statistics and compute distance from y_obs - y_sim = model.simulate(self.n_samples_per_param) - statistics = self.stat_calc.statistics(y_sim) - - return (model, y_sim, statistics) \ No newline at end of file + diff --git a/abcpy/statistics.py b/abcpy/statistics.py index 97b80592..d2e65c38 100644 --- a/abcpy/statistics.py +++ b/abcpy/statistics.py @@ -27,7 +27,7 @@ def __init__(self, degree = 2, cross = True): Defines whether to include the cross-product terms. The default value is TRUE, meaning the cross product term is included. """ - raise NotImplemented + raise NotImplementedError @abstractmethod @@ -47,7 +47,7 @@ def statistics(self, data): """ - raise NotImplemented + raise NotImplementedError def _polynomial_expansion(self, summary_statistics): """Helper function that does the polynomial expansion and includes cross-product diff --git a/abcpy/summaryselections.py b/abcpy/summaryselections.py index 741ea891..12f04ae5 100644 --- a/abcpy/summaryselections.py +++ b/abcpy/summaryselections.py @@ -28,12 +28,17 @@ def __init__(self, model, statistics_calc, backend, n_samples = 1000, seed = Non seed: integer, optional Optional initial seed for the random number generator. The default value is generated randomly. """ - raise NotImplemented + raise NotImplementedError + + def __getstate__(self): + state = self.__dict__.copy() + del state['backend'] + return state @abstractmethod def transformation(self, statistics): - raise NotImplemented + raise NotImplementedError class Semiautomatic(Summaryselections): @@ -49,14 +54,12 @@ def __init__(self, model, statistics_calc, backend, n_samples = 1000, seed = Non self.backend = backend self.rng = np.random.RandomState(seed) self.model.prior.reseed(self.rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32)) - - rc = _RemoteContextSemiautomatic(self.backend, self.model, self.statistics_calc) - + # main algorithm seed_arr = self.rng.randint(1, n_samples*n_samples, size=n_samples, dtype=np.int32) seed_pds = self.backend.parallelize(seed_arr) - sample_parameters_statistics_pds = self.backend.map(rc._sample_parameter_statistics, seed_pds) + sample_parameters_statistics_pds = self.backend.map(self._sample_parameter_statistics, seed_pds) sample_parameters_and_statistics = self.backend.collect(sample_parameters_statistics_pds) sample_parameters, sample_statistics = [list(t) for t in zip(*sample_parameters_and_statistics)] sample_parameters = np.array(sample_parameters) @@ -66,32 +69,21 @@ def __init__(self, model, statistics_calc, backend, n_samples = 1000, seed = Non regr = linear_model.LinearRegression(fit_intercept=True) for ind in range(sample_parameters.shape[1]): regr.fit(sample_statistics, sample_parameters[:,ind]) - self.coefficients_learnt[:,ind] = regr.coef_ + self.coefficients_learnt[ind,:] = regr.coef_ def transformation(self, statistics): - if not statistics.shape[1] == self.coefficients_learnt.shape[0]: + if not statistics.shape[1] == self.coefficients_learnt.shape[1]: raise ValueError('Mismatch in dimension of summary statistics') - return np.dot(statistics,self.coefficients_learnt) - - -class _RemoteContextSemiautomatic: - """ - Contains everything that is sent over the network like broadcast vars and map functions - """ - - def __init__(self, backend, model, stat_calc): - self.model = model - self.stat_calc = stat_calc + return np.dot(statistics,np.transpose(self.coefficients_learnt)) - def _sample_parameter_statistics(self, seed): """ Samples a single model parameter and simulates from it until distance between simulated outcome and the observation is smaller than eplison. - + Parameters - ---------- + ---------- seed: int value of a seed to be used for reseeding Returns @@ -101,8 +93,11 @@ def _sample_parameter_statistics(self, seed): """ self.model.prior.reseed(seed) self.model.sample_from_prior() + + return (self.model.get_parameters(), self.statistics_calc.statistics(self.model.simulate(1))) + + + - return (self.model.get_parameters(), self.stat_calc.statistics(self.model.simulate(1))) - \ No newline at end of file diff --git a/tests/inferences_tests.py b/tests/inferences_tests.py index 4c3d7047..d544b081 100644 --- a/tests/inferences_tests.py +++ b/tests/inferences_tests.py @@ -13,7 +13,7 @@ from abcpy.statistics import Identity -from abcpy.inferences import RejectionABC, PMC, PMCABC, _RemoteContextPMCABC, SABC, ABCsubsim, SMCABC, APMCABC, RSMCABC +from abcpy.inferences import RejectionABC, PMC, PMCABC, SABC, ABCsubsim, SMCABC, APMCABC, RSMCABC class RejectionABCTest(unittest.TestCase): def test_sample(self): @@ -115,8 +115,6 @@ def test_sample(self): self.assertLess(abs(mu_post_mean - (-1.4033145848) ), 1e-10) self.assertLess(abs(sigma_post_mean - 7.05175546876), 1e-10) - - class PMCABCTests(unittest.TestCase): def setUp(self): @@ -147,7 +145,7 @@ def setUp(self): def test_calculate_weight(self): n_samples = 2 - rc = _RemoteContextPMCABC(self.backend, self.model, self.dist_calc, self.kernel, self.observation, n_samples, 1) + rc = PMCABC(self.model, self.dist_calc, self.kernel, self.backend, 1) theta = np.array([1.0]) @@ -157,7 +155,7 @@ def test_calculate_weight(self): accepted_parameters = np.array([[1.0], [1.0 + np.sqrt(2)]]) accepted_weights = np.array([[.5], [.5]]) accepted_cov_mat = np.array([[1.0]]) - rc._update_broadcasts(self.backend, accepted_parameters, accepted_weights, accepted_cov_mat) + rc._update_broadcasts(accepted_parameters, accepted_weights, accepted_cov_mat) weight = rc._calculate_weight(theta) expected_weight = (2.0 * np.sqrt(2.0 * np.pi)) /(( 1 + np.exp(-1))*100) self.assertEqual(weight, expected_weight) @@ -325,7 +323,8 @@ def test_sample(self): self.assertEqual(sigma_sample_shape, (10,)) self.assertEqual(weights_sample_shape, (10,)) self.assertLess(mu_post_mean - (-2.98633946126), 10e-2) - self.assertLess(sigma_post_mean - 6.40146881524, 10e-2) + self.assertLess(sigma_post_mean - 6.40146881524, 10e-2) + class SMCABCTests(unittest.TestCase): def setUp(self): diff --git a/tests/pickle_tests.py b/tests/pickle_tests.py new file mode 100644 index 00000000..ecbb39c8 --- /dev/null +++ b/tests/pickle_tests.py @@ -0,0 +1,34 @@ +import unittest +import cloudpickle +import numpy as np +import pickle + +'''We use pickle in our MPI backend to send a method from the master to the workers. The object with which this method is associated cotains the backend as an attribute, while the backend itself contains the data on which the workers should work. Pickling the method results in pickling the backend, which results in the whole data being pickled and sent, which is undersirable. + +In pickle, the "getstate" method can be specified. When an object cotaining a "getstate" method is pickled, only the attributes specified within that method are pickled. + +This test checks whether everything is working correctly with cloudpickle. +''' + +class ToBePickled: + def __init__(self): + self.included = 5 + self.notIncluded = np.zeros(10**5) + + def __getstate__(self): + """Method that tells cloudpickle which attributes should be pickled + Returns + ------- + state + all the attributes that should be pickled + """ + state = self.__dict__.copy() + del state['notIncluded'] + return state + +class PickleTests(unittest.TestCase): + def test_exclusion(self): + """Tests whether after pickling and unpickling the object, the attribute which should not be included exists""" + pickled_object = cloudpickle.dumps(ToBePickled(), pickle.HIGHEST_PROTOCOL) + unpickled_object = cloudpickle.loads(pickled_object) + assert(not(hasattr(pickled_object,'notIncluded')))