Source code for heron.training

"""

These are functions designed to be used for training a Gaussian
process made using heron.


"""

import scipy.optimize
from scipy.optimize import minimize
import emcee
import numpy as np
from functools import partial


[docs]def ln_likelihood(p, gp):
    """
    Return the log-likelihood of the Gaussian process, which can be used
    to learn the hyperparameters of the GP.

    Parameters
    ----------
    p : array-like
       An array of the hyperparameters at which the model is to be evaluated.
    gp : heron `Regressor` object
       The Gaussian process to be evaluated.

    Returns
    -------
    ln_likelihood : float
       The log-likelihood for the Gaussian process.

    Notes
    -----
    * TODO Add the ability to specify the priors on each hyperparameter.
    """
    #return gp.loghyperpriors(p) +
    return gp._lnlikelihood(p)
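# Illustrative sketch, not part of heron: one way ln_likelihood might be
# evaluated directly, assuming `gp` is a heron Regressor exposing
# get_hyperparameters(). The helper name below is hypothetical.
def _example_ln_likelihood(gp):
    """Evaluate the log-likelihood at the GP's current hyperparameters."""
    p = gp.get_hyperparameters()
    return ln_likelihood(p, gp)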
# Nested sampling
import nestle
[docs]def prior_transform(x):
    # Identity transform: the unit hypercube is mapped directly onto the
    # hyperparameter space.
    return x
[docs]def run_nested(gp, metric="loglikelihood", **kwargs):
    # Note: the dimensionality of the problem is currently hard-coded to 2.
    result = nestle.sample(gp.neg_ln_likelihood, prior_transform, 2, method='multi')
    #gp.set_hyperparameters(MAP.x)
    #gp.update()
    return result
# MCMC Stuff using emcee
try:
    from IPython.core.display import clear_output
    ipython_available = True
except ImportError:
    ipython_available = False
[docs]def run_sampler(sampler, initial, iterations):
    """
    Run the MCMC sampler for some number of iterations, displaying a
    progress bar so that you can keep track of what's going on.
    """
    #sampler.run_mcmc(initial, 1)
    #for iteration in xrange(iterations/10-1):
    #    sampler.run_mcmc(None, 10)
    import progressbar
    with progressbar.ProgressBar(max_value=iterations) as bar:
        for iteration, sample in enumerate(sampler.sample(initial, iterations=iterations)):
            position = sample[0]
            # Append the current walker positions to the chain file;
            # np.save requires the file to be opened in binary mode.
            with open("chain.dat", "ab") as f:
                np.save(f, position)
                #for k in range(position.shape[0]):
                #    f.write("{0:4d} {1:s}\n".format(str(k), " ".join(position[k])))
            if ipython_available:
                clear_output()
            bar.update(iteration)
    return sampler
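# Illustrative sketch, not part of heron: driving run_sampler with a toy
# emcee ensemble sampler and a standard-normal log-probability. The helper
# name and the sampler settings are arbitrary choices; note that run_sampler
# appends the walker positions to "chain.dat" as it goes.
def _example_run_sampler():
    """Sample a 2-D standard normal for 100 iterations with a progress bar."""
    ndim, nwalkers = 2, 16

    def log_prob(x):
        return -0.5 * np.sum(x ** 2)

    p0 = np.random.randn(nwalkers, ndim)
    sampler = emcee.EnsembleSampler(nwalkers, ndim, log_prob)
    return run_sampler(sampler, p0, 100)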
[docs]def run_training_map(gp, metric="loglikelihood", repeats=20, **kwargs):
    """
    Find the maximum a posteriori training values for the Gaussian process.

    Parameters
    ----------
    gp : heron.GaussianProcess
       The Gaussian process object.
    metric : {"loglikelihood", "cv"}
       The metric to be used for the optimisation. Defaults to log
       likelihood (loglikelihood), which is the more traditionally
       Bayesian approach, but cross-validation (cv) is also available.
    repeats : int, optional
       The number of times that the optimisation should be repeated in
       order to reduce the risk of the optimiser settling on a local
       rather than the global maximum of the log-likelihood.

    Notes
    -----
    The current implementation has no way of specifying the optimisation
    algorithm.

    * TODO Add an option to change the optimisation algorithm.
    """
    if metric == "loglikelihood":
        minfunc = gp.neg_ln_likelihood
    elif metric == "cv":
        minfunc = gp.neg_cross_validation

    minima, locs = [], []
    if "basinhopping" not in kwargs:
        # Repeat a local optimisation from the current hyperparameters and
        # keep the best of the minima which are found.
        for run in range(repeats):
            MAP = minimize(minfunc, gp.get_hyperparameters())
            minima.append(MAP.fun)
            locs.append(MAP.x)
        gp.set_hyperparameters(locs[np.argmin(minima)])
    else:
        # Use basin hopping to find the global extremum.
        # First remove the basin-hopping keyword, then run the optimisation.
        del kwargs['basinhopping']
        MAP = scipy.optimize.basinhopping(minfunc, gp.gp.get_parameter_vector(),
                                          niter=repeats, **kwargs)
        gp.set_hyperparameters(MAP.x)
    gp.update()
    return MAP
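# Illustrative sketch, not part of heron: two possible ways of calling
# run_training_map. Construction of the Regressor is assumed to have
# happened elsewhere; the helper name and the repeat counts are arbitrary.
def _example_map_training(gp):
    """Repeated local optimisation, followed by a basin-hopping search."""
    # Plain repeated local optimisation of the negative log-likelihood.
    local_result = run_training_map(gp, metric="loglikelihood", repeats=10)
    # Passing basinhopping=True switches to scipy.optimize.basinhopping;
    # any further keyword arguments are forwarded to it.
    global_result = run_training_map(gp, repeats=10, basinhopping=True)
    return local_result, global_result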
[docs]def run_training_nested(gp, method="multi", maxiter=None, npoints=1000):
    """
    Train the Gaussian process model using nested sampling.

    Parameters
    ----------
    gp : heron.Regressor
       The Gaussian process object.
    method : {"single", "multi"}
       The nested sampling method to be used.
    maxiter : int, optional
       The maximum number of iterations which should be carried out on
       the marginal likelihood.
    npoints : int
       The number of live points to use in the optimisation.
    """
    if not maxiter:
        maxiter = 10000000
    ndim = len(gp.gp.get_parameter_vector())
    prior_transform = gp.hyperpriortransform
    nest = nestle.sample(gp.neg_ln_likelihood, prior_transform, ndim,
                         method=method, callback=nestle.print_progress,
                         maxiter=maxiter, npoints=npoints)  # decline_factor=0.5
    nest_max = np.argmax(nest['logl'])
    gp.set_hyperparameters(nest['samples'][nest_max])
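# Illustrative sketch, not part of heron: a possible nested-sampling training
# call. It relies on the gp.hyperpriortransform attribute used above; the
# helper name and the iteration caps are arbitrary.
def _example_nested_training(gp):
    """Train with multi-ellipsoid nested sampling, capped at 5000 iterations."""
    run_training_nested(gp, method="multi", maxiter=5000, npoints=500)
    return gp.get_hyperparameters()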
[docs]def run_training_mcmc(gp, walkers=200, burn=500, samples=1000,
                      metric="loglikelihood", samplertype="ensemble"):
    """
    Train a Gaussian process using an MCMC process to find the maximum evidence.

    Parameters
    ----------
    gp : heron.Regressor
       The Gaussian process object.
    walkers : int
       The number of MCMC walkers.
    burn : int
       The number of samples to be used to evaluate the burn-in for the MCMC.
    samples : int
       The number of samples to be used for the production sampling.
    metric : {"loglikelihood", "cv"}
       The metric to be used to train the MCMC. Defaults to log likelihood
       (loglikelihood), which is the more traditionally Bayesian approach,
       but cross-validation (cv) is also available.
    samplertype : str, {"ensemble", "pt"}
       The sampler to be used on the model.

    Returns
    -------
    gp : Gaussian process object
       The underlying Gaussian process (``gp.gp``), with its parameter
       vector set to the highest-probability sample.
    probs : array
       The log probabilities.
    samples : array
       The array of samples from the sampling chains.

    Notes
    -----
    At present the algorithm assigns the median of the samples to the
    value of the kernel vector; this may not ultimately be the best way
    to do this, and so it should be possible to specify the desired value
    to be used from the distribution.

    * TODO Add ability to change median to other statistics for training
    """
    start = gp.gp.get_parameter_vector()
    ndim, nwalkers, ntemps = len(start), walkers, 20

    if metric == "loglikelihood":
        minfunc = ln_likelihood
    elif metric == "cv":
        minfunc = cross_validation

    if samplertype == "ensemble":
        p0 = [start for i in range(nwalkers)]
        p0 = np.random.uniform(low=-1.0, high=1.0, size=(nwalkers, ndim))
        sampler = emcee.EnsembleSampler(nwalkers, ndim, minfunc,
                                        args=[gp], threads=4)
    elif samplertype == "pt":
        p0 = np.random.uniform(low=-1.0, high=1.0, size=(ntemps, nwalkers, ndim))
        sampler = emcee.PTSampler(ntemps, nwalkers, ndim, minfunc, logp,
                                  loglargs=[gp], threads=4)

    burn = run_sampler(sampler, p0, burn)
    sampler.reset()
    sampler = run_sampler(sampler, p0, samples)

    probs = sampler.lnprobability[:, :].reshape((-1))
    samples = sampler.chain[:, :, :].reshape((-1, ndim))
    gp.gp.set_parameter_vector(samples[np.argmax(probs)])

    return gp.gp, probs, samples
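# Illustrative sketch, not part of heron: a possible ensemble-sampler
# training run. The walker, burn-in, and sample counts are arbitrary, and
# the helper name is hypothetical.
def _example_mcmc_training(gp):
    """Train with the ensemble sampler and return the best sample found."""
    underlying_gp, probs, chain = run_training_mcmc(gp,
                                                    walkers=100,
                                                    burn=200,
                                                    samples=500,
                                                    metric="loglikelihood",
                                                    samplertype="ensemble")
    # run_training_mcmc has already set the GP's parameter vector to the
    # highest-probability sample; return that sample for inspection.
    return chain[np.argmax(probs)]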
[docs]def cross_validation(p, gp):
    """
    Calculate the cross-validation factor between the training set and the
    test set.

    Parameters
    ----------
    p : array
       The hyperparameters at which the Gaussian process kernel should be
       evaluated.
    gp : heron.Regressor
       The Gaussian process object.

    Returns
    -------
    cv : float
       The cross validation of the test data and the model.
    """
    # The previous hyperparameters are stored but not currently restored.
    old_p = gp.get_hyperparameters()
    gp.set_hyperparameters(p)
    prediction = gp.prediction(gp.training_object.test_targets.T)
    # Return the largest residual between the test labels and the prediction.
    return (gp.training_object.test_labels - np.array(prediction[0]).T).max()
[docs]def train_cv(gp):
    cross_validation_f = partial(cross_validation, gp=gp)
    MAP = minimize(cross_validation_f, gp.get_hyperparameters())
    gp.set_hyperparameters(MAP.x)
    #gp.compute(training_x_batch, yerr=1e-6, seed=1234)
    return MAP
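# Illustrative sketch, not part of heron: optimising the hyperparameters
# against the held-out test set with train_cv. The helper name is
# hypothetical.
def _example_cv_training(gp):
    """Minimise the cross-validation metric and return the optimised values."""
    result = train_cv(gp)
    # result is a scipy OptimizeResult; result.x holds the optimised
    # hyperparameters, which have already been set on the GP.
    return result.x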
[docs]def logp(x):
    # Flat prior: zero log-probability inside |x| <= 20, -inf outside.
    if np.any(np.abs(x) > 20):
        return -np.inf
    return 0.0