Source code for pmlearn.base

"""Base classes for all Bayesian models."""

# Authors: Daniel Emaasit <daniel.emaasit@gmail.com>
#          Nicole Carlson <nicole@parsingscience.com>
# License: BSD 3 clause

import joblib
import numpy as np
import matplotlib.pyplot as plt
import pymc3 as pm
import seaborn as sns
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin, DensityMixin

from .exceptions import NotFittedError


class BayesianModel(BaseEstimator):
    """Base class for all Bayesian models in pymc-learn.

    Notes
    -----
    All Bayesian models should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    def __init__(self):
        self.cached_model = None
        self.default_advi_sample_draws = 10000
        self.inference_type = None
        self.num_pred = None
        self.shared_vars = None
        self.summary = None
        self.trace = None
        self.num_training_samples = None

    def create_model(self):
        """Create the model.

        The base implementation always raises; concrete models are expected
        to override it.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError

    def _set_shared_vars(self, shared_vars):
        """Set theano shared variables for the PyMC3 model.

        Parameters
        ----------
        shared_vars : dict
            Maps a key of ``self.shared_vars`` to the new value that is
            passed to that shared variable's ``set_value``.
        """
        for key, value in shared_vars.items():
            self.shared_vars[key].set_value(value)

    def _inference(self, inference_type='advi', inference_args=None):
        """Dispatch to the internal method for the requested inference type.

        Parameters
        ----------
        inference_type : str
            Specifies which inference method to call. Defaults to 'advi'.
            Currently, only 'advi' and 'nuts' are supported.
        inference_args : dict or None
            Arguments to be passed to the inference method. Check the PyMC3
            docs to see what is permitted. Defaults to None, which is
            treated as "no extra arguments".

        Raises
        ------
        NotFittedError
            If ``inference_type`` is not supported.
        """
        if inference_type == 'advi':
            self._advi_inference(inference_args)
        elif inference_type == 'nuts':
            self._nuts_inference(inference_args)
        else:
            raise NotFittedError('{} is not a supported type'
                                 ' of inference'.format(inference_type))

    def _advi_inference(self, inference_args):
        """Run variational ADVI and then sample from those results.

        Stores the approximation, the sampled trace, its summary and the
        ADVI loss history on ``self``.

        Parameters
        ----------
        inference_args : dict or None
            Arguments to be passed to the PyMC3 fit method. See PyMC3 doc
            for permissible values. None is treated as an empty dict.
        """
        # ``**None`` raises TypeError, and None is the documented default of
        # ``_inference``; fall back to "no extra arguments".
        inference_args = inference_args or {}

        with self.cached_model:
            inference = pm.ADVI()
            approx = pm.fit(method=inference, **inference_args)

            self.approx = approx
            self.trace = approx.sample(draws=self.default_advi_sample_draws)
            self.summary = pm.summary(self.trace)
            self.advi_hist = inference.hist

    def _nuts_inference(self, inference_args):
        """Run NUTS inference.

        Stores the sampled trace and its summary on ``self``.

        Parameters
        ----------
        inference_args : dict or None
            Arguments passed to the PyMC3 sample method. See PyMC3 doc for
            permissible values. None is treated as an empty dict.
        """
        # Same None guard as in ``_advi_inference``.
        inference_args = inference_args or {}

        with self.cached_model:
            step = pm.NUTS()
            nuts_trace = pm.sample(step=step, **inference_args)

            self.trace = nuts_trace
            self.summary = pm.summary(self.trace)

    def _set_default_inference_args(self):
        """Return default inference arguments for ``self.inference_type``.

        ADVI
        ----
        callbacks : list containing a parameter stopping check.
        n : number of iterations for ADVI fit, defaults to 200000

        NUTS
        ----
        draws : the number of samples to draw, defaults to 2000

        Returns
        -------
        dict or None
            The defaults for 'advi' or 'nuts'; None for any other
            inference type.
        """
        if self.inference_type == 'advi':
            inference_args = {
                'n': 200000,
                'callbacks': [pm.callbacks.CheckParametersConvergence()]
            }
        elif self.inference_type == 'nuts':
            inference_args = {
                'draws': 2000
            }
        else:
            inference_args = None

        return inference_args

    def save(self, file_prefix, custom_params=None):
        """Save the trace and custom params to files with the given prefix.

        Parameters
        ----------
        file_prefix : str
            Path and prefix used to identify where to save the trace for
            this model.
            Ex: given file_prefix = "path/to/file/"
            This will attempt to save to "path/to/file/trace.pickle"
        custom_params : dict or None
            Custom parameters to save to ``file_prefix + 'params.pickle'``.
            Nothing is written for None or an empty dict. Defaults to None.
        """
        # Context managers guarantee the handles are closed even if the
        # dump raises part-way through.
        with open(file_prefix + 'trace.pickle', 'wb') as file_object:
            joblib.dump(self.trace, file_object)

        if custom_params:
            with open(file_prefix + 'params.pickle', 'wb') as file_object:
                joblib.dump(custom_params, file_object)

    def load(self, file_prefix, load_custom_params=False):
        """Load a saved trace, and optionally the custom params file.

        Parameters
        ----------
        file_prefix : str
            Path and prefix used to identify where to load the saved trace
            for this model.
            Ex: given file_prefix = "path/to/file/"
            This will attempt to load "path/to/file/trace.pickle"
        load_custom_params : bool
            Whether custom parameters should be loaded from
            ``file_prefix + 'params.pickle'``. Defaults to False.

        Returns
        -------
        custom_params : dict or None
            The loaded custom parameters, or None when
            ``load_custom_params`` is False.
        """
        # SECURITY NOTE: joblib.load unpickles its input, which can execute
        # arbitrary code. Only load files that come from a trusted source.
        self.trace = joblib.load(file_prefix + 'trace.pickle')

        custom_params = None
        if load_custom_params:
            custom_params = joblib.load(file_prefix + 'params.pickle')

        return custom_params

    def plot_elbo(self):
        """Plot the ELBO values after running ADVI minibatch.

        Raises
        ------
        NotFittedError
            If the model's inference type is not 'advi'.
        """
        if self.inference_type != 'advi':
            raise NotFittedError(
                'This method should only be called after calling fit with '
                'ADVI minibatch.'
            )

        sns.set_style("white")
        # The stored ADVI history is negated before it is plotted as ELBO.
        plt.plot(-self.advi_hist)
        plt.ylabel('ELBO')
        plt.xlabel('iteration')
        sns.despine()
class BayesianRegressorMixin(RegressorMixin):
    """Mixin for regression models in pmlearn."""

    def fit(self, X, y, inference_type='advi', minibatch_size=None,
            inference_args=None):
        """Train the regression model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        y : numpy array, shape [n_samples, ]

        inference_type : str
            Specifies which inference method to call. Defaults to 'advi'.
            Currently, only 'advi' and 'nuts' are supported.

        minibatch_size : int or None
            Number of samples to include in each minibatch for ADVI.
            Defaults to None, so minibatch is not run by default.

        inference_args : dict or None
            Arguments to be passed to the inference methods. Check the
            PyMC3 docs for permissible values. If no arguments are
            specified, default values will be set. The dict passed in is
            not modified.

        Returns
        -------
        self
        """
        self.num_training_samples, self.num_pred = X.shape

        self.inference_type = inference_type

        if y.ndim != 1:
            y = np.squeeze(y)

        if inference_args:
            # Work on a copy: the minibatch branch below adds a key, and the
            # caller's dict must not be modified as a side effect.
            inference_args = dict(inference_args)
        else:
            inference_args = self._set_default_inference_args()
        if inference_args is None:
            # No defaults exist for this inference type; keep a dict so the
            # unsupported-type error is raised by ``_inference`` instead of
            # a TypeError on item assignment below.
            inference_args = {}

        if self.cached_model is None:
            self.cached_model = self.create_model()

        if minibatch_size:
            with self.cached_model:
                minibatches = {
                    self.shared_vars['model_input']: pm.Minibatch(
                        X, batch_size=minibatch_size),
                    self.shared_vars['model_output']: pm.Minibatch(
                        y, batch_size=minibatch_size),
                }

            inference_args['more_replacements'] = minibatches
        else:
            self._set_shared_vars({'model_input': X, 'model_output': y})

        self._inference(inference_type, inference_args)

        return self

    def predict(self, X, return_std=False):
        """Predict values of new data with a trained regression model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        return_std : bool
            Whether to return standard deviations with mean values.
            Defaults to False.

        Returns
        -------
        numpy array, or tuple of two numpy arrays
            The mean of the posterior predictive samples of ``'y'`` over the
            sample axis; with ``return_std=True`` a ``(mean, std)`` tuple.

        Raises
        ------
        NotFittedError
            If no trace is available (``fit`` or ``load`` was not run).
        """
        if self.trace is None:
            raise NotFittedError('Run fit on the model before predict.')

        num_samples = X.shape[0]

        if self.cached_model is None:
            self.cached_model = self.create_model()

        # The output variable is filled with zeros of matching length as a
        # placeholder while sampling predictions for X.
        self._set_shared_vars({'model_input': X,
                               'model_output': np.zeros(num_samples)})

        ppc = pm.sample_ppc(self.trace, model=self.cached_model, samples=2000)

        if return_std:
            return ppc['y'].mean(axis=0), ppc['y'].std(axis=0)
        else:
            return ppc['y'].mean(axis=0)
class BayesianClassifierMixin(ClassifierMixin):
    """Mixin for classification models in pmlearn."""

    def fit(self, X, y, inference_type='advi', minibatch_size=None,
            inference_args=None):
        """Train the classification model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        y : numpy array, shape [n_samples, ]

        inference_type : str
            Specifies which inference method to call. Defaults to 'advi'.
            Currently, only 'advi' and 'nuts' are supported.

        minibatch_size : int or None
            Number of samples to include in each minibatch for ADVI.
            Defaults to None, so minibatch is not run by default.

        inference_args : dict or None
            Arguments to be passed to the inference methods. Check the
            PyMC3 docs for permissible values. If no arguments are
            specified, default values will be set. The dict passed in is
            not modified.

        Returns
        -------
        self
        """
        self.num_training_samples, self.num_pred = X.shape

        self.inference_type = inference_type

        if y.ndim != 1:
            y = np.squeeze(y)

        if inference_args:
            # Work on a copy: the minibatch branch below adds a key, and the
            # caller's dict must not be modified as a side effect.
            inference_args = dict(inference_args)
        else:
            inference_args = self._set_default_inference_args()
        if inference_args is None:
            # No defaults exist for this inference type; keep a dict so the
            # unsupported-type error is raised by ``_inference`` instead of
            # a TypeError on item assignment below.
            inference_args = {}

        if self.cached_model is None:
            self.cached_model = self.create_model()

        if minibatch_size:
            with self.cached_model:
                minibatches = {
                    self.shared_vars['model_input']: pm.Minibatch(
                        X, batch_size=minibatch_size),
                    self.shared_vars['model_output']: pm.Minibatch(
                        y, batch_size=minibatch_size),
                }

            inference_args['more_replacements'] = minibatches
        else:
            self._set_shared_vars({'model_input': X, 'model_output': y})

        self._inference(inference_type, inference_args)

        return self

    def predict_proba(self, X, return_std=False):
        """Predict probabilities of new data with a trained model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        return_std : bool
            Whether to return standard deviations with mean values.
            Defaults to False.

        Returns
        -------
        numpy array, or tuple of two numpy arrays
            The mean of the posterior predictive samples of ``'y'`` over the
            sample axis; with ``return_std=True`` a ``(mean, std)`` tuple.

        Raises
        ------
        NotFittedError
            If no trace is available (``fit`` or ``load`` was not run).
        """
        if self.trace is None:
            raise NotFittedError('Run fit on the model before predict.')

        num_samples = X.shape[0]

        if self.cached_model is None:
            self.cached_model = self.create_model()

        # The output variable is filled with zeros of matching length as a
        # placeholder while sampling predictions for X.
        self._set_shared_vars({'model_input': X,
                               'model_output': np.zeros(num_samples)})

        ppc = pm.sample_ppc(self.trace, model=self.cached_model, samples=2000)

        if return_std:
            return ppc['y'].mean(axis=0), ppc['y'].std(axis=0)
        else:
            return ppc['y'].mean(axis=0)

    def predict(self, X):
        """Predict labels of new data with a trained model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        Returns
        -------
        numpy boolean array
            True where the mean returned by ``predict_proba`` exceeds 0.5.
        """
        ppc_mean = self.predict_proba(X)

        pred = ppc_mean > 0.5

        return pred
class BayesianDensityMixin(DensityMixin):
    """Mixin for density models in pmlearn."""

    def fit(self, X, num_components, inference_type='advi',
            minibatch_size=None, inference_args=None):
        """Train the density (mixture) model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        num_components : int
            Number of mixture components; stored on the instance as
            ``self.num_components``.

        inference_type : str
            Specifies which inference method to call. Defaults to 'advi'.
            Currently, only 'advi' and 'nuts' are supported.

        minibatch_size : int or None
            Number of samples to include in each minibatch for ADVI.
            Defaults to None, so minibatch is not run by default.

        inference_args : dict or None
            Arguments to be passed to the inference methods. Check the
            PyMC3 docs for permissible values. If no arguments are
            specified, default values will be set. The dict passed in is
            not modified.

        Returns
        -------
        self
        """
        self.num_components = num_components
        self.num_training_samples, self.num_pred = X.shape

        self.inference_type = inference_type

        if inference_args:
            # Work on a copy: the minibatch branch below adds a key, and the
            # caller's dict must not be modified as a side effect.
            inference_args = dict(inference_args)
        else:
            inference_args = self._set_default_inference_args()
        if inference_args is None:
            # No defaults exist for this inference type; keep a dict so the
            # unsupported-type error is raised by ``_inference`` instead of
            # a TypeError on item assignment below.
            inference_args = {}

        if self.cached_model is None:
            self.cached_model = self.create_model()

        if minibatch_size:
            with self.cached_model:
                minibatches = {
                    self.shared_vars['model_input']: pm.Minibatch(
                        X, batch_size=minibatch_size)
                }

            inference_args['more_replacements'] = minibatches
        else:
            self._set_shared_vars({'model_input': X})

        self._inference(inference_type, inference_args)

        return self

    def predict_proba(self, X, return_std=False):
        """Predict probabilities of new data with a trained mixture model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        return_std : bool
            Whether to return standard deviations with mean probabilities.
            Defaults to False.

        Returns
        -------
        numpy array, or tuple of two numpy arrays
            The mean of the sampled ``'probability'`` values over the sample
            axis; with ``return_std=True`` a ``(mean, std)`` tuple.

        Raises
        ------
        NotFittedError
            If no trace is available (``fit`` or ``load`` was not run).
        """
        if self.trace is None:
            raise NotFittedError('Run fit on the model before predict.')

        if self.cached_model is None:
            self.cached_model = self.create_model()

        self._set_shared_vars({'model_input': X})

        K = self.num_components

        with self.cached_model:
            # Registering a variable under an existing name fails, so the
            # variable created by an earlier call is reused on repeat calls.
            if 'probability' in self.cached_model.named_vars:
                pi = self.cached_model.named_vars['probability']
            else:
                # The concentration vector is sized to K rather than being
                # hard-coded to three entries, so it agrees with ``shape=K``.
                pi = pm.Dirichlet("probability", a=np.ones(K), shape=K)

            # No explicit ``model=`` is passed; sampling relies on the
            # enclosing model context.
            ppc = pm.sample_ppc(self.trace,
                                vars=[pi],
                                samples=2000,
                                size=len(X))

        if return_std:
            return ppc['probability'].mean(axis=0), \
                ppc['probability'].std(axis=0)
        else:
            return ppc['probability'].mean(axis=0)

    def predict(self, X):
        """Return the mean predicted probabilities for new data.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        Returns
        -------
        numpy array
            Exactly what ``predict_proba(X)`` returns; no thresholding or
            label conversion is applied here.
        """
        ppc_mean = self.predict_proba(X)

        return ppc_mean

    def score(self, X, y, cats=None):
        """Score new data with a trained model using accuracy.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        y : numpy array, shape [n_samples, ]

        cats : numpy array, shape [n_samples, ], optional
            Accepted for backward compatibility only and ignored:
            ``predict`` takes no such argument, and forwarding it made
            every call raise TypeError.

        Returns
        -------
        float
            ``accuracy_score`` of ``y`` against the predicted labels.
        """
        from sklearn.metrics import accuracy_score

        predicted = np.asarray(self.predict(X))
        # NOTE(review): ``predict`` returns probabilities rather than
        # labels. This assumes the last axis indexes mixture components and
        # takes the most probable one as the label -- confirm against the
        # concrete models.
        if predicted.ndim > 1:
            predicted = predicted.argmax(axis=-1)

        return accuracy_score(y, predicted)