# Source code for pmlearn.base

"""Base classes for all Bayesian models."""

# Authors: Daniel Emaasit <daniel.emaasit@gmail.com>
#          Nicole Carlson <nicole@parsingscience.com>
# License: BSD 3 clause

import joblib
import numpy as np
import matplotlib.pyplot as plt
import pymc3 as pm
import seaborn as sns
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin, DensityMixin

from .exceptions import NotFittedError


class BayesianModel(BaseEstimator):
    """Base class for all Bayesian models in pymc-learn

    Notes
    -----
    All Bayesian models should specify all the parameters that can be set at
    the class level in their ``__init__`` as explicit keyword arguments
    (no ``*args`` or ``**kwargs``).
    """
    def __init__(self):
        # PyMC3 model; used as the context manager for inference.
        self.cached_model = None
        # Number of draws taken from the ADVI approximation to build a trace.
        self.default_advi_sample_draws = 10000
        # Name of the inference method; read by
        # ``_set_default_inference_args`` and ``plot_elbo``.
        self.inference_type = None
        self.num_pred = None
        # Mapping of name -> shared variable (objects exposing ``set_value``).
        self.shared_vars = None
        # Result of ``pm.summary`` on the trace.
        self.summary = None
        # Samples produced by the last inference run (or by ``load``).
        self.trace = None
        self.num_training_samples = None

    def create_model(self):
        """Create model

        Must be overridden by subclasses; the base implementation always
        raises.

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

    def _set_shared_vars(self, shared_vars):
        """
        Sets theano shared variables for the PyMC3 model.

        Parameters
        ----------
        shared_vars : dict, maps a key of ``self.shared_vars`` to the new
           value to store in that shared variable.
        """
        for key, value in shared_vars.items():
            self.shared_vars[key].set_value(value)

    def _inference(self, inference_type='advi', inference_args=None):
        """
        Calls internal methods for two types of inferences. Raises an error
        if the inference_type is not supported.

        Parameters
        ----------
        inference_type : string, specifies which inference method to call.
           Defaults to 'advi'. Currently, only 'advi' and 'nuts' are supported

        inference_args : dict, arguments to be passed to the inference methods.
           Check the PyMC3 docs to see what is permitted. Defaults to None.

        Raises
        ------
        NotFittedError
            If ``inference_type`` is neither 'advi' nor 'nuts'.
        """
        if inference_type == 'advi':
            self._advi_inference(inference_args)
        elif inference_type == 'nuts':
            self._nuts_inference(inference_args)
        else:
            # The exception type is kept as-is because callers may already
            # catch NotFittedError for this condition.
            raise NotFittedError('{} is not a supported type'
                                 ' of inference'.format(inference_type))

    def _advi_inference(self, inference_args):
        """
        Runs variational ADVI and then samples from those results.

        Stores the approximation in ``self.approx``, the sampled trace in
        ``self.trace``, its summary in ``self.summary`` and the ADVI history
        in ``self.advi_hist``.

        Parameters
        ----------
        inference_args : dict or None, arguments to be passed to the PyMC3
           fit method. See PyMC3 doc for permissible values. ``None`` is
           treated as "no extra arguments".
        """
        # ``**None`` is a TypeError, so fall back to an empty mapping.
        fit_kwargs = inference_args or {}

        with self.cached_model:
            inference = pm.ADVI()
            approx = pm.fit(method=inference, **fit_kwargs)

        self.approx = approx
        self.trace = approx.sample(draws=self.default_advi_sample_draws)
        self.summary = pm.summary(self.trace)
        self.advi_hist = inference.hist

    def _nuts_inference(self, inference_args):
        """
        Runs NUTS inference.

        Stores the sampled trace in ``self.trace`` and its summary in
        ``self.summary``.

        Parameters
        ----------
        inference_args : dict or None, arguments passed to the PyMC3 sample
           method. See PyMC3 doc for permissible values. ``None`` is treated
           as "no extra arguments".
        """
        # ``**None`` is a TypeError, so fall back to an empty mapping.
        sample_kwargs = inference_args or {}

        with self.cached_model:
            step = pm.NUTS()
            nuts_trace = pm.sample(step=step, **sample_kwargs)

        self.trace = nuts_trace
        self.summary = pm.summary(self.trace)

    def _set_default_inference_args(self):
        """
        Set default values for inference arguments if none are provided,
        dependent on inference type.

        ADVI
        -----
        callbacks : list containing a parameter stopping check.

        n : number of iterations for ADVI fit, defaults to 200000

        NUTS
        -----
        draws : the number of samples to draw, defaults to 2000

        Returns
        -------
        inference_args : dict of defaults for ``self.inference_type``, or
           None when the inference type is neither 'advi' nor 'nuts'.
        """
        if self.inference_type == 'advi':
            inference_args = {
                'n': 200000,
                'callbacks': [pm.callbacks.CheckParametersConvergence()]
            }
        elif self.inference_type == 'nuts':
            inference_args = {
                'draws': 2000
            }
        else:
            inference_args = None

        return inference_args

    def save(self, file_prefix, custom_params=None):
        """
        Saves the trace and custom params to files with the given file_prefix.

        Parameters
        ----------
        file_prefix : str, path and prefix used to identify where to save the
        trace for this model.
            Ex: given file_prefix = "path/to/file/"
            This will attempt to save to "path/to/file/trace.pickle"

        custom_params : Dictionary of custom parameters to save.
           Defaults to None. When given and non-empty it is written to
           ``file_prefix + 'params.pickle'``.
        """
        # Context managers make sure the handles are closed even when
        # ``joblib.dump`` raises part-way through.
        with open(file_prefix + 'trace.pickle', 'wb') as trace_file:
            joblib.dump(self.trace, trace_file)

        if custom_params:
            with open(file_prefix + 'params.pickle', 'wb') as params_file:
                joblib.dump(custom_params, params_file)

    def load(self, file_prefix, load_custom_params=False):
        """
        Loads a saved version of the trace, and custom param files with the
        given file_prefix.

        NOTE: ``joblib.load`` is pickle-based; only load files that come from
        a trusted source.

        Parameters
        ----------
        file_prefix : str, path and prefix used to identify where to load the
        saved trace for this model.
            Ex: given file_prefix = "path/to/file/"
            This will attempt to load "path/to/file/trace.pickle"

        load_custom_params : Boolean flag to indicate whether custom parameters
        should be loaded. Defaults to False.

        Returns
        -------
        custom_params : Dictionary of custom parameters, or None when
        ``load_custom_params`` is False.
        """
        self.trace = joblib.load(file_prefix + 'trace.pickle')

        custom_params = None
        if load_custom_params:
            custom_params = joblib.load(file_prefix + 'params.pickle')

        return custom_params

    def plot_elbo(self):
        """
        Plot the ELBO values after running ADVI.

        The stored ADVI history (``self.advi_hist``) is negated before being
        plotted against the iteration index.

        Raises
        ------
        NotFittedError
            If ``self.inference_type`` is not 'advi'.
        """
        if self.inference_type != 'advi':
            raise NotFittedError(
                'This method should only be called after calling fit with '
                'ADVI minibatch.'
            )

        sns.set_style("white")
        plt.plot(-self.advi_hist)
        plt.ylabel('ELBO')
        plt.xlabel('iteration')
        sns.despine()


class BayesianRegressorMixin(RegressorMixin):
    """Mixin for regression models in pmlearn

    Expects the host class to provide ``create_model``, ``_inference``,
    ``_set_shared_vars``, ``_set_default_inference_args`` and the
    ``cached_model`` / ``shared_vars`` / ``trace`` attributes.
    """
    def fit(self, X, y, inference_type='advi', minibatch_size=None,
            inference_args=None):
        """
        Train the regression model

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        y : numpy array, shape [n_samples, ]

        inference_type : string, specifies which inference method to call.
           Defaults to 'advi'. Currently, only 'advi' and 'nuts' are supported

        minibatch_size : number of samples to include in each minibatch for
           ADVI, defaults to None, so minibatch is not run by default

        inference_args : dict, arguments to be passed to the inference methods.
           Check the PyMC3 docs for permissible values. If no arguments are
           specified, default values will be set. The dict passed in is not
           modified.

        Returns
        -------
        self
        """
        self.num_training_samples, self.num_pred = X.shape

        self.inference_type = inference_type

        if y.ndim != 1:
            y = np.squeeze(y)

        if inference_args:
            # Shallow copy: 'more_replacements' may be added below and must
            # not leak into the caller's dict.
            inference_args = dict(inference_args)
        else:
            # The defaults are None for an unsupported inference type; use an
            # empty dict so that case reaches ``_inference`` and gets its
            # dedicated error instead of failing on item assignment below.
            inference_args = self._set_default_inference_args() or {}

        if self.cached_model is None:
            self.cached_model = self.create_model()

        if minibatch_size:
            with self.cached_model:
                minibatches = {
                    self.shared_vars['model_input']: pm.Minibatch(
                        X, batch_size=minibatch_size),
                    self.shared_vars['model_output']: pm.Minibatch(
                        y, batch_size=minibatch_size),
                }

                inference_args['more_replacements'] = minibatches
        else:
            self._set_shared_vars({'model_input': X, 'model_output': y})

        self._inference(inference_type, inference_args)

        return self

    def predict(self, X, return_std=False):
        """
        Predicts values of new data with a trained regression model

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        return_std : Boolean flag
           Boolean flag of whether to return standard deviations with mean
           values. Defaults to False.

        Returns
        -------
        The mean over posterior predictive samples of ``'y'``; when
        ``return_std`` is True, a ``(mean, std)`` tuple instead.

        Raises
        ------
        NotFittedError
            If no trace is available (``fit`` or ``load`` was not run).
        """

        if self.trace is None:
            raise NotFittedError('Run fit on the model before predict.')

        num_samples = X.shape[0]

        if self.cached_model is None:
            self.cached_model = self.create_model()

        # The output variable only needs a placeholder with one entry per
        # row of X.
        self._set_shared_vars({'model_input': X,
                               'model_output': np.zeros(num_samples)})

        ppc = pm.sample_ppc(self.trace, model=self.cached_model, samples=2000)

        y_mean = ppc['y'].mean(axis=0)
        if return_std:
            return y_mean, ppc['y'].std(axis=0)
        return y_mean


class BayesianClassifierMixin(ClassifierMixin):
    """Mixin for classification models in pmlearn

    Expects the host class to provide ``create_model``, ``_inference``,
    ``_set_shared_vars``, ``_set_default_inference_args`` and the
    ``cached_model`` / ``shared_vars`` / ``trace`` attributes.
    """
    def fit(self, X, y, inference_type='advi', minibatch_size=None,
            inference_args=None):
        """ Train the classification model

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        y : numpy array, shape [n_samples, ]

        inference_type : string, specifies which inference method to call.
        Defaults to 'advi'. Currently, only 'advi' and 'nuts' are supported

        minibatch_size : number of samples to include in each minibatch
        for ADVI, defaults to None, so minibatch is not run by default

        inference_args : dict, arguments to be passed to the inference methods.
        Check the PyMC3 docs for permissible values. If no arguments are
        specified, default values will be set. The dict passed in is not
        modified.

        Returns
        -------
        self
        """
        self.num_training_samples, self.num_pred = X.shape

        self.inference_type = inference_type

        if y.ndim != 1:
            y = np.squeeze(y)

        if inference_args:
            # Shallow copy: 'more_replacements' may be added below and must
            # not leak into the caller's dict.
            inference_args = dict(inference_args)
        else:
            # The defaults are None for an unsupported inference type; use an
            # empty dict so that case reaches ``_inference`` and gets its
            # dedicated error instead of failing on item assignment below.
            inference_args = self._set_default_inference_args() or {}

        if self.cached_model is None:
            self.cached_model = self.create_model()

        if minibatch_size:
            with self.cached_model:
                minibatches = {
                    self.shared_vars['model_input']: pm.Minibatch(
                        X, batch_size=minibatch_size),
                    self.shared_vars['model_output']: pm.Minibatch(
                        y, batch_size=minibatch_size),
                }

                inference_args['more_replacements'] = minibatches
        else:
            self._set_shared_vars({'model_input': X, 'model_output': y})

        self._inference(inference_type, inference_args)

        return self

    def predict_proba(self, X, return_std=False):
        """ Perform Prediction

        Predicts probabilities for new data with a trained classification
        model

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        return_std : Boolean
            Whether to return standard deviations with mean values.
            Defaults to False.

        Returns
        -------
        The mean over posterior predictive samples of ``'y'``; when
        ``return_std`` is True, a ``(mean, std)`` tuple instead.

        Raises
        ------
        NotFittedError
            If no trace is available (``fit`` or ``load`` was not run).
        """

        if self.trace is None:
            raise NotFittedError('Run fit on the model before predict.')

        num_samples = X.shape[0]

        if self.cached_model is None:
            self.cached_model = self.create_model()

        # The output variable only needs a placeholder with one entry per
        # row of X.
        self._set_shared_vars({'model_input': X,
                               'model_output': np.zeros(num_samples)})

        ppc = pm.sample_ppc(self.trace, model=self.cached_model, samples=2000)

        y_mean = ppc['y'].mean(axis=0)
        if return_std:
            return y_mean, ppc['y'].std(axis=0)
        return y_mean

    def predict(self, X):
        """
        Predicts labels of new data with a trained model

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        Returns
        -------
        Boolean array: True where the value returned by ``predict_proba``
        is greater than 0.5.
        """
        ppc_mean = self.predict_proba(X)

        return ppc_mean > 0.5


class BayesianDensityMixin(DensityMixin):
    """Mixin for density / mixture models in pmlearn

    Expects the host class to provide ``create_model``, ``_inference``,
    ``_set_shared_vars``, ``_set_default_inference_args`` and the
    ``cached_model`` / ``shared_vars`` / ``trace`` attributes.
    """
    def fit(self, X, num_components, inference_type='advi',
            minibatch_size=None, inference_args=None):
        """
        Train the mixture model

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        num_components : number of mixture components; stored on the
        instance as ``self.num_components``.

        inference_type : string, specifies which inference method to call.
        Defaults to 'advi'. Currently, only 'advi' and 'nuts' are supported

        minibatch_size : number of samples to include in each minibatch for
        ADVI, defaults to None, so minibatch is not run by default

        inference_args : dict, arguments to be passed to the inference methods.
        Check the PyMC3 docs for permissible values. If no arguments are
        specified, default values will be set. The dict passed in is not
        modified.

        Returns
        -------
        self
        """
        self.num_components = num_components
        self.num_training_samples, self.num_pred = X.shape

        self.inference_type = inference_type

        if inference_args:
            # Shallow copy: 'more_replacements' may be added below and must
            # not leak into the caller's dict.
            inference_args = dict(inference_args)
        else:
            # The defaults are None for an unsupported inference type; use an
            # empty dict so that case reaches ``_inference`` and gets its
            # dedicated error instead of failing on item assignment below.
            inference_args = self._set_default_inference_args() or {}

        if self.cached_model is None:
            self.cached_model = self.create_model()

        if minibatch_size:
            with self.cached_model:
                minibatches = {
                    self.shared_vars['model_input']: pm.Minibatch(
                        X, batch_size=minibatch_size)
                }

                inference_args['more_replacements'] = minibatches
        else:
            self._set_shared_vars({'model_input': X})

        self._inference(inference_type, inference_args)

        return self

    def predict_proba(self, X, return_std=False):
        """
        Predicts probabilities of new data with a trained mixture model

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        return_std : Boolean flag of whether to return standard deviations with
        mean probabilities. Defaults to False.

        Returns
        -------
        The mean over posterior predictive samples of ``'probability'``;
        when ``return_std`` is True, a ``(mean, std)`` tuple instead.

        Raises
        ------
        NotFittedError
            If no trace is available (``fit`` or ``load`` was not run).
        """

        if self.trace is None:
            raise NotFittedError('Run fit on the model before predict.')

        if self.cached_model is None:
            self.cached_model = self.create_model()

        self._set_shared_vars({'model_input': X})
        K = self.num_components

        with self.cached_model:
            # Registering the same variable name twice on one model is an
            # error, so reuse the variable added by an earlier call.
            pi = self.cached_model.named_vars.get("probability")
            if pi is None:
                # One unit concentration per component, so the prior length
                # always agrees with ``shape=K``.
                pi = pm.Dirichlet("probability",
                                  a=np.ones(K),
                                  shape=K)
            _vars = [pi]

            ppc = pm.sample_ppc(self.trace,
                                vars=_vars,
                                samples=2000,
                                size=len(X))

        prob_mean = ppc['probability'].mean(axis=0)
        if return_std:
            return prob_mean, ppc['probability'].std(axis=0)
        return prob_mean

    def predict(self, X):
        """
        Predicts with a trained model

        Currently returns the output of ``predict_proba`` unchanged; no
        thresholding into labels is applied.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]
        """
        return self.predict_proba(X)

    def score(self, X, y, cats=None):
        """
        Scores new data with a trained model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        y : numpy array, shape [n_samples, ]

        cats : unused; accepted only for backward compatibility with
        existing callers.

        Returns
        -------
        The value of ``sklearn.metrics.accuracy_score(y, self.predict(X))``.
        """
        from sklearn.metrics import accuracy_score
        # ``predict`` only takes X; forwarding ``cats`` raised a TypeError.
        return accuracy_score(y, self.predict(X))