Source code for pmlearn.linear_model.base

"""
Generalized Linear models.
"""

# Authors: Nicole Carlson <nicole@parsingscience.com>
#          Daniel Emaasit <daniel.emaasit@gmail.com>
# License: BSD 3 clause

import numpy as np
import pymc3 as pm
import theano
import theano.tensor as tt
from sklearn.base import ClassifierMixin
from sklearn.metrics import accuracy_score

from ..base import BayesianModel, BayesianRegressorMixin
from ..exceptions import NotFittedError


class BayesianLinearClassifierMixin(ClassifierMixin):
    """Mixin for linear classifier models in pmlearn."""

    def fit(self, X, y, cats, inference_type='advi', minibatch_size=None,
            inference_args=None):
        """
        Train a linear classifier model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        y : numpy array, shape [n_samples, ]

        cats : numpy array, shape [n_samples, ]

        inference_type : string, specifies which inference method to call.
            Defaults to 'advi'. Currently, only 'advi' and 'nuts' are
            supported.

        minibatch_size : int, number of samples to include in each minibatch
            for ADVI. Defaults to None, so minibatching is not run by
            default.

        inference_args : dict, arguments to be passed to the inference
            methods. Check the PyMC3 docs for permissible values. If no
            arguments are specified, default values will be set.
        """
        self.num_cats = len(np.unique(cats))
        self.num_training_samples, self.num_pred = X.shape
        self.inference_type = inference_type

        if y.ndim != 1:
            y = np.squeeze(y)

        if not inference_args:
            inference_args = self._set_default_inference_args()

        if self.cached_model is None:
            self.cached_model = self.create_model()

        if minibatch_size:
            with self.cached_model:
                minibatches = {
                    self.shared_vars['model_input']: pm.Minibatch(
                        X, batch_size=minibatch_size),
                    self.shared_vars['model_output']: pm.Minibatch(
                        y, batch_size=minibatch_size),
                    self.shared_vars['model_cats']: pm.Minibatch(
                        cats, batch_size=minibatch_size)
                }

                inference_args['more_replacements'] = minibatches
        else:
            self._set_shared_vars({
                'model_input': X,
                'model_output': y,
                'model_cats': cats
            })

        self._inference(inference_type, inference_args)

        return self
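
    # Usage sketch (illustrative, not part of the original module). Assuming
    # a concrete classifier that mixes this class in, such as
    # pmlearn.linear_model.HierarchicalLogisticRegression, and pre-loaded
    # arrays X_train, y_train, cats_train:
    #
    # >>> from pmlearn.linear_model import HierarchicalLogisticRegression
    # >>> model = HierarchicalLogisticRegression()
    # >>> model.fit(X_train, y_train, cats_train,
    # ...           inference_type='advi', minibatch_size=100)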

    def predict_proba(self, X, cats, return_std=False):
        """
        Predict probabilities of new data with a trained linear classifier.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        cats : numpy array, shape [n_samples, ]

        return_std : bool, whether to return standard deviations along with
            the mean probabilities. Defaults to False.
        """
        if self.trace is None:
            raise NotFittedError('Run fit on the model before predict.')

        num_samples = X.shape[0]

        if self.cached_model is None:
            self.cached_model = self.create_model()

        self._set_shared_vars({
            'model_input': X,
            'model_output': np.zeros(num_samples, dtype='int'),
            'model_cats': cats
        })

        ppc = pm.sample_ppc(self.trace, model=self.cached_model, samples=2000)

        if return_std:
            return ppc['y'].mean(axis=0), ppc['y'].std(axis=0)
        else:
            return ppc['y'].mean(axis=0)
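
    # Continuing the sketch above (X_test and cats_test are illustrative):
    # with return_std=True the method returns both the posterior predictive
    # mean and its standard deviation per sample, so the uncertainty of each
    # probability estimate comes for free.
    #
    # >>> proba, proba_std = model.predict_proba(X_test, cats_test,
    # ...                                        return_std=True)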

    def predict(self, X, cats):
        """
        Predicts labels of new data with a trained model

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        cats : numpy array, shape [n_samples, ]
        """
        ppc_mean = self.predict_proba(X, cats)

        pred = ppc_mean > 0.5

        return pred

    def score(self, X, y, cats):
        """
        Scores new data with a trained model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        y : numpy array, shape [n_samples, ]

        cats : numpy array, shape [n_samples, ]
        """
        return accuracy_score(y, self.predict(X, cats))
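
# Sketch of hard prediction and scoring with the mixin above (names are
# illustrative): predict() thresholds the posterior predictive mean at 0.5,
# and score() compares the result against the true labels via
# sklearn's accuracy_score.
#
# >>> y_pred = model.predict(X_test, cats_test)
# >>> acc = model.score(X_test, y_test, cats_test)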


class LinearRegression(BayesianModel, BayesianRegressorMixin):
    """Linear Regression built using PyMC3."""

    def __init__(self):
        super(LinearRegression, self).__init__()

    def create_model(self):
        """
        Creates and returns the PyMC3 model.

        Note: The size of the shared variables must match the size of the
        training data. Otherwise, setting the shared variables later will
        raise an error. See http://docs.pymc.io/advanced_theano.html

        Returns
        -------
        the PyMC3 model
        """
        model_input = theano.shared(
            np.zeros([self.num_training_samples, self.num_pred]))

        model_output = theano.shared(np.zeros(self.num_training_samples))

        self.shared_vars = {
            'model_input': model_input,
            'model_output': model_output,
        }

        model = pm.Model()

        with model:
            alpha = pm.Normal('alpha', mu=0, sd=100, shape=1)
            betas = pm.Normal('betas', mu=0, sd=100,
                              shape=(1, self.num_pred))
            s = pm.HalfNormal('s', tau=1)

            mean = alpha + tt.sum(betas * model_input, 1)

            y = pm.Normal('y', mu=mean, sd=s, observed=model_output)

        return model
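
    # For intuition, the graph built above is equivalent to this standalone
    # PyMC3 model (a sketch with synthetic data; n, p, and the arrays are
    # made up for illustration):
    #
    # >>> import numpy as np
    # >>> import pymc3 as pm
    # >>> n, p = 100, 3
    # >>> X = np.random.randn(n, p)
    # >>> y_obs = 1.0 + X @ np.array([1., -2., .5]) + 0.3 * np.random.randn(n)
    # >>> with pm.Model():
    # ...     alpha = pm.Normal('alpha', mu=0, sd=100, shape=1)
    # ...     betas = pm.Normal('betas', mu=0, sd=100, shape=(1, p))
    # ...     s = pm.HalfNormal('s', tau=1)
    # ...     y = pm.Normal('y', mu=alpha + (betas * X).sum(axis=1),
    # ...                   sd=s, observed=y_obs)
    # ...     trace = pm.sample(1000)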

    def save(self, file_prefix):
        params = {
            'inference_type': self.inference_type,
            'num_pred': self.num_pred,
            'num_training_samples': self.num_training_samples
        }

        super(LinearRegression, self).save(file_prefix, params)

    def load(self, file_prefix):
        params = super(LinearRegression, self).load(
            file_prefix, load_custom_params=True)

        self.inference_type = params['inference_type']
        self.num_pred = params['num_pred']
        self.num_training_samples = params['num_training_samples']
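
# Round-trip persistence sketch (the file prefix is illustrative; save() and
# load() delegate to BayesianModel, carrying the extra params recorded above):
#
# >>> model = LinearRegression()
# >>> model.fit(X_train, y_train)
# >>> model.save('pickle_jar/linear_regression')
# >>> model_new = LinearRegression()
# >>> model_new.load('pickle_jar/linear_regression')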