Source code for pmlearn.linear_model.base

"""
Generalized Linear models.
"""

# Authors: Nicole Carlson <nicole@parsingscience.com>
#          Daniel Emaasit <daniel.emaasit@gmail.com>
# License: BSD 3 clause

import numpy as np
import pymc3 as pm
import theano
import theano.tensor as tt
from sklearn.base import ClassifierMixin
from sklearn.metrics import accuracy_score

from ..base import BayesianModel, BayesianRegressorMixin
from ..exceptions import NotFittedError


class BayesianLinearClassifierMixin(ClassifierMixin):
    """Mixin for linear classifier models in pmlearn."""

    def fit(self, X, y, cats, inference_type='advi', minibatch_size=None,
            inference_args=None):
        """
        Train a linear classifier model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        y : numpy array, shape [n_samples, ]

        cats : numpy array, shape [n_samples, ]

        inference_type : string, specifies which inference method to call.
            Defaults to 'advi'. Currently, only 'advi' and 'nuts' are
            supported.

        minibatch_size : int, number of samples to include in each minibatch
            for ADVI. Defaults to None, so minibatching is not run by
            default.

        inference_args : dict, arguments to be passed to the inference
            methods. Check the PyMC3 docs for permissible values. If no
            arguments are specified, default values will be set.
        """
        self.num_cats = len(np.unique(cats))
        self.num_training_samples, self.num_pred = X.shape
        self.inference_type = inference_type

        if y.ndim != 1:
            y = np.squeeze(y)

        if not inference_args:
            inference_args = self._set_default_inference_args()

        if self.cached_model is None:
            self.cached_model = self.create_model()

        if minibatch_size:
            with self.cached_model:
                minibatches = {
                    self.shared_vars['model_input']: pm.Minibatch(
                        X, batch_size=minibatch_size),
                    self.shared_vars['model_output']: pm.Minibatch(
                        y, batch_size=minibatch_size),
                    self.shared_vars['model_cats']: pm.Minibatch(
                        cats, batch_size=minibatch_size)
                }

                inference_args['more_replacements'] = minibatches
        else:
            self._set_shared_vars({
                'model_input': X,
                'model_output': y,
                'model_cats': cats
            })

        self._inference(inference_type, inference_args)

        return self
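
    # Usage sketch (illustrative, not part of the original module). Assuming
    # a concrete classifier that mixes this class in, such as
    # pmlearn.linear_model.HierarchicalLogisticRegression, and pre-loaded
    # arrays X_train, y_train, cats_train:
    #
    # >>> from pmlearn.linear_model import HierarchicalLogisticRegression
    # >>> model = HierarchicalLogisticRegression()
    # >>> model.fit(X_train, y_train, cats_train,
    # ...           inference_type='advi', minibatch_size=100)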

    def predict_proba(self, X, cats, return_std=False):
        """
        Predict probabilities of new data with a trained linear classifier.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        cats : numpy array, shape [n_samples, ]

        return_std : bool, whether to return standard deviations along with
            the mean probabilities. Defaults to False.
        """
        if self.trace is None:
            raise NotFittedError('Run fit on the model before predict.')

        num_samples = X.shape[0]

        if self.cached_model is None:
            self.cached_model = self.create_model()

        self._set_shared_vars({
            'model_input': X,
            'model_output': np.zeros(num_samples, dtype='int'),
            'model_cats': cats
        })

        ppc = pm.sample_ppc(self.trace, model=self.cached_model, samples=2000)

        if return_std:
            return ppc['y'].mean(axis=0), ppc['y'].std(axis=0)
        else:
            return ppc['y'].mean(axis=0)
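
    # Continuing the sketch above (X_test and cats_test are illustrative):
    # with return_std=True the method returns both the posterior predictive
    # mean and its standard deviation per sample, so the uncertainty of each
    # probability estimate comes for free.
    #
    # >>> proba, proba_std = model.predict_proba(X_test, cats_test,
    # ...                                        return_std=True)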

    def predict(self, X, cats):
        """
        Predicts labels of new data with a trained model

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        cats : numpy array, shape [n_samples, ]
        """
        ppc_mean = self.predict_proba(X, cats)

        pred = ppc_mean > 0.5

        return pred

    def score(self, X, y, cats):
        """
        Scores new data with a trained model.

        Parameters
        ----------
        X : numpy array, shape [n_samples, n_features]

        y : numpy array, shape [n_samples, ]

        cats : numpy array, shape [n_samples, ]
        """
        return accuracy_score(y, self.predict(X, cats))
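
# Sketch of hard prediction and scoring with the mixin above (names are
# illustrative): predict() thresholds the posterior predictive mean at 0.5,
# and score() compares the result against the true labels via
# sklearn's accuracy_score.
#
# >>> y_pred = model.predict(X_test, cats_test)
# >>> acc = model.score(X_test, y_test, cats_test)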


class LinearRegression(BayesianModel, BayesianRegressorMixin):
    """Linear Regression built using PyMC3."""

    def __init__(self):
        super(LinearRegression, self).__init__()

    def create_model(self):
        """
        Creates and returns the PyMC3 model.

        Note: The size of the shared variables must match the size of the
        training data. Otherwise, setting the shared variables later will
        raise an error. See http://docs.pymc.io/advanced_theano.html

        Returns
        -------
        the PyMC3 model
        """
        model_input = theano.shared(
            np.zeros([self.num_training_samples, self.num_pred]))

        model_output = theano.shared(np.zeros(self.num_training_samples))

        self.shared_vars = {
            'model_input': model_input,
            'model_output': model_output,
        }

        model = pm.Model()

        with model:
            alpha = pm.Normal('alpha', mu=0, sd=100, shape=1)
            betas = pm.Normal('betas', mu=0, sd=100,
                              shape=(1, self.num_pred))
            s = pm.HalfNormal('s', tau=1)

            mean = alpha + tt.sum(betas * model_input, 1)

            y = pm.Normal('y', mu=mean, sd=s, observed=model_output)

        return model
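
    # For intuition, the graph built above is equivalent to this standalone
    # PyMC3 model (a sketch with synthetic data; n, p, and the arrays are
    # made up for illustration):
    #
    # >>> import numpy as np
    # >>> import pymc3 as pm
    # >>> n, p = 100, 3
    # >>> X = np.random.randn(n, p)
    # >>> y_obs = 1.0 + X @ np.array([1., -2., .5]) + 0.3 * np.random.randn(n)
    # >>> with pm.Model():
    # ...     alpha = pm.Normal('alpha', mu=0, sd=100, shape=1)
    # ...     betas = pm.Normal('betas', mu=0, sd=100, shape=(1, p))
    # ...     s = pm.HalfNormal('s', tau=1)
    # ...     y = pm.Normal('y', mu=alpha + (betas * X).sum(axis=1),
    # ...                   sd=s, observed=y_obs)
    # ...     trace = pm.sample(1000)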

    def save(self, file_prefix):
        params = {
            'inference_type': self.inference_type,
            'num_pred': self.num_pred,
            'num_training_samples': self.num_training_samples
        }

        super(LinearRegression, self).save(file_prefix, params)

    def load(self, file_prefix):
        params = super(LinearRegression, self).load(
            file_prefix, load_custom_params=True)

        self.inference_type = params['inference_type']
        self.num_pred = params['num_pred']
        self.num_training_samples = params['num_training_samples']
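
# Round-trip persistence sketch (the file prefix is illustrative; save() and
# load() delegate to BayesianModel, carrying the extra params recorded above):
#
# >>> model = LinearRegression()
# >>> model.fit(X_train, y_train)
# >>> model.save('pickle_jar/linear_regression')
# >>> model_new = LinearRegression()
# >>> model_new.load('pickle_jar/linear_regression')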