Source code for pmlearn.linear_model.tests.test_logistic

"""Testing for Logistic regression """

# Authors: Daniel Emaasit <daniel.emaasit@gmail.com>
#
# License: BSD 3 clause

import pytest
import numpy.testing as npt
import pandas.testing as pdt
import shutil
import tempfile

import numpy as np
from pymc3 import summary
from sklearn.model_selection import train_test_split

from pmlearn.exceptions import NotFittedError
from pmlearn.linear_model import HierarchicalLogisticRegression


class TestHierarchicalLogisticRegression(object):

    def setup_method(self):
        def numpy_invlogit(x):
            return 1 / (1 + np.exp(-x))

        self.num_cats = 3
        self.num_pred = 1
        self.num_samples_per_cat = 1000

        # Generate synthetic data: each of the three categories gets its own
        # random intercept (alpha) and slope (beta).
        self.alphas = np.random.randn(self.num_cats)
        self.betas = np.random.randn(self.num_cats, self.num_pred)
        # TODO: make this more efficient; right now, it's very explicit
        # so I understand it.
        x_a = np.random.randn(self.num_samples_per_cat, self.num_pred)
        y_a = np.random.binomial(
            1, numpy_invlogit(self.alphas[0] + np.sum(self.betas[0] * x_a, 1))
        )
        x_b = np.random.randn(self.num_samples_per_cat, self.num_pred)
        y_b = np.random.binomial(
            1, numpy_invlogit(self.alphas[1] + np.sum(self.betas[1] * x_b, 1))
        )
        x_c = np.random.randn(self.num_samples_per_cat, self.num_pred)
        y_c = np.random.binomial(
            1, numpy_invlogit(self.alphas[2] + np.sum(self.betas[2] * x_c, 1))
        )

        X = np.concatenate([x_a, x_b, x_c])
        y = np.concatenate([y_a, y_b, y_c])
        # Note: the deprecated np.int alias is replaced by the builtin int.
        cats = np.concatenate([
            np.zeros(self.num_samples_per_cat, dtype=int),
            np.ones(self.num_samples_per_cat, dtype=int),
            2 * np.ones(self.num_samples_per_cat, dtype=int)
        ])

        self.X_train, self.X_test, self.cat_train, self.cat_test, \
            self.y_train, self.y_test = train_test_split(
                X, cats, y, test_size=0.4
            )

        self.advi_hlr = HierarchicalLogisticRegression()

        self.test_dir = tempfile.mkdtemp()

    def teardown_method(self):
        shutil.rmtree(self.test_dir)
class TestHierarchicalLogisticRegressionFit(TestHierarchicalLogisticRegression):

    def test_advi_fit_returns_correct_model(self):
        # Note: print is here so PyMC3 output won't overwrite the test name
        print('')
        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})

        npt.assert_equal(self.num_cats, self.advi_hlr.num_cats)
        npt.assert_equal(self.num_pred, self.advi_hlr.num_pred)

        # TODO: Figure out best way to test
        # np.testing.assert_almost_equal(self.alphas,
        #                                self.advi_hlr.trace['alphas'].mean(),
        #                                decimal=1)
        # np.testing.assert_almost_equal(self.betas,
        #                                self.advi_hlr.trace['betas'].mean(),
        #                                decimal=1)

        # For now, just check that the estimated parameters
        # have the correct signs
        npt.assert_equal(
            np.sign(self.alphas),
            np.sign(self.advi_hlr.trace['alpha'].mean(axis=0))
        )
        npt.assert_equal(
            np.sign(self.betas),
            np.sign(self.advi_hlr.trace['beta'].mean(axis=0))
        )
class TestHierarchicalLogisticRegressionPredictProba(
        TestHierarchicalLogisticRegression):

    def test_predict_proba_returns_probabilities(self):
        print('')
        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})
        probs = self.advi_hlr.predict_proba(self.X_test, self.cat_test)
        npt.assert_equal(probs.shape, self.y_test.shape)

    def test_predict_proba_returns_probabilities_and_std(self):
        print('')
        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})
        probs, stds = self.advi_hlr.predict_proba(self.X_test, self.cat_test,
                                                  return_std=True)
        npt.assert_equal(probs.shape, self.y_test.shape)
        npt.assert_equal(stds.shape, self.y_test.shape)

    def test_predict_proba_raises_error_if_not_fit(self):
        with pytest.raises(NotFittedError):
            advi_hlr = HierarchicalLogisticRegression()
            advi_hlr.predict_proba(self.X_train, self.cat_train)
class TestHierarchicalLogisticRegressionPredict(
        TestHierarchicalLogisticRegression):

    def test_predict_returns_predictions(self):
        print('')
        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})
        preds = self.advi_hlr.predict(self.X_test, self.cat_test)
        npt.assert_equal(preds.shape, self.y_test.shape)
class TestHierarchicalLogisticRegressionScore(
        TestHierarchicalLogisticRegression):

    def test_score_scores(self):
        print('')
        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})
        score = self.advi_hlr.score(self.X_test, self.y_test, self.cat_test)
        naive_score = np.mean(self.y_test)
        npt.assert_array_less(naive_score, score)
class TestHierarchicalLogisticRegressionSaveandLoad(
        TestHierarchicalLogisticRegression):

    def test_save_and_load_work_correctly(self):
        print('')
        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})
        probs1 = self.advi_hlr.predict_proba(self.X_test, self.cat_test)
        self.advi_hlr.save(self.test_dir)

        hlr2 = HierarchicalLogisticRegression()
        hlr2.load(self.test_dir)

        npt.assert_equal(self.advi_hlr.num_cats, hlr2.num_cats)
        npt.assert_equal(self.advi_hlr.num_pred, hlr2.num_pred)
        npt.assert_equal(self.advi_hlr.num_training_samples,
                         hlr2.num_training_samples)
        pdt.assert_frame_equal(summary(self.advi_hlr.trace),
                               summary(hlr2.trace))

        probs2 = hlr2.predict_proba(self.X_test, self.cat_test)
        npt.assert_almost_equal(probs2, probs1, decimal=1)