Module model_tools.model_evaluation_tools
A module for calculating various statistics not included in the sklearn or scipy packages.
Source code
# -*- coding: utf-8 -*-
"""
A module for calculating various statistics not included
in the sklearn or scipy packages.
"""
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.metrics import roc_curve
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
def _cox_snell_r2(log_likelihood, null_log_likelihood, n):
    """
    Calculates the Cox and Snell R2 from the model and null log-likelihoods;
    see cox_snell_r2 for the full description and reference.
    """
    return 1 - np.exp((2.0 / n) * (null_log_likelihood - log_likelihood))
def log_likelihood(true_y, probability_y):
"""
    Calculates the log likelihood using the log loss function, as described in [1].
Parameters
----------
true_y : np.array
an array of true observations
probability_y : np.array
an array of predicted probabilities from a model
Returns
-------
log_likelihood : float
the calculated loglikelihood
References
----------
[1] https://stackoverflow.com/questions/48185090/how-to-get-the-log-likelihood-for-a-logistic-regression-model-in-sklearn
# unit tests for log_likelihood function
>>> import numpy as np
>>> y_true = np.array([0, 1, 1])
>>> y_pred = np.array([0.1, 0.2, 0.9])
>>> from sklearn.metrics import log_loss
>>> lloss = log_loss(y_true, y_pred)
>>> log_likelihood_elements = y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred)
>>> lloss
0.6067196479165843
>>> lloss == -np.sum(log_likelihood_elements)/len(y_true)
True
"""
return - log_loss(true_y, probability_y, normalize=False)
def null_model_probability(true_y):
"""
    Calculates the probability of belonging to the positive class assuming
    all of the model variables are null (the intercept-only model).
Parameters
----------
true_y : numpy.array
an array of true values
Returns
-------
null_model_probability : numpy.array
the intercept value calculated for each data point
"""
return sum(true_y) / float(true_y.shape[0]) * np.ones(true_y.shape)
def mcfaddens_r2(true_y, predicted_proba):
"""
    Calculates McFadden's pseudo R2:
    R2_mcf = 1 - log_likelihood / null_log_likelihood
Parameters
----------
true_y : np.array
an array of true observations
    predicted_proba : np.array
        an array of predicted probabilities from a model
Returns
-------
mcfaddens_r2 : float
        the calculated McFadden pseudo-R2
References
----------
[1] https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-what-are-pseudo-r-squareds/
# unit tests
    >>> import statsmodels.api as sm
>>> y = [1, 1, 1, 0, 0, 0]
>>> x = [1, 2, 3, 4, 5, 6]
>>> model = sm.Logit(y, x)
>>> result = model.fit(disp=0)
>>> round(result.prsquared, 4) == round(mcfaddens_r2(y, result.predict(x)),4)
True
"""
true_y = np.array(true_y)
predicted_proba = np.array(predicted_proba)
null_prob = null_model_probability(true_y)
model_log_likelihood = log_likelihood(true_y, predicted_proba)
null_log_likelihood = log_likelihood(true_y, null_prob)
return 1 - (model_log_likelihood / null_log_likelihood)
def cox_snell_r2(true_y, predicted_proba):
"""
    Calculates the Cox and Snell R2:
    r2 = 1 - (null_model_likelihood / model_likelihood)^(2/n)
       = 1 - exp((2/n) * (null_log_likelihood - model_log_likelihood))
Parameters
----------
    true_y : np.array
        an array of true observations
    predicted_proba : np.array
        an array of predicted probabilities from a model
Returns
-------
cox_and_snell_R2 : float
the calculated C&S pseudo-R2
References
----------
[1] https://statisticalhorizons.com/r2logistic
# unit tests
"""
true_y = np.array(true_y)
n = true_y.shape[0]
predicted_proba = np.array(predicted_proba)
null_prob = null_model_probability(true_y)
model_log_likelihood = log_likelihood(true_y, predicted_proba)
null_log_likelihood = log_likelihood(true_y, null_prob)
cox_snell_r2 = _cox_snell_r2(model_log_likelihood, null_log_likelihood, n)
return cox_snell_r2
def zscale_dataframe(df, columns_to_scale):
"""
Scales specified columns in dataframe
Parameters
----------
df : pandas.DataFrame
dataframe of data for scaling
columns_to_scale : list
list of columns that should be scaled using the standard score
Returns
-------
transformed_df : pandas.DataFrame
dataframe of scaled columns along with non-scaled columns
"""
df_to_scale = df[columns_to_scale].copy()
df_to_not_scale = df.drop(columns_to_scale, axis=1).copy()
ss = StandardScaler()
ss.fit(df_to_scale)
ss_data = ss.transform(df_to_scale)
transformed_df = pd.DataFrame(ss_data, index=df_to_scale.index, columns=df_to_scale.columns)
transformed_df = transformed_df.join(df_to_not_scale)
return transformed_df
def calculate_covariance_matrix(model, train_x):
"""
returns the covariance matrix of the fitted model using
the inverse of the hessian of the log-likelihood
Parameters
----------
model : sklearn.model
sklearn like model. must have predict_proba method
train_x : pandas.DataFrame
training data that was used to train the model
Returns
-------
variance_covariance_matrix : numpy.array
the covariance matrix of the model feature interactions
References
----------
[1] https://stackoverflow.com/questions/25122999/scikit-learn-how-to-check-coefficients-significance
"""
predicted_probability = model.predict_proba(train_x)
training_length = train_x.shape[0]
coefficients_length = train_x.shape[1] + 1
train_x_with_ones = np.matrix(np.insert(np.array(train_x), 0, 1, axis = 1))
zeros = np.zeros((coefficients_length, coefficients_length))
for i in range(training_length):
zeros = zeros + np.dot(np.transpose(train_x_with_ones[i, :]), train_x_with_ones[i,:]) * predicted_probability[i,1] * predicted_probability[i,0]
variance_covariance_matrix = np.linalg.inv(np.matrix(zeros))
return variance_covariance_matrix
def calculuate_logit_t_statistic(variance_covariance_matrix, model):
"""
    Calculates the t-statistic for each coefficient of a fitted sklearn
    logistic regression, since sklearn focuses on prediction and does not
    report coefficient significance
Parameters
----------
variance_covariance_matrix : numpy.array
the covariance matrix of the model feature interactions
model : sklearn.model
sklearn like model. must have predict_proba method
Returns
-------
t_statistic : numpy.array
the calculated t-statistics for each model coefficient
"""
standard_error = np.sqrt(np.diag(variance_covariance_matrix))
coefficients = np.concatenate([model.intercept_, model.coef_[0]])
t_statistic = coefficients/standard_error
return t_statistic
def calculate_logit_pvalue(t_statistics):
"""
Calculates the coefficient p-value for a sklearn model
Parameters
----------
    t_statistics : numpy.array
the calculated t-statistics for each model coefficient
Returns
-------
p : numpy.array
an array of p-values for the inputted t statistics
"""
p = (1 - norm.cdf(abs(t_statistics))) * 2
return p
def fit_and_score_model(model, data : list, metrics : dict, **kwargs):
"""
Fits any sklearn classification model and produces
metric scores from metrics input.
The kwargs should be for the model hyperparameters.
Parameters
----------
model : sklearn.model
sklearn model to fit/test
    data : list
        list of pandas DataFrames in the order returned by
        sklearn.model_selection.train_test_split
metrics : dict
dict of model assessment metrics. keys are
only allowed to be the exact metric function
name.
**kwargs : dict
any arguments for the model hyperparameters
Returns
-------
scores : dict
scores for model for the input metrics
"""
train_x, test_x, train_y, test_y = data
model = model(**kwargs)
model.fit(train_x, train_y)
predicted = model.predict(test_x)
predicted_proba = model.predict_proba(test_x)
rocdf = pd.DataFrame({'test_true':test_y})
rocdf['test_pred'] = predicted
fpr, tpr, thresholds = roc_curve(rocdf['test_true'], predicted_proba[:,1])
scores = {}
for key in metrics:
if key in ('roc_auc_score', 'log_likelihood', 'cox_snell_r2', 'mcfaddens_r2'):
# print('predicted probability test')
scores[key] = metrics[key](rocdf['test_true'], predicted_proba[:,1])
elif key in ('f1_score', 'accuracy_score', 'confusion_matrix'):
# print('predicted value test')
scores[key] = metrics[key](test_y, predicted)
else:
raise ValueError('{k} is not an accepted metric'.format(k=key))
return scores
if __name__ == '__main__':
import doctest
doctest.testmod()
Functions
def calculate_covariance_matrix(model, train_x)
Returns the covariance matrix of the fitted model using the inverse of the Hessian of the log-likelihood.
Parameters
model : sklearn.model
    sklearn-like model; must have a predict_proba method
train_x : pandas.DataFrame
    training data that was used to train the model
Returns
variance_covariance_matrix : numpy.array
    the covariance matrix of the model feature interactions
References
[1] https://stackoverflow.com/questions/25122999/scikit-learn-how-to-check-coefficients-significance
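A minimal usage sketch (not part of the module); the small synthetic dataset and a LogisticRegression fit below are illustrative assumptions:

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from model_tools.model_evaluation_tools import calculate_covariance_matrix

rng = np.random.RandomState(0)
train_x = pd.DataFrame(rng.normal(size=(100, 2)), columns=['x1', 'x2'])
train_y = (train_x['x1'] + rng.normal(size=100) > 0).astype(int)

model = LogisticRegression().fit(train_x, train_y)
cov = calculate_covariance_matrix(model, train_x)
print(cov.shape)  # (3, 3): intercept plus the two features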
def calculate_logit_pvalue(t_statistics)
Calculates the coefficient p-values for a sklearn model.
Parameters
t_statistics : numpy.array
    the calculated t-statistics for each model coefficient
Returns
p : numpy.array
    an array of p-values for the inputted t statistics
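A short worked example (the t-statistic values are illustrative only); each p-value is the two-sided tail probability under the standard normal distribution:

import numpy as np
from model_tools.model_evaluation_tools import calculate_logit_pvalue

t_stats = np.array([0.5, 1.96, 3.0])
print(calculate_logit_pvalue(t_stats))  # approximately [0.617, 0.050, 0.003]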
def calculuate_logit_t_statistic(variance_covariance_matrix, model)
Calculates the t-statistic for each coefficient of a fitted sklearn logistic regression, since sklearn focuses on prediction and does not report coefficient significance.
Parameters
variance_covariance_matrix : numpy.array
    the covariance matrix of the model feature interactions
model : sklearn.model
    sklearn-like model; must have a predict_proba method
Returns
t_statistic : numpy.array
    the calculated t-statistics for each model coefficient
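A hedged sketch of the full coefficient-significance chain on synthetic data (covariance matrix, then t-statistics, then p-values); the data and settings are assumptions made for illustration:

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from model_tools.model_evaluation_tools import (
    calculate_covariance_matrix, calculuate_logit_t_statistic, calculate_logit_pvalue)

rng = np.random.RandomState(0)
train_x = pd.DataFrame(rng.normal(size=(200, 2)), columns=['x1', 'x2'])
train_y = (train_x['x1'] + 0.5 * rng.normal(size=200) > 0).astype(int)

model = LogisticRegression().fit(train_x, train_y)
cov = calculate_covariance_matrix(model, train_x)
t_stats = calculuate_logit_t_statistic(cov, model)  # one value per coefficient, intercept first
p_values = calculate_logit_pvalue(t_stats)          # two-sided p-values from the normal distribution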
def cox_snell_r2(true_y, predicted_proba)
Calculates the Cox and Snell R2:
r2 = 1 - (null_model_likelihood / model_likelihood)^(2/n), equivalently 1 - exp((2/n) * (null_log_likelihood - model_log_likelihood)).
Parameters
true_y : np.array
    an array of true observations
predicted_proba : np.array
    an array of predicted probabilities from a model
Returns
cox_and_snell_R2 : float
    the calculated C&S pseudo-R2
References
[1] https://statisticalhorizons.com/r2logistic
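A small illustrative sketch (hand-made labels and probabilities, not from the module's tests):

import numpy as np
from model_tools.model_evaluation_tools import cox_snell_r2

y_true = np.array([0, 0, 1, 1, 1])
y_prob = np.array([0.2, 0.3, 0.7, 0.8, 0.9])
r2 = cox_snell_r2(y_true, y_prob)  # roughly 0.57 for these values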
def fit_and_score_model(model, data, metrics, **kwargs)
Fits any sklearn classification model and produces metric scores from the metrics input.
The kwargs should be the model hyperparameters.
Parameters
model : sklearn.model
    sklearn model to fit/test
data : list
    list of pandas DataFrames in the order returned by sklearn.model_selection.train_test_split
metrics : dict
    dict of model assessment metrics; keys must be the exact metric function names
**kwargs : dict
    any arguments for the model hyperparameters
Returns
scores : dict
    scores for the model for the input metrics
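A hedged usage sketch with a LogisticRegression and two metrics; the dataset and hyperparameters are illustrative assumptions, and the metric keys must match the metric function names exactly:

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from model_tools.model_evaluation_tools import fit_and_score_model

rng = np.random.RandomState(0)
x = pd.DataFrame(rng.normal(size=(200, 2)), columns=['x1', 'x2'])
y = pd.Series((x['x1'] + rng.normal(size=200) > 0).astype(int))

data = train_test_split(x, y, test_size=0.25, random_state=0)
metrics = {'roc_auc_score': roc_auc_score, 'accuracy_score': accuracy_score}
scores = fit_and_score_model(LogisticRegression, data, metrics, C=1.0)
print(scores)  # e.g. {'roc_auc_score': ..., 'accuracy_score': ...}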
def log_likelihood(true_y, probability_y)
Calculates the log likelihood using the log loss function, as described in [1].
Parameters
true_y : np.array
    an array of true observations
probability_y : np.array
    an array of predicted probabilities from a model
Returns
log_likelihood : float
    the calculated log likelihood
References
[1] https://stackoverflow.com/questions/48185090/how-to-get-the-log-likelihood-for-a-logistic-regression-model-in-sklearn
unit tests for log_likelihood function
>>> import numpy as np
>>> y_true = np.array([0, 1, 1])
>>> y_pred = np.array([0.1, 0.2, 0.9])
>>> from sklearn.metrics import log_loss
>>> lloss = log_loss(y_true, y_pred)
>>> log_likelihood_elements = y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred)
>>> lloss
0.6067196479165843
>>> lloss == -np.sum(log_likelihood_elements)/len(y_true)
True
def mcfaddens_r2(true_y, predicted_proba)
Calculates McFadden's pseudo R2:
R2_mcf = 1 - log_likelihood / null_log_likelihood
Parameters
true_y : np.array
    an array of true observations
predicted_proba : np.array
    an array of predicted probabilities from a model
Returns
mcfaddens_r2 : float
    the calculated McFadden pseudo-R2
References
[1] https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-what-are-pseudo-r-squareds/
unit tests
>>> import statsmodels.api as sm
>>> y = [1, 1, 1, 0, 0, 0]
>>> x = [1, 2, 3, 4, 5, 6]
>>> model = sm.Logit(y, x)
>>> result = model.fit(disp=0)
>>> round(result.prsquared, 4) == round(mcfaddens_r2(y, result.predict(x)), 4)
True
def null_model_probability(true_y)
Calculates the probability of belonging to the positive class assuming all of the model variables are null (the intercept-only model).
Parameters
true_y : numpy.array
    an array of true values
Returns
null_model_probability : numpy.array
    the intercept value calculated for each data point
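A tiny illustration (values chosen only for clarity): the result is the observed positive-class rate repeated for every observation:

import numpy as np
from model_tools.model_evaluation_tools import null_model_probability

print(null_model_probability(np.array([0, 1, 1, 1])))  # [0.75 0.75 0.75 0.75]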
def zscale_dataframe(df, columns_to_scale)
Scales specified columns in a dataframe.
Parameters
df : pandas.DataFrame
    dataframe of data for scaling
columns_to_scale : list
    list of columns that should be scaled using the standard score
Returns
transformed_df : pandas.DataFrame
    dataframe of scaled columns along with non-scaled columns
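A short sketch (the column names are illustrative assumptions): scale two numeric columns and leave an id column untouched:

import pandas as pd
from model_tools.model_evaluation_tools import zscale_dataframe

df = pd.DataFrame({'id': [1, 2, 3, 4],
                   'height': [150.0, 160.0, 170.0, 180.0],
                   'weight': [50.0, 60.0, 70.0, 80.0]})
scaled = zscale_dataframe(df, ['height', 'weight'])
# 'height' and 'weight' now have mean 0 and unit variance; 'id' is joined back unscaled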