Module `model_tools.model_evaluation_tools`

A module for calculating various statistics not included in the sklearn or scipy packages.

Source code

# -*- coding: utf-8 -*-
"""
A module for calculating various statistics not included
in the sklearn or scipy packages.
"""
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.metrics import roc_curve
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler

def _cox_snell_r2(log_likelihood, null_log_likelihood, n):
    """
    calculates the cox and snell R2 as described in the following link:
    """
    return 1 - (null_log_likelihood / log_likelihood)**(2 / n)

def log_likelihood(true_y, probability_y):
    """
    calculates the log likelihood using the log loss function as described here:
    
    
    Parameters
    ----------
    true_y : np.array
             an array of true observations
    probability_y : np.array
                    an array of predicted probabilities from a model
    Returns
    -------
    log_likelihood : float
                     the calculated loglikelihood

    References
    ----------
    [1] https://stackoverflow.com/questions/48185090/how-to-get-the-log-likelihood-for-a-logistic-regression-model-in-sklearn

    # unit tests for log_likelihood function
    >>> import numpy as np
    >>> y_true = np.array([0, 1, 1])
    >>> y_pred = np.array([0.1, 0.2, 0.9])
    >>> from sklearn.metrics import log_loss
    >>> lloss = log_loss(y_true, y_pred)
    >>> log_likelihood_elements = y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred)
    >>> lloss
    0.6067196479165843
    >>> lloss == -np.sum(log_likelihood_elements)/len(y_true)
    True
    """
    return - log_loss(true_y, probability_y, normalize=False)

def null_model_probability(true_y):
    """
    Calculates the probability of class belonging assuming
    all of the model variables are null.

    Parameters
    ----------
    true_y : numpy.array
             an array of true values

    Returns
    -------
    null_model_probability : numpy.array
                             the intercept value calculated for each data point
    """
    return sum(true_y) / float(true_y.shape[0]) * np.ones(true_y.shape)

def mcfaddens_r2(true_y, predicted_proba):
    """
    Calculates the mcfaddens pseudo R2

    R2_mcf = 1 - loglikelihood/null_logllikelihood
    
    Parameters
    ----------
    true_y : np.array
             an array of true observations
    probability_y : np.array
                    an array of predicted probabilities from a model
    Returns
    -------
    mcfaddens_r2 : float
                     the calculated loglikelihood

    References
    ----------
    [1] https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-what-are-pseudo-r-squareds/

    # unit tests
    >>> import statsmodels.formula.api as sm
    >>> y = [1, 1, 1, 0, 0, 0]
    >>> x = [1, 2, 3, 4, 5, 6]
    >>> model = sm.Logit(y, x)
    >>> result = model.fit(disp=0)
    >>> round(result.prsquared, 4) == round(mcfaddens_r2(y, result.predict(x)),4)
    True
    """
    true_y = np.array(true_y)
    predicted_proba = np.array(predicted_proba)
    null_prob = null_model_probability(true_y)

    model_log_likelihood = log_likelihood(true_y, predicted_proba)
    null_log_likelihood = log_likelihood(true_y, null_prob)

    return 1 - (model_log_likelihood / null_log_likelihood)

def cox_snell_r2(true_y, predicted_proba):
    """
    calculates the cox and snell r2 which is:

    r2 = 1 - (null_model_log_like/model_log_like)^2/n

    Parameters
    ----------
    log_likelihood : float
                     the loglikelihood caclulated for the fitted model
    null_log_likelihood : float
                          the loglikelihood calculated for the intercept model
    n : int
        the number of observations used to fit the model

    Returns
    -------
    cox_and_snell_R2 : float
                       the calculated C&S pseudo-R2

    References
    ----------
    [1] https://statisticalhorizons.com/r2logistic

    # unit tests
    """
    raise ImplementationError('this is broken')
    true_y = np.array(true_y)
    n = true_y.shape[0]
    predicted_proba = np.array(predicted_proba)
    null_prob = null_model_probability(true_y)

    model_log_likelihood = log_likelihood(true_y, predicted_proba)
    null_log_likelihood = log_likelihood(true_y, null_prob)

    cox_snell_r2 = _cox_snell_r2(model_log_likelihood, null_log_likelihood, n)

    return cox_snell_r2

def zscale_dataframe(df, columns_to_scale):
    """
    Scales specified columns in dataframe

    Parameters
    ----------
    df : pandas.DataFrame
         dataframe of data for scaling
    columns_to_scale : list
                       list of columns that should be scaled using the standard score

    Returns
    -------
    transformed_df : pandas.DataFrame
                    dataframe of scaled columns along with non-scaled columns
    """
    
    df_to_scale = df[columns_to_scale].copy()
    df_to_not_scale = df.drop(columns_to_scale, axis=1).copy()
    
    ss = StandardScaler()
    ss.fit(df_to_scale)
    ss_data = ss.transform(df_to_scale)
    
    transformed_df = pd.DataFrame(ss_data, index=df_to_scale.index, columns=df_to_scale.columns)
    transformed_df = transformed_df.join(df_to_not_scale)
    return transformed_df

def calculate_covariance_matrix(model, train_x):
    """
    returns the covariance matrix of the fitted model using
    the inverse of the hessian of the log-likelihood

    Parameters
    ----------
    model : sklearn.model
            sklearn like model. must have predict_proba method
    train_x : pandas.DataFrame
              training data that was used to train the model

    Returns
    -------
    variance_covariance_matrix : numpy.array
                                 the covariance matrix of the model feature interactions

    
    References
    ----------
    [1] https://stackoverflow.com/questions/25122999/scikit-learn-how-to-check-coefficients-significance
    """
    predicted_probability = model.predict_proba(train_x)
    training_length = train_x.shape[0]
    coefficients_length = train_x.shape[1] + 1
    
    train_x_with_ones =  np.matrix(np.insert(np.array(train_x), 0, 1, axis = 1))
    zeros = np.zeros((coefficients_length, coefficients_length))
    for i in range(training_length):
        zeros = zeros + np.dot(np.transpose(train_x_with_ones[i, :]), train_x_with_ones[i,:]) * predicted_probability[i,1] * predicted_probability[i,0]
    
    variance_covariance_matrix = np.linalg.inv(np.matrix(zeros))
    
    return variance_covariance_matrix

def calculuate_logit_t_statistic(variance_covariance_matrix, model):
    """
    Calculates the t_statistic of a sklearn model since sklearn
    focuses on prediction and not fittedness

    Parameters
    ----------
    variance_covariance_matrix : numpy.array
                                 the covariance matrix of the model feature interactions
    model : sklearn.model
            sklearn like model. must have predict_proba method

    Returns
    -------
    t_statistic : numpy.array
                  the calculated t-statistics for each model coefficient

    """
    standard_error = np.sqrt(np.diag(variance_covariance_matrix))
    coefficients = np.concatenate([model.intercept_, model.coef_[0]])
    t_statistic = coefficients/standard_error
    return t_statistic

def calculate_logit_pvalue(t_statistics):
    """
    Calculates the coefficient p-value for a sklearn model

    Parameters
    ----------
    t_statistic : numpy.array
                  the calculated t-statistics for each model coefficient

    Returns
    -------
    p : numpy.array
        an array of p-values for the inputted t statistics
    """
    p = (1 - norm.cdf(abs(t_statistics))) * 2
    return p

def fit_and_score_model(model, data : list, metrics : dict, **kwargs):
    """
    Fits any sklearn classification model and produces
    metric scores from metrics input.

    The kwargs should be for the model hyperparameters.

    Parameters
    ----------
    model : sklearn.model
            sklearn model to fit/test
    data : pandas.DataFrame
           list of pandas data frames in the order as
           sklearn.model_selection.train_test_split
    metrics : dict
              dict of model assessment metrics. keys are 
              only allowed to be the exact metric function
              name.
    **kwargs : dict
               any arguments for the model hyperparameters

    Returns
    -------
    scores : dict
             scores for model for the input metrics
    """
    train_x, test_x, train_y, test_y = data
    model = model(**kwargs)
    model.fit(train_x, train_y)
    predicted = model.predict(test_x)
    predicted_proba = model.predict_proba(test_x)
    rocdf = pd.DataFrame({'test_true':test_y})
    rocdf['test_pred'] = predicted
    fpr, tpr, thresholds = roc_curve(rocdf['test_true'], predicted_proba[:,1])
    
    scores = {}
    for key in metrics:
        if key in ('roc_auc_score', 'log_likelihood', 'cox_snell_r2', 'mcfaddens_r2'):
            # print('predicted probability test')
            scores[key] = metrics[key](rocdf['test_true'], predicted_proba[:,1])
        elif key in ('f1_score', 'accuracy_score', 'confusion_matrix'):
            # print('predicted value test')
            scores[key] = metrics[key](test_y, predicted)
        else:
            raise ValueError('{k} is not an accepted metric'.format(k=key))
    return scores

if __name__ == '__main__':
    import doctest
    doctest.testmod()

Functions

def calculate_covariance_matrix(model, train_x)

returns the covariance matrix of the fitted model using the inverse of the hessian of the log-likelihood

Parameters

model : sklearn.model: sklearn like model. must have predict_proba method
train_x : pandas.DataFrame: training data that was used to train the model

Returns

variance_covariance_matrix : numpy.array: the covariance matrix of the model feature interactions

References

[1] https://stackoverflow.com/questions/25122999/scikit-learn-how-to-check-coefficients-significance

Source code

def calculate_covariance_matrix(model, train_x):
    """
    returns the covariance matrix of the fitted model using
    the inverse of the hessian of the log-likelihood

    Parameters
    ----------
    model : sklearn.model
            sklearn like model. must have predict_proba method
    train_x : pandas.DataFrame
              training data that was used to train the model

    Returns
    -------
    variance_covariance_matrix : numpy.array
                                 the covariance matrix of the model feature interactions

    
    References
    ----------
    [1] https://stackoverflow.com/questions/25122999/scikit-learn-how-to-check-coefficients-significance
    """
    predicted_probability = model.predict_proba(train_x)
    training_length = train_x.shape[0]
    coefficients_length = train_x.shape[1] + 1
    
    train_x_with_ones =  np.matrix(np.insert(np.array(train_x), 0, 1, axis = 1))
    zeros = np.zeros((coefficients_length, coefficients_length))
    for i in range(training_length):
        zeros = zeros + np.dot(np.transpose(train_x_with_ones[i, :]), train_x_with_ones[i,:]) * predicted_probability[i,1] * predicted_probability[i,0]
    
    variance_covariance_matrix = np.linalg.inv(np.matrix(zeros))
    
    return variance_covariance_matrix

def calculate_logit_pvalue(t_statistics)

Calculates the coefficient p-value for a sklearn model

Parameters

t_statistic : numpy.array: the calculated t-statistics for each model coefficient

Returns

p : numpy.array: an array of p-values for the inputted t statistics

Source code

def calculate_logit_pvalue(t_statistics):
    """
    Calculates the coefficient p-value for a sklearn model

    Parameters
    ----------
    t_statistic : numpy.array
                  the calculated t-statistics for each model coefficient

    Returns
    -------
    p : numpy.array
        an array of p-values for the inputted t statistics
    """
    p = (1 - norm.cdf(abs(t_statistics))) * 2
    return p

def calculuate_logit_t_statistic(variance_covariance_matrix, model)

Calculates the t_statistic of a sklearn model since sklearn focuses on prediction and not fittedness

Parameters

variance_covariance_matrix : numpy.array: the covariance matrix of the model feature interactions
model : sklearn.model: sklearn like model. must have predict_proba method

Returns

t_statistic : numpy.array: the calculated t-statistics for each model coefficient

Source code

def calculuate_logit_t_statistic(variance_covariance_matrix, model):
    """
    Calculates the t_statistic of a sklearn model since sklearn
    focuses on prediction and not fittedness

    Parameters
    ----------
    variance_covariance_matrix : numpy.array
                                 the covariance matrix of the model feature interactions
    model : sklearn.model
            sklearn like model. must have predict_proba method

    Returns
    -------
    t_statistic : numpy.array
                  the calculated t-statistics for each model coefficient

    """
    standard_error = np.sqrt(np.diag(variance_covariance_matrix))
    coefficients = np.concatenate([model.intercept_, model.coef_[0]])
    t_statistic = coefficients/standard_error
    return t_statistic

def cox_snell_r2(true_y, predicted_proba)

calculates the cox and snell r2 which is:

r2 = 1 - (null_model_log_like/model_log_like)^2/n

Parameters

log_likelihood() : float: the loglikelihood caclulated for the fitted model
null_log_likelihood : float: the loglikelihood calculated for the intercept model
n : int: the number of observations used to fit the model

Returns

cox_and_snell_R2 : float: the calculated C&S pseudo-R2

References

[1] https://statisticalhorizons.com/r2logistic

unit tests

Source code

def cox_snell_r2(true_y, predicted_proba):
    """
    calculates the cox and snell r2 which is:

    r2 = 1 - (null_model_log_like/model_log_like)^2/n

    Parameters
    ----------
    log_likelihood : float
                     the loglikelihood caclulated for the fitted model
    null_log_likelihood : float
                          the loglikelihood calculated for the intercept model
    n : int
        the number of observations used to fit the model

    Returns
    -------
    cox_and_snell_R2 : float
                       the calculated C&S pseudo-R2

    References
    ----------
    [1] https://statisticalhorizons.com/r2logistic

    # unit tests
    """
    raise ImplementationError('this is broken')
    true_y = np.array(true_y)
    n = true_y.shape[0]
    predicted_proba = np.array(predicted_proba)
    null_prob = null_model_probability(true_y)

    model_log_likelihood = log_likelihood(true_y, predicted_proba)
    null_log_likelihood = log_likelihood(true_y, null_prob)

    cox_snell_r2 = _cox_snell_r2(model_log_likelihood, null_log_likelihood, n)

    return cox_snell_r2

def fit_and_score_model(model, data, metrics, **kwargs)

Fits any sklearn classification model and produces metric scores from metrics input.

The kwargs should be for the model hyperparameters.

Parameters

model : sklearn.model: sklearn model to fit/test
data : pandas.DataFrame: list of pandas data frames in the order as sklearn.model_selection.train_test_split
metrics : dict: dict of model assessment metrics. keys are only allowed to be the exact metric function name.
**kwargs : dict: any arguments for the model hyperparameters

Returns

scores : dict: scores for model for the input metrics

Source code

def fit_and_score_model(model, data : list, metrics : dict, **kwargs):
    """
    Fits any sklearn classification model and produces
    metric scores from metrics input.

    The kwargs should be for the model hyperparameters.

    Parameters
    ----------
    model : sklearn.model
            sklearn model to fit/test
    data : pandas.DataFrame
           list of pandas data frames in the order as
           sklearn.model_selection.train_test_split
    metrics : dict
              dict of model assessment metrics. keys are 
              only allowed to be the exact metric function
              name.
    **kwargs : dict
               any arguments for the model hyperparameters

    Returns
    -------
    scores : dict
             scores for model for the input metrics
    """
    train_x, test_x, train_y, test_y = data
    model = model(**kwargs)
    model.fit(train_x, train_y)
    predicted = model.predict(test_x)
    predicted_proba = model.predict_proba(test_x)
    rocdf = pd.DataFrame({'test_true':test_y})
    rocdf['test_pred'] = predicted
    fpr, tpr, thresholds = roc_curve(rocdf['test_true'], predicted_proba[:,1])
    
    scores = {}
    for key in metrics:
        if key in ('roc_auc_score', 'log_likelihood', 'cox_snell_r2', 'mcfaddens_r2'):
            # print('predicted probability test')
            scores[key] = metrics[key](rocdf['test_true'], predicted_proba[:,1])
        elif key in ('f1_score', 'accuracy_score', 'confusion_matrix'):
            # print('predicted value test')
            scores[key] = metrics[key](test_y, predicted)
        else:
            raise ValueError('{k} is not an accepted metric'.format(k=key))
    return scores

def log_likelihood(true_y, probability_y)

calculates the log likelihood using the log loss function as described here:

Parameters

true_y : np.array: an array of true observations
probability_y : np.array: an array of predicted probabilities from a model

Returns

log_likelihood() : float: the calculated loglikelihood

References

[1] https://stackoverflow.com/questions/48185090/how-to-get-the-log-likelihood-for-a-logistic-regression-model-in-sklearn

unit tests for log_likelihood function

>>> import numpy as np
>>> y_true = np.array([0, 1, 1])
>>> y_pred = np.array([0.1, 0.2, 0.9])
>>> from sklearn.metrics import log_loss
>>> lloss = log_loss(y_true, y_pred)
>>> log_likelihood_elements = y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred)
>>> lloss
0.6067196479165843
>>> lloss == -np.sum(log_likelihood_elements)/len(y_true)
**`True`**

Source code

def log_likelihood(true_y, probability_y):
    """
    calculates the log likelihood using the log loss function as described here:
    
    
    Parameters
    ----------
    true_y : np.array
             an array of true observations
    probability_y : np.array
                    an array of predicted probabilities from a model
    Returns
    -------
    log_likelihood : float
                     the calculated loglikelihood

    References
    ----------
    [1] https://stackoverflow.com/questions/48185090/how-to-get-the-log-likelihood-for-a-logistic-regression-model-in-sklearn

    # unit tests for log_likelihood function
    >>> import numpy as np
    >>> y_true = np.array([0, 1, 1])
    >>> y_pred = np.array([0.1, 0.2, 0.9])
    >>> from sklearn.metrics import log_loss
    >>> lloss = log_loss(y_true, y_pred)
    >>> log_likelihood_elements = y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred)
    >>> lloss
    0.6067196479165843
    >>> lloss == -np.sum(log_likelihood_elements)/len(y_true)
    True
    """
    return - log_loss(true_y, probability_y, normalize=False)

def mcfaddens_r2(true_y, predicted_proba)

Calculates the mcfaddens pseudo R2

R2_mcf = 1 - loglikelihood/null_logllikelihood

Parameters

true_y : np.array: an array of true observations
probability_y : np.array: an array of predicted probabilities from a model

Returns

mcfaddens_r2() : float: the calculated loglikelihood

References

[1] https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-what-are-pseudo-r-squareds/

unit tests

>>> import statsmodels.formula.api as sm
>>> y = [1, 1, 1, 0, 0, 0]
>>> x = [1, 2, 3, 4, 5, 6]
>>> model = sm.Logit(y, x)
>>> result = model.fit(disp=0)
>>> round(result.prsquared, 4) == round(mcfaddens_r2(y, result.predict(x)),4)
**`True`**

Source code

def mcfaddens_r2(true_y, predicted_proba):
    """
    Calculates the mcfaddens pseudo R2

    R2_mcf = 1 - loglikelihood/null_logllikelihood
    
    Parameters
    ----------
    true_y : np.array
             an array of true observations
    probability_y : np.array
                    an array of predicted probabilities from a model
    Returns
    -------
    mcfaddens_r2 : float
                     the calculated loglikelihood

    References
    ----------
    [1] https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-what-are-pseudo-r-squareds/

    # unit tests
    >>> import statsmodels.formula.api as sm
    >>> y = [1, 1, 1, 0, 0, 0]
    >>> x = [1, 2, 3, 4, 5, 6]
    >>> model = sm.Logit(y, x)
    >>> result = model.fit(disp=0)
    >>> round(result.prsquared, 4) == round(mcfaddens_r2(y, result.predict(x)),4)
    True
    """
    true_y = np.array(true_y)
    predicted_proba = np.array(predicted_proba)
    null_prob = null_model_probability(true_y)

    model_log_likelihood = log_likelihood(true_y, predicted_proba)
    null_log_likelihood = log_likelihood(true_y, null_prob)

    return 1 - (model_log_likelihood / null_log_likelihood)

def null_model_probability(true_y)

Calculates the probability of class belonging assuming all of the model variables are null.

Parameters

true_y : numpy.array: an array of true values

Returns

null_model_probability() : numpy.array: the intercept value calculated for each data point

Source code

def null_model_probability(true_y):
    """
    Calculates the probability of class belonging assuming
    all of the model variables are null.

    Parameters
    ----------
    true_y : numpy.array
             an array of true values

    Returns
    -------
    null_model_probability : numpy.array
                             the intercept value calculated for each data point
    """
    return sum(true_y) / float(true_y.shape[0]) * np.ones(true_y.shape)

def zscale_dataframe(df, columns_to_scale)

Scales specified columns in dataframe

Parameters

df : pandas.DataFrame: dataframe of data for scaling
columns_to_scale : list: list of columns that should be scaled using the standard score

Returns

transformed_df : pandas.DataFrame: dataframe of scaled columns along with non-scaled columns

Source code

def zscale_dataframe(df, columns_to_scale):
    """
    Scales specified columns in dataframe

    Parameters
    ----------
    df : pandas.DataFrame
         dataframe of data for scaling
    columns_to_scale : list
                       list of columns that should be scaled using the standard score

    Returns
    -------
    transformed_df : pandas.DataFrame
                    dataframe of scaled columns along with non-scaled columns
    """
    
    df_to_scale = df[columns_to_scale].copy()
    df_to_not_scale = df.drop(columns_to_scale, axis=1).copy()
    
    ss = StandardScaler()
    ss.fit(df_to_scale)
    ss_data = ss.transform(df_to_scale)
    
    transformed_df = pd.DataFrame(ss_data, index=df_to_scale.index, columns=df_to_scale.columns)
    transformed_df = transformed_df.join(df_to_not_scale)
    return transformed_df