Source code for nannyml.sampling_error.multiclass_classification

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#            Jakub Bialek    <jabub@nannyml.com>
#
#  License: Apache Software License 2.0
from typing import List, Tuple

import numpy as np
import pandas as pd


def _standard_deviation_of_variances(components: List[Tuple], data) -> float:
    class_variances = [c[0] / (len(data) * c[1]) for c in components]
    multiclass_std = np.sqrt(np.sum(class_variances)) / len(class_variances)
    return multiclass_std


[docs]def auroc_sampling_error_components(y_true_reference: List[pd.Series], y_pred_proba_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_proba_reference`` lists represent the binarized target values and model
    probabilities. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_proba_reference: List[pd.Series]
        Prediction probability values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
    """

    def _get_class_components(y_true, y_pred_proba):
        if np.mean(y_true) > 0.5:
            y_true = abs(np.asarray(y_true) - 1)
            y_pred_proba = 1 - y_pred_proba

        sorted_idx = np.argsort(y_pred_proba)
        y_pred_proba = y_pred_proba[sorted_idx]
        y_true = y_true[sorted_idx]
        rank_order = np.asarray(range(len(y_pred_proba)))
        positive_ranks = y_true * rank_order
        indexes = np.unique(positive_ranks)[1:]
        ser = []
        for i, index in enumerate(indexes):
            ser.append(index - i)

        n_pos = np.sum(y_true)
        n_neg = len(y_true) - n_pos
        ser_multi = ser / n_neg
        fraction = n_pos / len(y_true)

        return np.var(ser_multi), fraction

    # classes = sorted(y_pred_proba_reference.keys())
    # binarized_y_true_reference_list = list(label_binarize(y_true_reference, classes=classes).T)
    # y_pred_proba_reference_list = [y_pred_proba_reference[clz] for clz in classes]

    class_components = []
    for y_true_class, y_pred_proba_class in zip(y_true_reference, y_pred_proba_reference):
        class_components.append(_get_class_components(y_true_class, y_pred_proba_class))

    return class_components


[docs]def auroc_sampling_error(sampling_error_components, data) -> float:
    """
    Calculate the AUROC sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    class_variances = [c[0] / (len(data) * c[1]) for c in sampling_error_components]
    multiclass_std = np.sqrt(np.sum(class_variances)) / len(class_variances) * 1.2
    return multiclass_std


[docs]def f1_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_proba_reference`` lists represent the binarized target values and model
    probabilities. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
    """

    def _get_class_components(y_true, y_pred):
        TP = np.where((y_true == y_pred) & (y_pred == 1), 1, np.nan)
        FP = np.where((y_true != y_pred) & (y_pred == 1), 0, np.nan)
        FN = np.where((y_true != y_pred) & (y_pred == 0), 0, np.nan)

        TP = TP[~np.isnan(TP)]
        FN = FN[~np.isnan(FN)]
        FP = FP[~np.isnan(FP)]

        obs_level_f1 = np.concatenate([TP, FN, FP])
        fraction_of_relevant = len(obs_level_f1) / len(y_pred)

        return np.var(obs_level_f1), fraction_of_relevant

    class_components = []
    for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference):
        class_components.append(_get_class_components(y_true_class, y_pred_class))

    return class_components


[docs]def f1_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """
    Calculate the F1 sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    return _standard_deviation_of_variances(sampling_error_components, data)


[docs]def precision_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_proba_reference`` lists represent the binarized target values and model
    probabilities. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
    """

    def _get_class_components(y_true, y_pred):
        TP = np.where((y_true == y_pred) & (y_pred == 1), 1, np.nan)
        FP = np.where((y_true != y_pred) & (y_pred == 1), 0, np.nan)

        TP = TP[~np.isnan(TP)]
        FP = FP[~np.isnan(FP)]
        obs_level_precision = np.concatenate([TP, FP])
        amount_positive_pred = np.sum(y_pred)
        fraction_of_pos_pred = amount_positive_pred / len(y_pred)

        return np.var(obs_level_precision), fraction_of_pos_pred

    class_components = []
    for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference):
        class_components.append(_get_class_components(y_true_class, y_pred_class))

    return class_components


[docs]def precision_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """
    Calculate the precision sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    return _standard_deviation_of_variances(sampling_error_components, data)


[docs]def recall_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_proba_reference`` lists represent the binarized target values and model
    probabilities. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
    """

    def _get_class_components(y_true, y_pred):
        TP = np.where((y_true == y_pred) & (y_pred == 1), 1, np.nan)
        FN = np.where((y_true != y_pred) & (y_pred == 0), 0, np.nan)

        TP = TP[~np.isnan(TP)]
        FN = FN[~np.isnan(FN)]
        obs_level_recall = np.concatenate([TP, FN])
        fraction_of_relevant = sum(y_true) / len(y_pred)

        return np.var(obs_level_recall), fraction_of_relevant

    class_components = []
    for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference):
        class_components.append(_get_class_components(y_true_class, y_pred_class))

    return class_components


[docs]def recall_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """
    Calculate the recall sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    return _standard_deviation_of_variances(sampling_error_components, data)


[docs]def specificity_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_proba_reference`` lists represent the binarized target values and model
    probabilities. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
    """

    def _get_class_components(y_true, y_pred):
        TN = np.where((y_true == y_pred) & (y_pred == 0), 1, np.nan)
        FP = np.where((y_true != y_pred) & (y_pred == 1), 0, np.nan)

        TN = TN[~np.isnan(TN)]
        FP = FP[~np.isnan(FP)]
        obs_level_specificity = np.concatenate([TN, FP])
        fraction_of_relevant = len(obs_level_specificity) / len(y_pred)

        return np.var(obs_level_specificity), fraction_of_relevant

    class_components = []
    for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference):
        class_components.append(_get_class_components(y_true_class, y_pred_class))

    return class_components


[docs]def specificity_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """
    Calculate the specificity sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    return _standard_deviation_of_variances(sampling_error_components, data)


[docs]def accuracy_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_proba_reference`` lists represent the binarized target values and model
    probabilities. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: Tuple
    """
    y_true = np.asarray(y_true_reference).astype(int)
    y_pred = np.asarray(y_pred_reference).astype(int)
    correct_table = (y_true == y_pred).all(axis=1).astype(int)

    return (np.std(correct_table),)


[docs]def accuracy_sampling_error(sampling_error_components: Tuple, data) -> float:
    """
    Calculate the accuracy sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    return sampling_error_components[0] / np.sqrt(len(data))