# Source code for nannyml.sampling_error.multiclass_classification

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#            Jakub Bialek    <jabub@nannyml.com>
#
#  License: Apache Software License 2.0

"""Module containing functions to estimate sampling error for multiclass classification metrics."""

from typing import List, Tuple, Union, Optional

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, average_precision_score


# Number of resampling rounds performed when a sampling error is approximated by resampling
# (the average precision components in this module loop over ``range(N_EXPERIMENTS)``).
N_EXPERIMENTS = 50
# Upper bound on the size of a single resample - we don't need the full reference set if it is very large.
MAX_RESAMPLE_SIZE = 50_000


def _standard_deviation_of_variances(components: List[Tuple], data) -> float:
    class_variances = [c[0] / (len(data) * c[1]) for c in components]
    multiclass_std = np.sqrt(np.sum(class_variances)) / len(class_variances)
    return multiclass_std


def auroc_sampling_error_components(y_true_reference: List[pd.Series], y_pred_proba_reference: List[pd.Series]):
    """Calculate sampling error components for AUROC using reference data.

    The ``y_true_reference`` and ``y_pred_proba_reference`` lists represent the binarized target values and model
    probabilities. The order of the entries in both lists should match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Binarized (0/1) target values for the reference dataset, one entry per class.
    y_pred_proba_reference: List[pd.Series]
        Prediction probability values for the reference dataset, one entry per class.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, fraction)`` tuple per class, in input order. ``fraction`` is the share of rows
        belonging to the (minority) positive class.
    """

    def _get_class_components(y_true, y_pred_proba):
        # Work on plain arrays so all indexing below is positional, whatever index a Series carries.
        y_true = np.asarray(y_true)
        y_pred_proba = np.asarray(y_pred_proba)

        # Always treat the minority class as the positive one.
        if np.mean(y_true) > 0.5:
            y_true = abs(y_true - 1)
            y_pred_proba = 1 - y_pred_proba

        sorted_idx = np.argsort(y_pred_proba)
        y_true = y_true[sorted_idx]

        # Sorted positions of the positives. Subtracting the number of positives that precede each one
        # leaves the number of negatives ranked below it. Using the positions directly (instead of
        # de-duplicating ``y_true * rank``) keeps a positive that happens to sit at rank 0.
        positive_positions = np.flatnonzero(y_true)
        negatives_below = positive_positions - np.arange(len(positive_positions))

        n_pos = np.sum(y_true)
        n_neg = len(y_true) - n_pos
        ser_multi = negatives_below / n_neg
        fraction = n_pos / len(y_true)

        return np.var(ser_multi), fraction

    return [
        _get_class_components(y_true_class, y_pred_proba_class)
        for y_true_class, y_pred_proba_class in zip(y_true_reference, y_pred_proba_reference)
    ]


def auroc_sampling_error(sampling_error_components, data) -> float:
    """Calculate the AUROC sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components:
        a set of parameters that were derived from reference data: one ``(variance, fraction)`` entry per class.
    data:
        the (analysis) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float

    """
    n_rows = len(data)
    class_variances = [c[0] / (n_rows * c[1]) for c in sampling_error_components]
    # Experiments showed that std of class variances underestimated sampling error by 20% so we manually adjust result
    multiclass_std = np.sqrt(np.sum(class_variances)) / len(class_variances) * 1.2
    return multiclass_std


def f1_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """Calculate sampling error components for F1 using reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists represent the binarized target values and the
    binarized predictions. The order of the entries in both lists should match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Binarized (0/1) target values for the reference dataset, one entry per class.
    y_pred_reference: List[pd.Series]
        Binarized (0/1) prediction values for the reference dataset, one entry per class.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, fraction)`` tuple per class, in input order. ``fraction`` is the share of rows that
        are a true positive, false positive or false negative.
    """

    def _get_class_components(y_true, y_pred):
        # Positional arrays: element-wise comparison regardless of the container type or index.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        correct = y_true == y_pred

        n_tp = np.count_nonzero(correct & (y_pred == 1))
        n_fp = np.count_nonzero(~correct & (y_pred == 1))
        n_fn = np.count_nonzero(~correct & (y_pred == 0))

        # Observation-level outcomes: 1 for every true positive, 0 for every false negative / false positive.
        obs_level_f1 = np.concatenate([np.ones(n_tp), np.zeros(n_fn + n_fp)])
        fraction_of_relevant = len(obs_level_f1) / len(y_pred)

        return np.var(obs_level_f1), fraction_of_relevant

    return [
        _get_class_components(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]


def f1_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """Calculate the F1 sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components:
        a set of parameters that were derived from reference data: one ``(variance, fraction)`` entry per class.
    data:
        the (analysis) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float

    """
    return _standard_deviation_of_variances(sampling_error_components, data)


def precision_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """Calculate sampling error components for precision using reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists represent the binarized target values and the
    binarized predictions. The order of the entries in both lists should match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Binarized (0/1) target values for the reference dataset, one entry per class.
    y_pred_reference: List[pd.Series]
        Binarized (0/1) prediction values for the reference dataset, one entry per class.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, fraction)`` tuple per class, in input order. ``fraction`` is the share of rows that
        were predicted positive.
    """

    def _get_class_components(y_true, y_pred):
        # Positional arrays: element-wise comparison regardless of the container type or index.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        predicted_positive = y_pred == 1

        n_tp = np.count_nonzero((y_true == y_pred) & predicted_positive)
        n_fp = np.count_nonzero((y_true != y_pred) & predicted_positive)

        # Observation-level outcomes: 1 for every true positive, 0 for every false positive.
        obs_level_precision = np.concatenate([np.ones(n_tp), np.zeros(n_fp)])
        fraction_of_pos_pred = np.sum(y_pred) / len(y_pred)

        return np.var(obs_level_precision), fraction_of_pos_pred

    return [
        _get_class_components(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]


def precision_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """Calculate the precision sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components:
        a set of parameters that were derived from reference data: one ``(variance, fraction)`` entry per class.
    data:
        the (analysis) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float

    """
    return _standard_deviation_of_variances(sampling_error_components, data)


def recall_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """Calculate sampling error components for recall using reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists represent the binarized target values and the
    binarized predictions. The order of the entries in both lists should match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Binarized (0/1) target values for the reference dataset, one entry per class.
    y_pred_reference: List[pd.Series]
        Binarized (0/1) prediction values for the reference dataset, one entry per class.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, fraction)`` tuple per class, in input order. ``fraction`` is the share of rows whose
        target is positive.
    """

    def _get_class_components(y_true, y_pred):
        # Positional arrays: element-wise comparison regardless of the container type or index.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)

        n_tp = np.count_nonzero((y_true == y_pred) & (y_pred == 1))
        n_fn = np.count_nonzero((y_true != y_pred) & (y_pred == 0))

        # Observation-level outcomes: 1 for every true positive, 0 for every false negative.
        obs_level_recall = np.concatenate([np.ones(n_tp), np.zeros(n_fn)])
        # np.sum instead of the builtin sum: same value without a Python-level loop over the array.
        fraction_of_relevant = np.sum(y_true) / len(y_pred)

        return np.var(obs_level_recall), fraction_of_relevant

    return [
        _get_class_components(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]


def recall_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """Calculate the recall sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components:
        a set of parameters that were derived from reference data: one ``(variance, fraction)`` entry per class.
    data:
        the (analysis) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float

    """
    return _standard_deviation_of_variances(sampling_error_components, data)


def specificity_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """Calculate sampling error components for specificity using reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists represent the binarized target values and the
    binarized predictions. The order of the entries in both lists should match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Binarized (0/1) target values for the reference dataset, one entry per class.
    y_pred_reference: List[pd.Series]
        Binarized (0/1) prediction values for the reference dataset, one entry per class.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, fraction)`` tuple per class, in input order. ``fraction`` is the share of rows that
        are a true negative or a false positive.
    """

    def _get_class_components(y_true, y_pred):
        # Positional arrays: element-wise comparison regardless of the container type or index.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)

        n_tn = np.count_nonzero((y_true == y_pred) & (y_pred == 0))
        n_fp = np.count_nonzero((y_true != y_pred) & (y_pred == 1))

        # Observation-level outcomes: 1 for every true negative, 0 for every false positive.
        obs_level_specificity = np.concatenate([np.ones(n_tn), np.zeros(n_fp)])
        fraction_of_relevant = len(obs_level_specificity) / len(y_pred)

        return np.var(obs_level_specificity), fraction_of_relevant

    return [
        _get_class_components(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]


def specificity_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """Calculate the specificity sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components:
        a set of parameters that were derived from reference data: one ``(variance, fraction)`` entry per class.
    data:
        the (analysis) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float

    """
    return _standard_deviation_of_variances(sampling_error_components, data)


def accuracy_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """Calculate sampling error components for accuracy using reference data.

    Both inputs are converted to 2-D integer arrays holding the binarized target values and the binarized
    predictions. A row counts as correct only when all of its entries match.

    NOTE(review): the reduction runs over ``axis=1``, i.e. every row of the converted array is treated as one
    observation. That fits an ``(n_observations, n_classes)`` layout rather than the list of per-class series
    the annotations suggest - confirm against the callers before changing either.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Binarized target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Binarized prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: Tuple
        A one-element tuple holding the standard deviation of the per-row correctness indicator.
    """
    y_true = np.asarray(y_true_reference).astype(int)
    y_pred = np.asarray(y_pred_reference).astype(int)
    correct_table = (y_true == y_pred).all(axis=1).astype(int)

    return (np.std(correct_table),)


def accuracy_sampling_error(sampling_error_components: Tuple, data) -> float:
    """Calculate the accuracy sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components:
        a set of parameters that were derived from reference data. The first element is used as the
        reference standard deviation.
    data:
        the (analysis) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float
        The reference standard deviation divided by the square root of the number of rows in ``data``.

    """
    reference_std = sampling_error_components[0]
    return reference_std / np.sqrt(len(data))


def multiclass_confusion_matrix_sampling_error_components(
    y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series], normalize_confusion_matrix: Union[str, None]
):
    """Calculate sampling error components for CM using reference data.

    Parameters
    ----------
    y_true_reference:
        Target values for the reference dataset, passed as-is to ``confusion_matrix``.
    y_pred_reference:
        Prediction values for the reference dataset, passed as-is to ``confusion_matrix``.
    normalize_confusion_matrix: Union[str, None]
        ``'true'`` normalizes per true class (row), ``'pred'`` per predicted class (column), ``'all'`` over
        all observations. Any other value (including ``None``) is treated as "no normalization".

    Returns
    -------
    sampling_error_components: Tuple
        ``(stds, relevant_proportions)``. ``stds`` has the shape of the confusion matrix and holds, per cell,
        the standard deviation of the 0/1 indicator "observation falls in this cell" within its normalization
        group. ``relevant_proportions`` is the share of observations in each group (``1`` for ``'all'``,
        ``None`` without normalization). Cells whose group is empty yield ``nan``.
    """
    cm = confusion_matrix(y_true_reference, y_pred_reference)

    true_marginal = cm.sum(axis=1)[:, None]
    pred_marginal = cm.sum(axis=0)[None, :]

    num_observations = len(y_true_reference)

    # ``group_sizes`` broadcasts against ``cm``: per row for 'true', per column for 'pred', scalar otherwise.
    if normalize_confusion_matrix == 'true':
        group_sizes = true_marginal
        relevant_proportions = true_marginal / num_observations
    elif normalize_confusion_matrix == 'pred':
        group_sizes = pred_marginal
        relevant_proportions = pred_marginal / num_observations
    elif normalize_confusion_matrix == 'all':
        group_sizes = num_observations
        relevant_proportions = 1
    else:
        group_sizes = num_observations
        relevant_proportions = None

    # The standard deviation of a 0/1 array with ``k`` ones out of ``n`` entries is sqrt(p * (1 - p)) with
    # p = k / n, so there is no need to materialize one observation-level array per cell.
    cell_rates = cm / group_sizes
    stds = np.sqrt(cell_rates * (1 - cell_rates))

    return stds, relevant_proportions


def multiclass_confusion_matrix_sampling_error(sampling_error_components: Tuple, data):
    """Calculate the CM sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components: Tuple
        ``(reference_stds, relevant_proportions)`` derived from reference data. ``relevant_proportions`` is
        ``None`` when the confusion matrix is not normalized.
    data:
        the (analysis) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    standard_errors:
        Per-cell standard errors with the same shape as ``reference_stds``.
    """
    reference_stds, relevant_proportions = sampling_error_components
    n_rows = len(data)

    if relevant_proportions is None:
        # Not normalized: cells are counts, so the standard error of the mean is scaled back up by the row count.
        standard_errors = (reference_stds / np.sqrt(n_rows)) * n_rows
    else:
        # Normalized: each cell is a rate over the rows expected to fall in its normalization group.
        standard_errors = reference_stds / np.sqrt(n_rows * relevant_proportions)

    return standard_errors


def average_precision_sampling_error_components(
    y_true_reference: List[np.ndarray], y_pred_proba_reference: List[pd.Series]
):
    """Calculate sampling error components for AP using reference data.

    The ``y_true_reference`` and ``y_pred_proba_reference`` lists represent the binarized target values and model
    probabilities. The order of the entries in both lists should match the list of class labels present.

    The components are obtained by resampling: for every class, ``N_EXPERIMENTS`` samples are drawn with
    replacement via ``np.random.choice`` and the average precision is computed on each of them. The result
    therefore depends on the state of NumPy's global random generator.

    Parameters
    ----------
    y_true_reference: List[np.ndarray]
        Binarized target values for the reference dataset, one entry per class.
    y_pred_proba_reference: List[pd.Series]
        Prediction probability values for the reference dataset, one entry per class.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, sample_size)`` tuple per class, in input order: the variance of the resampled
        average precision scores and the size of the samples they were computed on.
    """

    def _get_class_components(y_true, y_pred_proba):
        # Plain arrays so that the resampled indexes below are always applied positionally.
        y_true = np.asarray(y_true)
        y_pred_proba = np.asarray(y_pred_proba)

        n_rows = y_true.shape[0]
        # Half of the reference rows, capped so very large reference sets stay cheap to resample.
        sample_size = np.minimum(n_rows // 2, MAX_RESAMPLE_SIZE)

        ap_results = []
        for _ in range(N_EXPERIMENTS):
            sample_idx = np.random.choice(n_rows, sample_size, replace=True)
            ap_results.append(average_precision_score(y_true[sample_idx], y_pred_proba[sample_idx]))
        return np.var(ap_results), sample_size

    return [
        _get_class_components(y_true_class, y_pred_proba_class)
        for y_true_class, y_pred_proba_class in zip(y_true_reference, y_pred_proba_reference)
    ]


def average_precision_sampling_error(sampling_error_components, data) -> float:
    """Calculate the average precision sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components:
        a set of parameters that were derived from reference data: one ``(variance, sample_size)`` entry
        per class.
    data:
        the (chunk) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float
    """
    n_rows = len(data)
    # Rescale each variance from the size of the sample it was measured on to the size of this chunk.
    class_variances = [c[0] * c[1] / n_rows for c in sampling_error_components]
    multiclass_std = np.sqrt(np.mean(class_variances))
    return multiclass_std


def _calculate_business_value_per_row(
    row,
    business_value_matrix: np.ndarray,
    classes: List[str],
):
    """Return the business value of a single observation.

    Meant to be passed to ``DataFrame.apply(..., axis=1)``: ``row`` must expose ``y_true`` and ``y_pred``.
    A confusion matrix is built for this one observation and weighted element-wise with
    ``business_value_matrix``; the sum of the weighted matrix is returned.
    """
    observed = np.array([row.y_true])
    predicted = np.array([row.y_pred])
    single_row_cm = confusion_matrix(y_true=observed, y_pred=predicted, labels=classes)
    return (single_row_cm * business_value_matrix).sum()


def business_value_sampling_error_components(
    y_true_reference: pd.Series,
    y_pred_reference: pd.Series,
    business_value_matrix: np.ndarray,
    classes: List[str],
    normalize_business_value: Optional[str],
) -> Tuple[float, Union[str, None]]:
    """Calculate sampling error components for the business value metric using reference data.

    Parameters
    ----------
    y_true_reference: pd.Series
        Target values for the reference dataset.
    y_pred_reference: pd.Series
        Predictions for the reference dataset.
    business_value_matrix: np.ndarray
        A nxn matrix of values for the business problem. Rows are indexed by the true class and columns by
        the predicted class, both in the order given by ``classes``.
    classes: List[str]
        An alphanumerically sorted list of the unique classes in the multiclass problem
    normalize_business_value: Optional[str], default=None
        Determines how the business value will be normalized. Allowed values are None and 'per_prediction'.

    Returns
    -------
    components: tuple
        ``(std, normalize_business_value)`` where ``std`` is the sample standard deviation of the per-row
        business values.

    Raises
    ------
    ValueError
        If a target value is not present in ``classes``.
    """
    data = pd.DataFrame(
        {
            'y_true': y_true_reference,
            'y_pred': y_pred_reference,
        }
    )

    # Position of every label in ``classes``; -1 marks a label that is not listed.
    class_index = pd.Index(classes)
    true_pos = class_index.get_indexer(data['y_true'])
    pred_pos = class_index.get_indexer(data['y_pred'])

    if (true_pos < 0).any():
        raise ValueError("y_true_reference contains values that are not present in classes")

    # A one-observation confusion matrix has a single 1 at (true, pred), so the row's business value is just
    # that cell of the value matrix. Rows whose prediction is not a known class contribute 0.
    value_matrix = np.asarray(business_value_matrix)
    known_pred = pred_pos >= 0
    safe_pred_pos = np.where(known_pred, pred_pos, 0)
    bvs = pd.Series(np.where(known_pred, value_matrix[true_pos, safe_pred_pos], 0))

    return (bvs.std(), normalize_business_value)


def business_value_sampling_error(sampling_error_components: Tuple, data) -> float:
    """Calculate the business value sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components:
        a set of parameters that were derived from reference data: ``(reference_std, norm_type)``.
    data:
        the (chunk) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float
    """
    (reference_std, norm_type) = sampling_error_components
    _size = len(data)

    if norm_type is None:
        # Total (not normalized) business value: the per-row spread scales with the number of rows.
        analysis_std = reference_std * _size
    else:  # norm_type must be 'per_prediction'
        analysis_std = reference_std

    total_value_standard_error = analysis_std / np.sqrt(_size)
    return total_value_standard_error