Source code for nannyml.sampling_error.binary_classification

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#            Jakub Bialek    <jakub@nannyml.com>
#
#  License: Apache Software License 2.0

"""Module containing functions to estimate sampling error for binary classification metrics.

The implementation of the sampling error estimation is split into two functions.

The first function is called during fitting and will calculate the sampling error components based the reference data.
Most of the time these will be the standard deviation of the distribution of differences between
``y_true`` and ``y_pred`` and the fraction of positive labels in ``y_true``.

The second function will be called during calculation or estimation. It takes the predetermined error components and
combines them with the size of the (analysis) data to give an estimate for the sampling error.
"""

from typing import Tuple

import numpy as np
import pandas as pd


def _universal_sampling_error(reference_std, reference_fraction, data):
    return reference_std / np.sqrt(len(data) * reference_fraction)


[docs]def auroc_sampling_error_components(y_true_reference: pd.Series, y_pred_proba_reference: pd.Series) -> Tuple:
    """
    Estimation of AUROC sampling error. Calculation is based on the Variance Sum Law and expressing AUROC as
    a Mann-Whitney U statistic.

    Parameters
    ----------
    y_true_reference: pd.Series
        Target values for the reference dataset.
    y_pred_proba_reference: pd.Series
        Prediction values for the reference dataset.

    Returns
    -------
    (std, fraction): Tuple[np.ndarray, float]
    """

    y_true = y_true_reference.copy().reset_index(drop=True)
    y_pred_proba = y_pred_proba_reference.copy().reset_index(drop=True)

    if np.mean(y_true) > 0.5:
        y_true = abs(np.asarray(y_true) - 1)
        y_pred_proba = 1 - y_pred_proba

    sorted_idx = np.argsort(y_pred_proba)
    y_pred_proba = y_pred_proba[sorted_idx]
    y_true = y_true[sorted_idx]
    rank_order = np.asarray(range(len(y_pred_proba)))
    positive_ranks = y_true * rank_order
    indexes = np.unique(positive_ranks)[1:]
    ser = []
    for i, index in enumerate(indexes):
        ser.append(index - i)

    n_pos = np.sum(y_true)
    n_neg = len(y_true) - n_pos
    ser_multi = ser / n_neg
    fraction = n_pos / len(y_true)

    return np.std(ser_multi), fraction


[docs]def auroc_sampling_error(sampling_error_components, data):
    """
    Calculate the AUROC sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    reference_std, reference_fraction = sampling_error_components
    return _universal_sampling_error(reference_std, reference_fraction, data)


[docs]def f1_sampling_error_components(y_true_reference: pd.Series, y_pred_reference: pd.Series) -> Tuple:
    """
    Estimate sampling error of F1 using modified standard error of mean formula.

    Parameters
    ----------
    y_true_reference: pd.Series
        Target values for the reference dataset.
    y_pred_reference: pd.Series
        Predictions for the reference dataset.

    Returns
    -------
    (std, fraction): Tuple[np.ndarray, float]
    """
    TP = np.where((y_true_reference == y_pred_reference) & (y_pred_reference == 1), 1, np.nan)

    FP = np.where((y_true_reference != y_pred_reference) & (y_pred_reference == 1), 0, np.nan)
    FN = np.where((y_true_reference != y_pred_reference) & (y_pred_reference == 0), 0, np.nan)

    TP = TP[~np.isnan(TP)]

    FN = FN[~np.isnan(FN)]
    FP = FP[~np.isnan(FP)]

    tp_fp_fn = np.concatenate([TP, FN, FP])

    correcting_factor = len(tp_fp_fn) / ((len(FN) + len(FP)) * 0.5 + len(TP))
    obs_level_f1 = tp_fp_fn * correcting_factor
    fraction_of_relevant = len(tp_fp_fn) / len(y_pred_reference)

    return np.std(obs_level_f1), fraction_of_relevant


[docs]def f1_sampling_error(sampling_error_components, data):
    """
    Calculate the F1 sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    reference_std, reference_fraction = sampling_error_components
    return _universal_sampling_error(reference_std, reference_fraction, data)


[docs]def precision_sampling_error_components(y_true_reference: pd.Series, y_pred_reference: pd.Series) -> Tuple:
    """
    Estimate sampling error for precision using modified standard error of mean formula.

    Parameters
    ----------
    y_true_reference: pd.Series
        Target values for the reference dataset.
    y_pred_reference: pd.Series
        Predictions for the reference dataset.

    Returns
    -------
    (std, fraction): Tuple[np.ndarray, float]
    """
    TP = np.where((y_true_reference == y_pred_reference) & (y_pred_reference == 1), 1, np.nan)
    FP = np.where((y_true_reference != y_pred_reference) & (y_pred_reference == 1), 0, np.nan)

    TP = TP[~np.isnan(TP)]
    FP = FP[~np.isnan(FP)]
    obs_level_precision = np.concatenate([TP, FP])
    amount_positive_pred = np.sum(y_pred_reference)
    fraction_of_pos_pred = amount_positive_pred / len(y_pred_reference)

    return np.std(obs_level_precision), fraction_of_pos_pred


[docs]def precision_sampling_error(sampling_error_components, data):
    """
    Calculate the precision sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    reference_std, reference_fraction = sampling_error_components
    return _universal_sampling_error(reference_std, reference_fraction, data)


[docs]def recall_sampling_error_components(y_true_reference: pd.Series, y_pred_reference: pd.Series) -> Tuple:
    """
    Estimate sampling error for recall using modified standard error of mean formula.

    Parameters
    ----------
    y_true_reference: pd.Series
        Target values for the reference dataset.
    y_pred_reference: pd.Series
        Predictions for the reference dataset.

    Returns
    -------
    (std, fraction): Tuple[np.ndarray, float]
    """
    TP = np.where((y_true_reference == y_pred_reference) & (y_pred_reference == 1), 1, np.nan)
    FN = np.where((y_true_reference != y_pred_reference) & (y_pred_reference == 0), 0, np.nan)

    TP = TP[~np.isnan(TP)]
    FN = FN[~np.isnan(FN)]
    obs_level_recall = np.concatenate([TP, FN])
    fraction_of_relevant = sum(y_true_reference) / len(y_pred_reference)

    return np.std(obs_level_recall), fraction_of_relevant


[docs]def recall_sampling_error(sampling_error_components, data):
    """
    Calculate the recall sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    reference_std, reference_fraction = sampling_error_components
    return _universal_sampling_error(reference_std, reference_fraction, data)


[docs]def specificity_sampling_error_components(y_true_reference: pd.Series, y_pred_reference: pd.Series) -> Tuple:
    """
    Estimate sampling error for specificity using modified standard error of mean formula.

    Parameters
    ----------
    y_true_reference: pd.Series
        Target values for the reference dataset.
    y_pred_reference: pd.Series
        Predictions for the reference dataset.

    Returns
    -------
    (std, fraction): Tuple[np.ndarray, float]
    """
    TN = np.where((y_true_reference == y_pred_reference) & (y_pred_reference == 0), 1, np.nan)
    FP = np.where((y_true_reference != y_pred_reference) & (y_pred_reference == 1), 0, np.nan)

    TN = TN[~np.isnan(TN)]
    FP = FP[~np.isnan(FP)]
    obs_level_specificity = np.concatenate([TN, FP])
    fraction_of_relevant = len(obs_level_specificity) / len(y_pred_reference)

    return np.std(obs_level_specificity), fraction_of_relevant


[docs]def specificity_sampling_error(sampling_error_components, data):
    """
    Calculate the specificity sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    reference_std, reference_fraction = sampling_error_components
    return _universal_sampling_error(reference_std, reference_fraction, data)


[docs]def accuracy_sampling_error_components(y_true_reference: pd.Series, y_pred_reference: pd.Series) -> Tuple:
    """
    Estimate sampling error for accuracy.

    Parameters
    ----------
    y_true_reference: pd.Series
        Target values for the reference dataset.
    y_pred_reference: pd.Series
        Predictions for the reference dataset.

    Returns
    -------
    (std,): Tuple[np.ndarray]
    """
    y_true_reference = np.asarray(y_true_reference).astype(int)
    y_pred_reference = np.asarray(y_pred_reference).astype(int)

    correct_table = (y_true_reference == y_pred_reference).astype(int)

    return (np.std(correct_table),)


[docs]def accuracy_sampling_error(sampling_error_components: Tuple, data) -> float:
    """
    Calculate the accuracy sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float

    """
    (reference_std,) = sampling_error_components
    return reference_std / np.sqrt(len(data))