# Source code for nannyml.sampling_error.multiclass_classification

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#            Jakub Bialek    <jabub@nannyml.com>
#
#  License: Apache Software License 2.0

"""Module containing functions to estimate sampling error for multiclass classification metrics."""

from typing import List, Tuple, Union, Optional

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, average_precision_score


# Number of resampling rounds performed when a sampling error is approximated empirically
# (used by the average precision components, which resample the reference data this many times).
N_EXPERIMENTS = 50
# Upper bound on the size of a single resample: when the reference set is very large
# there is no need to draw a resample anywhere near its full size.
MAX_RESAMPLE_SIZE = 50_000


def _standard_deviation_of_variances(components: List[Tuple], data) -> float:
    """Combine per-class sampling error components into one multiclass standard deviation.

    Parameters
    ----------
    components: List[Tuple]
        One entry per class. Element ``0`` is a variance and element ``1`` is the fraction
        that scales the chunk size for that class.
    data:
        The chunk the sampling error is computed for. Only its length is used.

    Returns
    -------
    multiclass_std: float
        Square root of the summed per-class variances, divided by the number of classes.
    """
    chunk_size = len(data)
    per_class = []
    for entry in components:
        # variance of the class metric on a chunk of this size
        per_class.append(entry[0] / (chunk_size * entry[1]))
    return np.sqrt(np.sum(per_class)) / len(per_class)


def auroc_sampling_error_components(y_true_reference: List[pd.Series], y_pred_proba_reference: List[pd.Series]):
    """Calculate sampling error components for AUROC using reference data.

    The ``y_true_reference`` and ``y_pred_proba_reference`` lists represent the binarized target values and
    model probabilities. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Binarized target values for the reference dataset, one entry per class.
    y_pred_proba_reference: List[pd.Series]
        Prediction probability values for the reference dataset, one entry per class.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, fraction)`` tuple per class, in input order. ``variance`` is the variance of the
        per-positive share of negatives that are ranked below it, ``fraction`` is the share of positives.
    """

    def _get_class_components(y_true, y_pred_proba):
        # Work on plain arrays so every indexing operation below is positional,
        # whatever index the incoming Series carry.
        y_true = np.asarray(y_true)
        y_pred_proba = np.asarray(y_pred_proba)

        # When positives are the majority, swap the roles of the two classes
        # (and mirror the scores accordingly) so that "positive" is the minority.
        if np.mean(y_true) > 0.5:
            y_true = np.abs(y_true - 1)
            y_pred_proba = 1 - y_pred_proba

        # Order the labels by ascending score; a label's position is then its rank.
        y_true = y_true[np.argsort(y_pred_proba)]

        # Ranks of the positives. Taking them from the non-zero positions keeps a
        # positive that sits at rank 0; deriving them from ``unique(label * rank)[1:]``
        # would confuse it with the zeros produced by the negatives and drop it.
        positive_ranks = np.flatnonzero(y_true)
        # The k-th positive (0-based) has exactly k positives below it, so the
        # remainder of its rank is the number of negatives ranked below it.
        negatives_below = positive_ranks - np.arange(len(positive_ranks))

        n_pos = np.sum(y_true)
        n_neg = len(y_true) - n_pos
        fraction = n_pos / len(y_true)
        return np.var(negatives_below / n_neg), fraction

    return [
        _get_class_components(y_true_class, y_pred_proba_class)
        for y_true_class, y_pred_proba_class in zip(y_true_reference, y_pred_proba_reference)
    ]
def auroc_sampling_error(sampling_error_components, data) -> float:
    """Calculate the AUROC sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components:
        Per-class ``(variance, fraction)`` entries that were derived from reference data.
    data:
        The (analysis) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float
    """
    chunk_size = len(data)
    per_class = []
    for entry in sampling_error_components:
        per_class.append(entry[0] / (chunk_size * entry[1]))

    # Experiments showed that the std of class variances underestimated the sampling error
    # by 20%, so the result is manually scaled up.
    correction = 1.2
    return np.sqrt(np.sum(per_class)) / len(per_class) * correction
def f1_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """Calculate sampling error components for F1 using reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists hold the binarized target values and the
    binarized predictions. The order of the entries in both lists should match the list of class labels.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, fraction)`` tuple per class: the variance of the observation-level outcomes
        (1 for a true positive, 0 for a false negative or false positive) and the share of observations
        that fall in one of those three groups.
    """

    def _variance_and_fraction(y_true, y_pred):
        agree = y_true == y_pred
        differ = y_true != y_pred
        predicted_positive = y_pred == 1
        predicted_negative = y_pred == 0

        n_tp = np.count_nonzero(agree & predicted_positive)
        n_fn = np.count_nonzero(differ & predicted_negative)
        n_fp = np.count_nonzero(differ & predicted_positive)

        # True negatives play no part in F1, so they are left out entirely.
        outcomes = np.concatenate([np.ones(n_tp), np.zeros(n_fn), np.zeros(n_fp)])
        return np.var(outcomes), len(outcomes) / len(y_pred)

    return [
        _variance_and_fraction(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]
def f1_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """Calculate the F1 sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components: List[Tuple]
        Per-class components that were derived from reference data.
    data:
        The (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float
    """
    sampling_error = _standard_deviation_of_variances(components=sampling_error_components, data=data)
    return sampling_error
def precision_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """Calculate sampling error components for precision using reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists hold the binarized target values and the
    binarized predictions. The order of the entries in both lists should match the list of class labels.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, fraction)`` tuple per class: the variance of the observation-level outcomes
        (1 for a true positive, 0 for a false positive) and the sum of the predictions divided by
        the number of predictions.
    """

    def _variance_and_fraction(y_true, y_pred):
        predicted_positive = y_pred == 1
        n_tp = np.count_nonzero((y_true == y_pred) & predicted_positive)
        n_fp = np.count_nonzero((y_true != y_pred) & predicted_positive)

        # Precision only looks at positive predictions: hits are 1, misses are 0.
        outcomes = np.concatenate([np.ones(n_tp), np.zeros(n_fp)])
        positive_share = np.sum(y_pred) / len(y_pred)
        return np.var(outcomes), positive_share

    return [
        _variance_and_fraction(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]
def precision_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """Calculate the precision sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components: List[Tuple]
        Per-class components that were derived from reference data.
    data:
        The (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float
    """
    sampling_error = _standard_deviation_of_variances(components=sampling_error_components, data=data)
    return sampling_error
def recall_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """Calculate sampling error components for recall using reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists hold the binarized target values and the
    binarized predictions. The order of the entries in both lists should match the list of class labels.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, fraction)`` tuple per class: the variance of the observation-level outcomes
        (1 for a true positive, 0 for a false negative) and the sum of the targets divided by the
        number of predictions.
    """

    def _variance_and_fraction(y_true, y_pred):
        n_tp = np.count_nonzero((y_true == y_pred) & (y_pred == 1))
        n_fn = np.count_nonzero((y_true != y_pred) & (y_pred == 0))

        # Recall only looks at actual positives: found ones are 1, missed ones are 0.
        outcomes = np.concatenate([np.ones(n_tp), np.zeros(n_fn)])
        relevant_share = sum(y_true) / len(y_pred)
        return np.var(outcomes), relevant_share

    return [
        _variance_and_fraction(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]
def recall_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """Calculate the recall sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components: List[Tuple]
        Per-class components that were derived from reference data.
    data:
        The (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float
    """
    sampling_error = _standard_deviation_of_variances(components=sampling_error_components, data=data)
    return sampling_error
def specificity_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """Calculate sampling error components for specificity using reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists hold the binarized target values and the
    binarized predictions. The order of the entries in both lists should match the list of class labels.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, fraction)`` tuple per class: the variance of the observation-level outcomes
        (1 for a true negative, 0 for a false positive) and the share of observations that fall in
        one of those two groups.
    """

    def _variance_and_fraction(y_true, y_pred):
        n_tn = np.count_nonzero((y_true == y_pred) & (y_pred == 0))
        n_fp = np.count_nonzero((y_true != y_pred) & (y_pred == 1))

        # Specificity only looks at actual negatives: correct rejections are 1, false alarms are 0.
        outcomes = np.concatenate([np.ones(n_tn), np.zeros(n_fp)])
        return np.var(outcomes), len(outcomes) / len(y_pred)

    return [
        _variance_and_fraction(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]
def specificity_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """Calculate the specificity sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components: List[Tuple]
        Per-class components that were derived from reference data.
    data:
        The (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float
    """
    sampling_error = _standard_deviation_of_variances(components=sampling_error_components, data=data)
    return sampling_error
def accuracy_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """Calculate sampling error components for accuracy using reference data.

    Both inputs are converted to 2-D integer arrays and compared element-wise. A row counts as
    correct only when every entry in that row matches, so each row is treated as one observation.

    NOTE(review): the annotations suggest a list with one Series per class, but the ``axis=1``
    reduction below needs one row per observation with the binarized classes as columns.
    Confirm the layout against the caller.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Binarized target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Binarized prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: Tuple
        A one-element tuple holding the standard deviation of the per-row correctness indicator.
    """
    targets = np.asarray(y_true_reference).astype(int)
    predictions = np.asarray(y_pred_reference).astype(int)

    # 1 where the whole row agrees, 0 otherwise
    row_is_correct = np.all(targets == predictions, axis=1).astype(int)
    spread = np.std(row_is_correct)
    return (spread,)
def accuracy_sampling_error(sampling_error_components: Tuple, data) -> float:
    """Calculate the accuracy sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components: Tuple
        Components that were derived from reference data; element ``0`` is the reference
        standard deviation.
    data:
        The (analysis) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float
        The reference standard deviation divided by the square root of the chunk size.
    """
    reference_std = sampling_error_components[0]
    chunk_size = len(data)
    return reference_std / np.sqrt(chunk_size)
def multiclass_confusion_matrix_sampling_error_components(
    y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series], normalize_confusion_matrix: Union[str, None]
):
    """Calculate sampling error components for the confusion matrix using reference data.

    Parameters
    ----------
    y_true_reference:
        Target values for the reference dataset, passed as-is to ``confusion_matrix``.
    y_pred_reference:
        Predictions for the reference dataset, passed as-is to ``confusion_matrix``.
    normalize_confusion_matrix: Union[str, None]
        ``'true'``, ``'pred'``, ``'all'`` or ``None``. Selects which population each cell is
        measured against: its row total, its column total, or (for ``'all'`` and any other value)
        all observations.

    Returns
    -------
    sampling_error_components: Tuple
        ``(stds, relevant_proportions)``. ``stds`` has the shape of the confusion matrix and holds,
        per cell, the standard deviation of a 0/1 indicator over the selected population.
        ``relevant_proportions`` is the row totals (``'true'``) or column totals (``'pred'``) divided
        by the number of observations, ``1`` for ``'all'``, and ``None`` otherwise.
    """
    cm = confusion_matrix(y_true_reference, y_pred_reference)
    row_totals = cm.sum(axis=1)[:, None]
    column_totals = cm.sum(axis=0)[None, :]
    n_observations = len(y_true_reference)

    by_true = normalize_confusion_matrix == 'true'
    by_pred = normalize_confusion_matrix == 'pred'

    if by_true:
        relevant_proportions = row_totals / n_observations
    elif by_pred:
        relevant_proportions = column_totals / n_observations
    elif normalize_confusion_matrix == 'all':
        relevant_proportions = 1
    else:
        relevant_proportions = None

    stds = np.zeros(cm.shape)
    for i, j in np.ndindex(*cm.shape):
        # Size of the population the cell is a share of.
        if by_true:
            population = row_totals[i, 0]
        elif by_pred:
            population = column_totals[0, j]
        else:
            population = n_observations

        # Indicator vector with one 1 per observation counted in this cell.
        indicator = np.zeros(population, dtype=int)
        indicator[: cm[i, j]] = 1
        stds[i, j] = np.std(indicator)

    return stds, relevant_proportions
def multiclass_confusion_matrix_sampling_error(sampling_error_components: Tuple, data):
    """Calculate the confusion matrix sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components: Tuple
        ``(reference_stds, relevant_proportions)`` as derived from reference data.
    data:
        The chunk you want to calculate or estimate the confusion matrix for. Only its length is used.

    Returns
    -------
    standard_errors:
        Per-cell standard errors. Without proportions (``None``) the per-observation error is
        multiplied by the chunk size; otherwise the chunk size is scaled by the proportions
        before the square root is taken.
    """
    reference_stds, relevant_proportions = sampling_error_components
    chunk_size = len(data)

    if relevant_proportions is not None:
        return reference_stds / np.sqrt(chunk_size * relevant_proportions)

    return (reference_stds / np.sqrt(chunk_size)) * chunk_size
def average_precision_sampling_error_components(
    y_true_reference: List[np.ndarray], y_pred_proba_reference: List[pd.Series]
):
    """Calculate sampling error components for average precision (AP) using reference data.

    The ``y_true_reference`` and ``y_pred_proba_reference`` lists represent the binarized target values and
    model probabilities. The order of the entries in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[np.ndarray]
        Target values for the reference dataset.
    y_pred_proba_reference: List[pd.Series]
        Prediction probability values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, sample_size)`` tuple per class: the variance of AP over ``N_EXPERIMENTS``
        resamples drawn with replacement, and the size of each resample.
    """

    def _get_class_components(y_true_reference: np.ndarray, y_pred_proba_reference: pd.Series):
        n_reference = y_true_reference.shape[0]
        # Half the reference size, capped at MAX_RESAMPLE_SIZE.
        sample_size = np.minimum(n_reference // 2, MAX_RESAMPLE_SIZE)
        probabilities = y_pred_proba_reference.to_numpy()

        def _resampled_ap():
            picks = np.random.choice(n_reference, sample_size, replace=True)
            return average_precision_score(y_true_reference[picks], probabilities[picks])

        ap_results = [_resampled_ap() for _ in range(N_EXPERIMENTS)]
        return np.var(ap_results), sample_size

    return [
        _get_class_components(y_true_class, y_pred_proba_class)
        for y_true_class, y_pred_proba_class in zip(y_true_reference, y_pred_proba_reference)
    ]
def average_precision_sampling_error(sampling_error_components, data) -> float:
    """Calculate the average precision (AP) sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components:
        Per-class ``(variance, sample_size)`` entries that were derived from reference data.
    data:
        The (chunk) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float
        Square root of the mean of the per-class variances, each rescaled from the resample
        size to the chunk size.
    """
    chunk_size = len(data)
    rescaled = []
    for entry in sampling_error_components:
        # variance observed at the resample size, rescaled to the chunk size
        rescaled.append(entry[0] * entry[1] / chunk_size)
    return np.sqrt(np.mean(rescaled))
def _calculate_business_value_per_row(
    row,
    business_value_matrix: np.ndarray,
    classes: List[str],
):
    """Calculate the business value of a single dataframe row.

    Intended to be used within a pandas ``apply`` call.

    Parameters
    ----------
    row:
        A row exposing ``y_true`` and ``y_pred`` attributes.
    business_value_matrix: np.ndarray
        Matrix of values that is multiplied element-wise with the row's confusion matrix.
    classes: List[str]
        Labels passed to ``confusion_matrix`` to fix the order of its rows and columns.

    Returns
    -------
    bv:
        Sum of the element-wise product of the single-observation confusion matrix and
        ``business_value_matrix``.
    """
    observed = np.array([row.y_true])
    predicted = np.array([row.y_pred])
    # Confusion matrix of this one observation.
    single_cm = confusion_matrix(y_true=observed, y_pred=predicted, labels=classes)
    return (single_cm * business_value_matrix).sum()
def business_value_sampling_error_components(
    y_true_reference: pd.Series,
    y_pred_reference: pd.Series,
    business_value_matrix: np.ndarray,
    classes: List[str],
    normalize_business_value: Optional[str],
) -> Tuple[float, Union[str, None]]:
    """Calculate sampling error components for the business value metric using reference data.

    Parameters
    ----------
    y_true_reference: pd.Series
        Target values for the reference dataset.
    y_pred_reference: pd.Series
        Predictions for the reference dataset.
    business_value_matrix: np.ndarray
        A nxn matrix of values for the business problem.
    classes: List[str]
        An alphanumerically sorted list of the unique classes in the multiclass problem
    normalize_business_value: Optional[str], default=None
        Determines how the business value will be normalized. Allowed values are None and
        'per_prediction'.

    Returns
    -------
    components: tuple
        The standard deviation of the per-row business values, and ``normalize_business_value``
        passed through unchanged.
    """
    frame = pd.DataFrame(
        {
            'y_true': y_true_reference,
            'y_pred': y_pred_reference,
        }
    )
    # One business value per reference row.
    per_row_values = frame.apply(
        _calculate_business_value_per_row,
        axis=1,
        args=(business_value_matrix, classes),
    )
    return (per_row_values.std(), normalize_business_value)
def business_value_sampling_error(sampling_error_components: Tuple, data) -> float:
    """Calculate the business value sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components: Tuple
        ``(reference_std, norm_type)`` as derived from reference data.
    data:
        The (chunk) data you want to calculate or estimate a metric for. Only its length is used.

    Returns
    -------
    sampling_error: float
        With ``norm_type`` ``None`` the reference standard deviation is multiplied by the chunk
        size before dividing by the square root of the chunk size; otherwise it is divided directly.
    """
    reference_std, norm_type = sampling_error_components
    chunk_size = len(data)

    # Any norm_type other than None is expected to be 'per_prediction'.
    analysis_std = reference_std * chunk_size if norm_type is None else reference_std
    return analysis_std / np.sqrt(chunk_size)