# Author: Niels Nuyttens <niels@nannyml.com>
# Jakub Bialek <jakub@nannyml.com>
#
# License: Apache Software License 2.0
from typing import List, Tuple
import numpy as np
import pandas as pd
def _standard_deviation_of_variances(components: List[Tuple], data) -> float:
class_variances = [c[0] / (len(data) * c[1]) for c in components]
multiclass_std = np.sqrt(np.sum(class_variances)) / len(class_variances)
return multiclass_std
def auroc_sampling_error_components(y_true_reference: List[pd.Series], y_pred_proba_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_proba_reference`` lists represent the binarized target values and model
    probabilities. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_proba_reference: List[pd.Series]
        Prediction probability values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, positive_fraction)`` tuple per class.
    """

    def _get_class_components(y_true, y_pred_proba):
        # Treat the minority class as the "positive" class: if positives
        # dominate, flip labels and probabilities (AUROC is symmetric under
        # this flip).
        if np.mean(y_true) > 0.5:
            y_true = abs(np.asarray(y_true) - 1)
            y_pred_proba = 1 - y_pred_proba

        # Rank observations by predicted probability (ascending).
        sorted_idx = np.argsort(y_pred_proba)
        y_pred_proba = y_pred_proba[sorted_idx]
        y_true = y_true[sorted_idx]

        rank_order = np.asarray(range(len(y_pred_proba)))
        positive_ranks = y_true * rank_order
        # NOTE(review): dropping the first unique value assumes rank 0 belongs
        # to a negative observation; a positive ranked first is silently
        # discarded — confirm this is intended.
        indexes = np.unique(positive_ranks)[1:]

        # For the i-th positive (by rank), ``index - i`` is the number of
        # negatives ranked below it.
        ser = np.asarray([index - i for i, index in enumerate(indexes)])

        n_pos = np.sum(y_true)
        n_neg = len(y_true) - n_pos
        ser_multi = ser / n_neg
        fraction = n_pos / len(y_true)

        return np.var(ser_multi), fraction

    return [
        _get_class_components(y_true_class, y_pred_proba_class)
        for y_true_class, y_pred_proba_class in zip(y_true_reference, y_pred_proba_reference)
    ]
def auroc_sampling_error(sampling_error_components, data) -> float:
    """
    Calculate the AUROC sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float
    """
    # Scale each class variance by the chunk size and class fraction, then
    # combine. The stray ``* 1.2`` multiplier was removed: it had no
    # counterpart in ``_standard_deviation_of_variances`` and made AUROC
    # inconsistent with every other metric in this module, which all use the
    # same unscaled formula.
    class_variances = [c[0] / (len(data) * c[1]) for c in sampling_error_components]
    return np.sqrt(np.sum(class_variances)) / len(class_variances)
def f1_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists represent the binarized target values and model
    predictions. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, relevant_fraction)`` tuple per class.
    """

    def _get_class_components(y_true, y_pred):
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        # Observation-level F1 contributions: true positives score 1, false
        # positives and false negatives score 0; true negatives do not enter
        # the F1 statistic and are excluded.
        tp = np.sum((y_true == y_pred) & (y_pred == 1))
        fp = np.sum((y_true != y_pred) & (y_pred == 1))
        fn = np.sum((y_true != y_pred) & (y_pred == 0))
        obs_level_f1 = np.concatenate([np.ones(tp), np.zeros(fp + fn)])
        fraction_of_relevant = len(obs_level_f1) / len(y_pred)
        return np.var(obs_level_f1), fraction_of_relevant

    return [
        _get_class_components(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]
def f1_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """
    Calculate the F1 sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float
    """
    # Inlined equivalent of _standard_deviation_of_variances.
    chunk_size = len(data)
    scaled = [variance / (chunk_size * fraction) for variance, fraction in sampling_error_components]
    return np.sqrt(np.sum(scaled)) / len(scaled)
def precision_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists represent the binarized target values and model
    predictions. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, positive_prediction_fraction)`` tuple per class.
    """

    def _get_class_components(y_true, y_pred):
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        # Observation-level precision contributions over the positive
        # predictions only: true positives score 1, false positives score 0.
        tp = np.sum((y_true == y_pred) & (y_pred == 1))
        fp = np.sum((y_true != y_pred) & (y_pred == 1))
        obs_level_precision = np.concatenate([np.ones(tp), np.zeros(fp)])
        fraction_of_pos_pred = np.sum(y_pred) / len(y_pred)
        return np.var(obs_level_precision), fraction_of_pos_pred

    return [
        _get_class_components(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]
def precision_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """
    Calculate the precision sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float
    """
    # Same combination rule as _standard_deviation_of_variances, inlined.
    n = len(data)
    per_class = [var_ / (n * frac) for var_, frac in sampling_error_components]
    return np.sqrt(np.sum(per_class)) / len(per_class)
def recall_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists represent the binarized target values and model
    predictions. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, actual_positive_fraction)`` tuple per class.
    """

    def _get_class_components(y_true, y_pred):
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        # Observation-level recall contributions over the actual positives:
        # true positives score 1, false negatives score 0.
        tp = np.sum((y_true == y_pred) & (y_pred == 1))
        fn = np.sum((y_true != y_pred) & (y_pred == 0))
        obs_level_recall = np.concatenate([np.ones(tp), np.zeros(fn)])
        fraction_of_relevant = np.sum(y_true) / len(y_pred)
        return np.var(obs_level_recall), fraction_of_relevant

    return [
        _get_class_components(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]
def recall_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """
    Calculate the recall sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float
    """
    # Inlined form of _standard_deviation_of_variances.
    chunk_len = len(data)
    variances = []
    for class_variance, class_fraction in sampling_error_components:
        variances.append(class_variance / (chunk_len * class_fraction))
    return np.sqrt(np.sum(variances)) / len(variances)
def specificity_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists represent the binarized target values and model
    predictions. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: List[Tuple]
        One ``(variance, relevant_fraction)`` tuple per class.
    """

    def _get_class_components(y_true, y_pred):
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        # Observation-level specificity contributions: true negatives score 1,
        # false positives score 0.
        tn = np.sum((y_true == y_pred) & (y_pred == 0))
        fp = np.sum((y_true != y_pred) & (y_pred == 1))
        obs_level_specificity = np.concatenate([np.ones(tn), np.zeros(fp)])
        fraction_of_relevant = len(obs_level_specificity) / len(y_pred)
        return np.var(obs_level_specificity), fraction_of_relevant

    return [
        _get_class_components(y_true_class, y_pred_class)
        for y_true_class, y_pred_class in zip(y_true_reference, y_pred_reference)
    ]
def specificity_sampling_error(sampling_error_components: List[Tuple], data) -> float:
    """
    Calculate the specificity sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float
    """
    # Inlined version of _standard_deviation_of_variances.
    size = len(data)
    scaled_class_variances = [v / (size * f) for (v, f) in sampling_error_components]
    total = np.sum(scaled_class_variances)
    return np.sqrt(total) / len(scaled_class_variances)
def accuracy_sampling_error_components(y_true_reference: List[pd.Series], y_pred_reference: List[pd.Series]):
    """
    Calculate the sampling error components on reference data.

    The ``y_true_reference`` and ``y_pred_reference`` lists represent the binarized target values and model
    predictions. The order of the Series in both lists should both match the list of class labels present.

    Parameters
    ----------
    y_true_reference: List[pd.Series]
        Target values for the reference dataset.
    y_pred_reference: List[pd.Series]
        Prediction values for the reference dataset.

    Returns
    -------
    sampling_error_components: Tuple
    """
    # Stacking the per-class Series gives shape (n_classes, n_observations):
    # rows are classes, columns are observations.
    y_true = np.asarray(y_true_reference).astype(int)
    y_pred = np.asarray(y_pred_reference).astype(int)
    # An observation is predicted correctly only when every binarized class
    # indicator matches, so reduce over the class axis (axis 0). The previous
    # ``axis=1`` collapsed the observation axis instead, producing one value
    # per class rather than one per observation.
    correct_table = (y_true == y_pred).all(axis=0).astype(int)
    return (np.std(correct_table),)
def accuracy_sampling_error(sampling_error_components: Tuple, data) -> float:
    """
    Calculate the accuracy sampling error for a chunk of data.

    Parameters
    ----------
    sampling_error_components : a set of parameters that were derived from reference data.
    data : the (analysis) data you want to calculate or estimate a metric for.

    Returns
    -------
    sampling_error: float
    """
    # Standard error of the mean: reference standard deviation shrunk by the
    # square root of the chunk size.
    reference_std = sampling_error_components[0]
    chunk_size = len(data)
    return reference_std / np.sqrt(chunk_size)