# Author: Niels Nuyttens <niels@nannyml.com>
#
# License: Apache Software License 2.0
"""Module containing univariate statistical drift calculation results and associated plotting implementations."""
from typing import List, Optional, Tuple
import pandas as pd
import plotly.graph_objects as go
from nannyml._typing import ProblemType
from nannyml.base import AbstractCalculator, AbstractCalculatorResult, _column_is_categorical, _column_is_continuous
from nannyml.chunk import Chunk
from nannyml.exceptions import InvalidArgumentsException
from nannyml.plots import CHUNK_KEY_COLUMN_NAME
from nannyml.plots._joy_plot import _joy_plot
from nannyml.plots._stacked_bar_plot import _stacked_bar_plot
from nannyml.plots._step_plot import _step_plot
"""Contains the results of the model output statistical drift calculation and provides plotting functionality."""
class UnivariateDriftResult(AbstractCalculatorResult):
"""Contains the results of the model output statistical drift calculation and provides plotting functionality."""
def __init__(self, results_data: pd.DataFrame, calculator: AbstractCalculator):
super().__init__(results_data)
from .calculator import StatisticalOutputDriftCalculator
if not isinstance(calculator, StatisticalOutputDriftCalculator):
raise RuntimeError(
f"{calculator.__class__.__name__} is not an instance of type " f"UnivariateStatisticalDriftCalculator"
)
self.calculator = calculator
@property
def calculator_name(self) -> str:
return 'univariate_statistical_output_drift'
def plot(
self,
kind: str = 'prediction_drift',
metric: str = 'statistic',
class_label: Optional[str] = None,
plot_reference: bool = False,
*args,
**kwargs,
) -> Optional[go.Figure]:
"""Renders plots for metrics returned by the univariate statistical drift calculator.
For both the model predictions and the model output scores you can render the statistic value or p-value
as a step plot, or create a distribution plot. For multiclass use cases a ``class_label`` parameter is
required when rendering model output (score) plots.
Select a plot using the ``kind`` parameter:
- ``prediction_drift``
plots the drift metric per :class:`~nannyml.chunk.Chunk` for the model predictions ``y_pred``.
- ``prediction_distribution``
plots the distribution per :class:`~nannyml.chunk.Chunk` for the model predictions ``y_pred``.
- ``score_drift``
plots the drift metric per :class:`~nannyml.chunk.Chunk` for the model outputs ``y_pred_proba``.
- ``score_distribution``
plots the distribution per :class:`~nannyml.chunk.Chunk` for the model outputs ``y_pred_proba``.
Parameters
----------
kind: str, default=``prediction_drift``
The kind of plot to render. Allowed values are ``prediction_drift``,
``prediction_distribution``, ``score_drift`` and ``score_distribution``.
metric: str, default=``statistic``
The metric to plot. Allowed values are ``statistic`` and ``p_value``.
Not applicable when plotting distributions.
class_label: str, default=None
The label of the class to plot the prediction distribution for.
Only required for multiclass use cases.
plot_reference: bool, default=False
Whether to include the reference period in the plot.
Returns
-------
fig: :class:`plotly.graph_objs._figure.Figure`
A :class:`~plotly.graph_objs._figure.Figure` object containing the requested drift plot.
Can be saved to disk using the :meth:`~plotly.graph_objs._figure.Figure.write_image` method
or shown rendered on screen using the :meth:`~plotly.graph_objs._figure.Figure.show` method.
Examples
--------
>>> import nannyml as nml
>>>
>>> reference_df, analysis_df, _ = nml.load_synthetic_binary_classification_dataset()
>>>
>>> calc = nml.StatisticalOutputDriftCalculator(
>>> y_pred_proba='y_pred_proba',
>>> y_pred='y_pred',
>>> timestamp_column_name='timestamp'
>>> )
>>> calc.fit(reference_df)
>>> results = calc.calculate(analysis_df)
>>>
>>> print(results.data) # check the numbers
key start_index ... y_pred_proba_alert y_pred_proba_threshold
0 [0:4999] 0 ... True 0.05
1 [5000:9999] 5000 ... False 0.05
2 [10000:14999] 10000 ... False 0.05
3 [15000:19999] 15000 ... False 0.05
4 [20000:24999] 20000 ... False 0.05
5 [25000:29999] 25000 ... True 0.05
6 [30000:34999] 30000 ... True 0.05
7 [35000:39999] 35000 ... True 0.05
8 [40000:44999] 40000 ... True 0.05
9 [45000:49999] 45000 ... True 0.05
>>>
>>> results.plot(kind='score_drift', plot_reference=True).show()
>>> results.plot(kind='score_distribution', plot_reference=True).show()
>>> results.plot(kind='prediction_drift', plot_reference=True).show()
>>> results.plot(kind='prediction_distribution', plot_reference=True).show()
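For multiclass use cases, pass the class label of the score column to plot (the label shown here is
hypothetical; use one of the keys configured in ``y_pred_proba``):
>>> results.plot(kind='score_drift', class_label='class_1', plot_reference=True).show()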
"""
if kind == 'prediction_drift':
return _plot_prediction_drift(
self.data,
self.calculator,
self.calculator.y_pred,
plot_reference,
metric,
)
elif kind == 'prediction_distribution':
return _plot_prediction_distribution(
data=self.calculator.previous_analysis_data,
drift_data=self.data,
calculator=self.calculator,
plot_reference=plot_reference,
)
elif kind == 'score_drift':
return _plot_score_drift(self.data, self.calculator, plot_reference, metric, class_label)
elif kind == 'score_distribution':
return _plot_score_distribution(
data=self.calculator.previous_analysis_data,
drift_data=self.data,
calculator=self.calculator,
plot_reference=plot_reference,
class_label=class_label,
)
else:
raise InvalidArgumentsException(
f"unknown plot kind '{kind}'. "
"Please provide on of: ['prediction_drift', 'prediction_distribution', 'score_drift',"
"'score_distribution']."
)
# @property
# def plots(self) -> Dict[str, go.Figure]:
# plots: Dict[str, go.Figure] = {}
#
# if isinstance(self.metadata, BinaryClassificationMetadata):
# prediction_column_name = self.metadata.predicted_probability_column_name
# plots[f'{prediction_column_name}_drift_statistic'] = _plot_prediction_drift(
# self.data, self.metadata, 'statistic'
# )
# plots[f'{prediction_column_name}_drift_p_value'] = _plot_prediction_drift(
# self.data, self.metadata, 'p_value'
# )
# plots[f'{prediction_column_name}_distribution'] = _plot_prediction_distribution(
# data=self._analysis_data, drift_data=self.data, metadata=self.metadata
# )
# elif isinstance(self.metadata, MulticlassClassificationMetadata):
# for class_label, prediction_column_name in self.metadata.predicted_probabilities_column_names.items():
# plots[f'{prediction_column_name}_drift_statistic'] = _plot_prediction_drift(
# self.data, self.metadata, 'statistic', class_label
# )
# plots[f'{prediction_column_name}_drift_p_value'] = _plot_prediction_drift(
# self.data, self.metadata, 'p_value', class_label
# )
# plots[f'{prediction_column_name}_distribution'] = _plot_prediction_distribution(
# data=self._analysis_data, drift_data=self.data, metadata=self.metadata, class_label=class_label
# )
# elif isinstance(self.metadata, RegressionMetadata):
# prediction_column_name = self.metadata.prediction_column_name
# plots[f'{prediction_column_name}_drift_statistic'] = _plot_prediction_drift(
# self.data, self.metadata, 'statistic'
# )
# plots[f'{prediction_column_name}_drift_p_value'] = _plot_prediction_drift(
# self.data, self.metadata, 'p_value'
# )
# plots[f'{prediction_column_name}_distribution'] = _plot_prediction_distribution(
# data=self._analysis_data, drift_data=self.data, metadata=self.metadata
# )
#
# return plots
def _get_drift_column_names_for_feature(feature_column_name: str, feature_type: str, metric: str) -> Tuple:
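"""Maps a column name, feature type and metric onto the corresponding result column names and plot labels.
Returns a tuple ``(metric_column_name, metric_label, threshold_column_name, drift_column_name, title)``.
Illustrative example for a continuous column using the KS statistic:
>>> _get_drift_column_names_for_feature('y_pred_proba', 'continuous', 'statistic')
('y_pred_proba_dstat', 'KS statistic', None, 'y_pred_proba_alert', 'KS statistic for y_pred_proba')
"""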
metric_column_name, metric_label, threshold_column_name = None, None, None
if metric == 'statistic':
if feature_type == 'categorical':
metric_column_name = f'{feature_column_name}_chi2'
metric_label = 'Chi-square statistic'
elif feature_type == 'continuous':
metric_column_name = f'{feature_column_name}_dstat'
metric_label = 'KS statistic'
threshold_column_name = None
elif metric == 'p_value':
metric_column_name = f'{feature_column_name}_p_value'
metric_label = 'P-value'
threshold_column_name = f'{feature_column_name}_threshold'
drift_column_name = f'{feature_column_name}_alert'
title = f'{metric_label} for {feature_column_name}'
return metric_column_name, metric_label, threshold_column_name, drift_column_name, title
def _plot_prediction_drift(
data: pd.DataFrame,
calculator,
y_pred: str,
plot_reference: bool,
metric: str = 'statistic',
) -> go.Figure:
"""Renders a line plot of the drift metric for a given feature."""
(
metric_column_name,
metric_label,
threshold_column_name,
drift_column_name,
title,
) = _get_drift_column_names_for_feature(
y_pred,
'continuous' if calculator.problem_type == ProblemType.REGRESSION else 'categorical',
metric,
)
plot_period_separator = plot_reference
data['period'] = 'analysis'
if plot_reference:
reference_results = calculator.previous_reference_results.copy()
reference_results['period'] = 'reference'
data = pd.concat([reference_results, data], ignore_index=True)
fig = _step_plot(
table=data,
metric_column_name=metric_column_name,
chunk_column_name=CHUNK_KEY_COLUMN_NAME,
drift_column_name=drift_column_name,
lower_threshold_column_name=threshold_column_name,
hover_labels=['Chunk', metric_label, 'Target data'],
title=title,
y_axis_title=metric_label,
v_line_separating_analysis_period=plot_period_separator,
statistically_significant_column_name=drift_column_name,
)
return fig
def _plot_prediction_distribution(
data: pd.DataFrame,
drift_data: pd.DataFrame,
calculator,
plot_reference: bool,
) -> go.Figure:
"""Plots the data distribution and associated drift for each chunk of the model predictions.
Parameters
----------
data : pd.DataFrame
The original model inputs and outputs
drift_data : pd.DataFrame
The results of the drift calculation
calculator:
The calculator that produced these results
plot_reference: bool
Whether to include the reference period in the plot
Returns
-------
fig: plotly.graph_objects.Figure
A visualization of the data distribution and drift, rendered as a stacked bar plot for classification
predictions or a joy plot for regression predictions.
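Examples
--------
Illustrative sketch only (assumes a fitted ``StatisticalOutputDriftCalculator`` named ``calc`` and its
drift results ``results``, as produced in the ``UnivariateDriftResult.plot`` example above); in normal
use you would call ``results.plot(kind='prediction_distribution')`` instead of this private helper.
>>> fig = _plot_prediction_distribution(
>>>     data=calc.previous_analysis_data,
>>>     drift_data=results.data,
>>>     calculator=calc,
>>>     plot_reference=True,
>>> )
>>> fig.show()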
"""
clip: Optional[Tuple[int, int]] = None
if calculator.problem_type in [ProblemType.CLASSIFICATION_BINARY, ProblemType.CLASSIFICATION_MULTICLASS]:
clip = (0, 1)
prediction_column_name = calculator.y_pred
axis_title = f'{prediction_column_name}'
drift_column_name = f'{prediction_column_name}_alert'
title = f'Distribution over time for {prediction_column_name}'
drift_data['period'] = 'analysis'
data['period'] = 'analysis'
feature_table = _create_feature_table(calculator.chunker.split(data, calculator.timestamp_column_name))
if plot_reference:
reference_drift = calculator.previous_reference_results
if reference_drift is None:
raise RuntimeError(
f"could not plot distribution for '{prediction_column_name}': "
f"calculator is missing reference results\n{calculator}"
)
reference_drift = reference_drift.copy()
reference_drift['period'] = 'reference'
drift_data = pd.concat([reference_drift, drift_data], ignore_index=True)
reference_feature_table = _create_feature_table(
calculator.chunker.split(calculator.previous_reference_data, calculator.timestamp_column_name)
)
feature_table = pd.concat([reference_feature_table, feature_table], ignore_index=True)
if calculator.problem_type in [ProblemType.CLASSIFICATION_BINARY, ProblemType.CLASSIFICATION_MULTICLASS]:
fig = _stacked_bar_plot(
feature_table=feature_table,
drift_table=drift_data,
chunk_column_name='key',
drift_column_name=drift_column_name,
feature_column_name=prediction_column_name,
yaxis_title=axis_title,
title=title,
)
elif calculator.problem_type == ProblemType.REGRESSION:
fig = _joy_plot(
feature_table=feature_table,
drift_table=drift_data,
chunk_column_name=CHUNK_KEY_COLUMN_NAME,
drift_column_name=drift_column_name,
feature_column_name=prediction_column_name,
x_axis_title=axis_title,
post_kde_clip=clip,
title=title,
style='vertical',
)
else:
raise RuntimeError(
f"dtype '{data[prediction_column_name].dtype}' is not supported yet.\nPlease convert to one of "
f"the following dtypes: ['object', 'string', 'category', 'bool'] for categorical data\n"
f"or ['float64', 'int64'] for continuous data."
)
return fig
def _plot_score_drift(
data: pd.DataFrame,
calculator,
plot_reference: bool,
metric: str = 'statistic',
class_label: Optional[str] = None,
) -> go.Figure:
"""Renders a line plot of the drift metric for a given feature."""
if calculator.problem_type == ProblemType.REGRESSION:
raise InvalidArgumentsException(
"plot of kind 'score_drift' don't support "
"regression problems. Please use the 'prediction_distribution' plot."
)
# Multiclass: a class label is required to select the matching score column
if calculator.problem_type == ProblemType.CLASSIFICATION_MULTICLASS:
if class_label is None:
raise InvalidArgumentsException(
"a class label is required when plotting multiclass model"
"outputs.\nPlease provide one using the 'class_label' parameter."
)
if class_label not in calculator.y_pred_proba:
raise InvalidArgumentsException(
f"class label '{class_label}' was not found in configured "
f"model outputs {calculator.y_pred_proba}.\n"
f"Please provide a value that is present in the model outputs."
)
output_column_name = calculator.y_pred_proba[class_label]
elif calculator.problem_type == ProblemType.CLASSIFICATION_BINARY:
output_column_name = calculator.y_pred_proba
else:
raise InvalidArgumentsException(
f"parameter 'y_pred_proba' is of type '{type(calculator.y_pred_proba)}' "
"but should be of type 'Union[str, Dict[str, str].'"
)
(
metric_column_name,
metric_label,
threshold_column_name,
drift_column_name,
title,
) = _get_drift_column_names_for_feature(
output_column_name,
'continuous' if _column_is_continuous(calculator.previous_analysis_data[output_column_name]) else 'categorical',
metric,
)
plot_period_separator = plot_reference
data['period'] = 'analysis'
if plot_reference:
reference_results = calculator.previous_reference_results.copy()
reference_results['period'] = 'reference'
data = pd.concat([reference_results, data], ignore_index=True)
fig = _step_plot(
table=data,
metric_column_name=metric_column_name,
chunk_column_name=CHUNK_KEY_COLUMN_NAME,
drift_column_name=drift_column_name,
lower_threshold_column_name=threshold_column_name,
hover_labels=['Chunk', metric_label, 'Target data'],
title=title,
y_axis_title=metric_label,
v_line_separating_analysis_period=plot_period_separator,
statistically_significant_column_name=drift_column_name,
)
return fig
def _plot_score_distribution(
data: pd.DataFrame, drift_data: pd.DataFrame, calculator, plot_reference: bool, class_label: Optional[str] = None
) -> go.Figure:
"""Plots the data distribution and associated drift for each chunk of the model predictions.
Parameters
----------
data : pd.DataFrame
The original model inputs and outputs
drift_data : pd.DataFrame
The results of the drift calculation
calculator:
The calculator that produced these results
plot_reference: bool
Whether to include the reference period in the plot
class_label: str, default=None
The label of the class to plot the prediction distribution for. Only required in case of multiclass models.
Returns
-------
fig: plotly.graph_objects.Figure
A visualization of the data distribution and drift, rendered as a stacked bar plot for categorical scores
or a joy plot for continuous scores.
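Examples
--------
Illustrative sketch only (assumes a fitted ``StatisticalOutputDriftCalculator`` named ``calc`` and its
drift results ``results``; pass ``class_label`` as well for multiclass models); in normal use you would
call ``results.plot(kind='score_distribution')`` instead of this private helper.
>>> fig = _plot_score_distribution(
>>>     data=calc.previous_analysis_data,
>>>     drift_data=results.data,
>>>     calculator=calc,
>>>     plot_reference=True,
>>> )
>>> fig.show()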
"""
if calculator.problem_type == ProblemType.REGRESSION:
raise InvalidArgumentsException(
"plot of kind 'score_distribution' don't support "
"regression problems. Please use the 'prediction_distribution' plot."
)
clip: Optional[Tuple[int, int]] = None
# Multiclass: a class label is required to select the matching score column
if calculator.problem_type == ProblemType.CLASSIFICATION_MULTICLASS:
if class_label is None:
raise InvalidArgumentsException(
"a class label is required when plotting multiclass model"
"outputs.\nPlease provide one using the 'class_label' parameter."
)
if class_label not in calculator.y_pred_proba:
raise InvalidArgumentsException(
f"class label '{class_label}' was not found in configured "
f"model outputs {calculator.y_pred_proba}.\n"
f"Please provide a value that is present in the model outputs."
)
output_column_name = calculator.y_pred_proba[class_label]
clip = (0, 1)
# elif isinstance(calculator.y_pred_proba, str):
elif calculator.problem_type == ProblemType.CLASSIFICATION_BINARY:
output_column_name = calculator.y_pred_proba
clip = (0, 1)
else:
raise InvalidArgumentsException(
f"parameter 'y_pred_proba' is of type '{type(calculator.y_pred_proba)}' "
"but should be of type 'Union[str, Dict[str, str].'"
)
axis_title = f'{output_column_name}'
drift_column_name = f'{output_column_name}_alert'
title = f'Distribution over time for {output_column_name}'
drift_data['period'] = 'analysis'
data['period'] = 'analysis'
feature_table = _create_feature_table(calculator.chunker.split(data, calculator.timestamp_column_name))
if plot_reference:
reference_drift = calculator.previous_reference_results
if reference_drift is None:
raise RuntimeError(
f"could not plot score distribution for '{output_column_name}': "
f"calculator is missing reference results\n{calculator}"
)
reference_drift = reference_drift.copy()
reference_drift['period'] = 'reference'
drift_data = pd.concat([reference_drift, drift_data], ignore_index=True)
reference_feature_table = _create_feature_table(
calculator.chunker.split(calculator.previous_reference_data, calculator.timestamp_column_name)
)
feature_table = pd.concat([reference_feature_table, feature_table], ignore_index=True)
if _column_is_categorical(data[output_column_name]):
fig = _stacked_bar_plot(
feature_table=feature_table,
drift_table=drift_data,
chunk_column_name='key',
drift_column_name=drift_column_name,
feature_column_name=output_column_name,
yaxis_title=axis_title,
title=title,
)
elif _column_is_continuous(data[output_column_name]):
fig = _joy_plot(
feature_table=feature_table,
drift_table=drift_data,
chunk_column_name=CHUNK_KEY_COLUMN_NAME,
drift_column_name=drift_column_name,
feature_column_name=output_column_name,
x_axis_title=axis_title,
post_kde_clip=clip,
title=title,
style='vertical',
)
else:
raise RuntimeError(
f"dtype '{data[output_column_name].dtype}' is not supported yet.\nPlease convert to one of "
f"the following dtypes: ['object', 'string', 'category', 'bool'] for categorical data\n"
f"or ['float64', 'int64'] for continuous data."
)
return fig
def _create_feature_table(
data: List[Chunk],
) -> pd.DataFrame:
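"""Concatenates the data of all chunks into a single frame, tagging each row with its chunk key.
Illustrative sketch: for two chunks with keys '[0:4999]' and '[5000:9999]' the result contains all rows
of both chunks plus an extra 'key' column holding the originating chunk key.
"""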
return pd.concat([chunk.data.assign(key=chunk.key) for chunk in data])