Source code for nannyml.drift.model_inputs.univariate.statistical.results

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0

"""Module containing univariate statistical drift calculation results and associated plotting implementations."""

from typing import List, Optional, Tuple

import pandas as pd
import plotly.graph_objects as go

from nannyml.chunk import Chunk
from nannyml.drift.base import DriftResult
from nannyml.exceptions import InvalidArgumentsException
from nannyml.metadata import BinaryClassificationMetadata, MulticlassClassificationMetadata, RegressionMetadata
from nannyml.metadata.base import Feature, FeatureType, ModelMetadata
from nannyml.plots import CHUNK_KEY_COLUMN_NAME
from nannyml.plots._joy_plot import _joy_plot
from nannyml.plots._stacked_bar_plot import _stacked_bar_plot
from nannyml.plots._step_plot import _step_plot


class UnivariateDriftResult(DriftResult):
    """Contains the univariate statistical drift calculation results and provides additional plotting functionality."""

    # TODO: this is messing up functionality in scratch files (sets runtime class to DataFrame). Check this!

    def __repr__(self):
        """Represent the DriftResults object as the data it contains."""
        return self.data.__repr__()
    def plot(
        self,
        kind: str = 'feature_drift',
        metric: str = 'statistic',
        feature_label: str = None,
        feature_column_name: str = None,
        class_label: str = None,
        *args,
        **kwargs,
    ) -> go.Figure:
        """Renders a line plot for a chosen metric of the statistical drift calculation results.

        Given either a feature label (check ``model_metadata.features``) or the actual feature column name
        and a metric (one of either ``statistic`` or ``p_value``), this function will render a line plot displaying
        the metric value for the selected feature per chunk.
        Chunks are set on a time-based X-axis by using the period containing their observations.
        Chunks of different partitions (``reference`` and ``analysis``) are represented using different colors and
        a vertical separation if the drift results contain multiple partitions.

        The different plot kinds that are available:

        - ``feature_drift``: plots drift per :class:`~nannyml.chunk.Chunk` for a single feature of a chunked data set.
        - ``prediction_drift``: plots drift per :class:`~nannyml.chunk.Chunk` for the predictions of a chunked data set.
        - ``feature_distribution``: plots the feature distribution per :class:`~nannyml.chunk.Chunk`.
          Joyplot for continuous features, stacked bar charts for categorical features.
        - ``prediction_distribution``: plots the prediction distribution per :class:`~nannyml.chunk.Chunk` of a chunked
          data set as a joyplot.

        Parameters
        ----------
        kind: str, default=``feature_drift``
            The kind of plot to render. Value must be one of ``feature_drift``, ``prediction_drift``,
            ``feature_distribution`` or ``prediction_distribution``.
        metric : str, default=``statistic``
            The metric to plot. Value must be one of ``statistic`` or ``p_value``.
        feature_label : str
            Feature label identifying a feature according to the preset model metadata. The function will raise an
            exception when no feature of that label was found in the metadata.
            Either ``feature_label`` or ``feature_column_name`` should be specified.
        feature_column_name : str
            Column name identifying a feature according to the preset model metadata. The function will raise an
            exception when no feature using that column name was found in the metadata.
            Either ``feature_column_name`` or ``feature_label`` should be specified.
        class_label: str, default=None
            The label of the class to plot the prediction distribution for.
            Only required in case of multiclass models.

        Returns
        -------
        fig: plotly.graph_objects.Figure
            A ``Figure`` object containing the requested drift plot. Can be saved to disk or shown rendered on screen
            using ``fig.show()``.

        Examples
        --------
        >>> import nannyml as nml
        >>> ref_df, ana_df, _ = nml.load_synthetic_binary_classification_dataset()
        >>> metadata = nml.extract_metadata(ref_df, model_type=nml.ModelType.CLASSIFICATION_BINARY)
        >>> drift_calc = nml.UnivariateStatisticalDriftCalculator(model_metadata=metadata, chunk_period='W')
        >>> drift_calc.fit(ref_df)
        >>> drifts = drift_calc.calculate(ana_df)
        >>> # loop over all features and plot the feature drift and feature distribution for each
        >>> for f in metadata.features:
        >>>     drifts.plot(kind='feature_drift', feature_label=f.label).show()
        >>>     drifts.plot(kind='feature_distribution', feature_label=f.label).show()
        """
        if kind == 'feature_drift':
            feature = _get_feature(self.metadata, feature_label, feature_column_name)
            return _plot_feature_drift(self.data, feature, metric, *args, **kwargs)
        elif kind == 'prediction_drift':
            return _plot_prediction_drift(self.data, self.metadata, metric, class_label)
        elif kind == 'feature_distribution':
            feature = _get_feature(self.metadata, feature_label, feature_column_name)
            return _plot_feature_distribution(
                data=self._analysis_data,
                drift_data=self.data,
                feature=feature,
            )
        elif kind == 'prediction_distribution':
            return _plot_prediction_distribution(
                data=self._analysis_data, drift_data=self.data, metadata=self.metadata, class_label=class_label
            )
        else:
            raise InvalidArgumentsException(
                f"unknown plot kind '{kind}'. "
                f"Please provide one of: ['feature_drift', 'feature_distribution', "
                f"'prediction_drift', 'prediction_distribution']."
            )
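

# Usage sketch (illustrative only, not executed as part of this module): the ``plot``
# method above also covers prediction-level plots. Assuming ``drifts`` is a
# UnivariateDriftResult obtained as in the docstring example, and -- for multiclass
# models -- that '<class_label>' is replaced with a label present in
# ``metadata.predicted_probabilities_column_names``:
#
# >>> drifts.plot(kind='prediction_drift', metric='p_value').show()
# >>> drifts.plot(kind='prediction_distribution').show()
# >>> # multiclass models additionally require a class label:
# >>> drifts.plot(kind='prediction_distribution', class_label='<class_label>').show()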


def _get_feature(model_metadata: ModelMetadata, feature_label: str = None, feature_column_name: str = None) -> Feature:
    if feature_label is None and feature_column_name is None:
        raise InvalidArgumentsException("one of 'feature_label' or 'feature_column_name' should be provided.")

    feature = (
        model_metadata.feature(feature=feature_label)
        if feature_label
        else model_metadata.feature(column=feature_column_name)
    )

    if feature is None:
        raise InvalidArgumentsException(f'could not find a feature {feature_label or feature_column_name}')

    return feature


def _get_drift_column_names_for_feature(feature_column_name: str, feature_type: FeatureType, metric: str) -> Tuple:
    metric_column_name, metric_label, threshold_column_name = None, None, None
    if metric == 'statistic':
        if feature_type == FeatureType.CATEGORICAL:
            metric_column_name = f'{feature_column_name}_chi2'
            metric_label = 'Chi-square statistic'
        elif feature_type == FeatureType.CONTINUOUS:
            metric_column_name = f'{feature_column_name}_dstat'
            metric_label = 'KS statistic'
        threshold_column_name = None
    elif metric == 'p_value':
        metric_column_name = f'{feature_column_name}_p_value'
        metric_label = 'P-value'
        threshold_column_name = f'{feature_column_name}_threshold'

    drift_column_name = f'{feature_column_name}_alert'
    title = f'{metric_label} for {feature_column_name}'

    return metric_column_name, metric_label, threshold_column_name, drift_column_name, title


def _plot_feature_drift(data: pd.DataFrame, feature: Feature, metric: str = 'statistic', *args, **kwargs) -> go.Figure:
    """Renders a line plot for a chosen metric of the statistical drift calculation results."""
    (
        metric_column_name,
        metric_label,
        threshold_column_name,
        drift_column_name,
        title,
    ) = _get_drift_column_names_for_feature(feature.column_name, feature.feature_type, metric)

    plot_partition_separator = len(data.value_counts()) > 1

    fig = _step_plot(
        table=data,
        metric_column_name=metric_column_name,
        chunk_column_name=CHUNK_KEY_COLUMN_NAME,
        drift_column_name=drift_column_name,
        threshold_column_name=threshold_column_name,
        hover_labels=['Chunk', metric_label, 'Target data'],
        title=title,
        y_axis_title=metric_label,
        v_line_separating_analysis_period=plot_partition_separator,
        statistically_significant_column_name=drift_column_name,
    )
    return fig


def _plot_prediction_drift(
    data: pd.DataFrame, metadata: ModelMetadata, metric: str = 'statistic', class_label: str = None
) -> go.Figure:
    """Renders a line plot of the drift metric for the model predictions."""
    if isinstance(metadata, BinaryClassificationMetadata):
        prediction_column_name = metadata.predicted_probability_column_name
    elif isinstance(metadata, MulticlassClassificationMetadata):
        if not class_label or class_label == "":
            raise InvalidArgumentsException("value for 'class_label' must be set when plotting for multiclass models")
        if class_label not in metadata.predicted_probabilities_column_names:
            raise InvalidArgumentsException(f"no classes found named '{class_label}'. Please review the given value.")
        prediction_column_name = metadata.predicted_probabilities_column_names[class_label]
    elif isinstance(metadata, RegressionMetadata):
        prediction_column_name = metadata.prediction_column_name
    else:
        raise NotImplementedError

    (
        metric_column_name,
        metric_label,
        threshold_column_name,
        drift_column_name,
        title,
    ) = _get_drift_column_names_for_feature(prediction_column_name, FeatureType.CONTINUOUS, metric)

    plot_partition_separator = len(data.value_counts()) > 1

    fig = _step_plot(
        table=data,
        metric_column_name=metric_column_name,
        chunk_column_name=CHUNK_KEY_COLUMN_NAME,
        drift_column_name=drift_column_name,
        threshold_column_name=threshold_column_name,
        hover_labels=['Chunk', metric_label, 'Target data'],
        title=title,
        y_axis_title=metric_label,
        v_line_separating_analysis_period=plot_partition_separator,
        statistically_significant_column_name=drift_column_name,
    )
    return fig


def _plot_feature_distribution(data: List[Chunk], drift_data: pd.DataFrame, feature: Feature) -> go.Figure:
    """Plots the data distribution and associated drift for each chunk of a given feature."""
    if feature.feature_type is FeatureType.CONTINUOUS:
        return _plot_continuous_feature_distribution(data, drift_data, feature)
    elif feature.feature_type is FeatureType.CATEGORICAL:
        return _plot_categorical_feature_distribution(data, drift_data, feature)


def _plot_continuous_feature_distribution(data: List[Chunk], drift_data: pd.DataFrame, feature: Feature) -> go.Figure:
    """Plots the data distribution and associated drift for each chunk of a given continuous feature."""
    feature_column_name = feature.column_name
    x_axis_title = f'{feature_column_name}'
    drift_column_name = f'{feature_column_name}_alert'
    title = f'Distribution over time for {feature.label}'

    fig = _joy_plot(
        feature_table=_create_feature_table(data=data),
        drift_table=drift_data,
        chunk_column_name=CHUNK_KEY_COLUMN_NAME,
        drift_column_name=drift_column_name,
        feature_column_name=feature_column_name,
        x_axis_title=x_axis_title,
        title=title,
        style='vertical',
    )
    return fig


def _plot_categorical_feature_distribution(data: List[Chunk], drift_data: pd.DataFrame, feature: Feature) -> go.Figure:
    """Plots the data distribution and associated drift for each chunk of a given categorical feature."""
    feature_column_name = feature.column_name
    yaxis_title = f'{feature_column_name}'
    drift_column_name = f'{feature_column_name}_alert'
    title = f'Distribution over time for {feature.label}'

    fig = _stacked_bar_plot(
        feature_table=_create_feature_table(data=data),
        drift_table=drift_data,
        chunk_column_name=CHUNK_KEY_COLUMN_NAME,
        drift_column_name=drift_column_name,
        feature_column_name=feature_column_name,
        yaxis_title=yaxis_title,
        title=title,
    )
    return fig


def _plot_prediction_distribution(
    data: List[Chunk], drift_data: pd.DataFrame, metadata: ModelMetadata, class_label: str = None
) -> go.Figure:
    """Plots the data distribution and associated drift for each chunk of the model predictions.

    Parameters
    ----------
    data : List[Chunk]
        The original model inputs and outputs, grouped into chunks.
    drift_data : pd.DataFrame
        The results of the drift calculation.
    metadata: ModelMetadata
        The metadata for the monitored model.
    class_label: str, default=None
        The label of the class to plot the prediction distribution for. Only required in case of multiclass models.

    Returns
    -------
    fig: plotly.graph_objects.Figure
        A visualization of the data distribution and drift using joy-plots.
    """
    clip: Optional[Tuple[int, int]] = None
    if isinstance(metadata, BinaryClassificationMetadata):
        prediction_column_name = metadata.predicted_probability_column_name
        clip = (0, 1)
    elif isinstance(metadata, MulticlassClassificationMetadata):
        if not class_label or class_label == "":
            raise InvalidArgumentsException("value for 'class_label' must be set when plotting for multiclass models")
        prediction_column_name = metadata.predicted_probabilities_column_names[class_label]
        clip = (0, 1)
    elif isinstance(metadata, RegressionMetadata):
        prediction_column_name = metadata.prediction_column_name
    else:
        raise NotImplementedError

    x_axis_title = f'{prediction_column_name}'
    drift_column_name = f'{prediction_column_name}_alert'
    title = f'Distribution over time for {prediction_column_name}'

    fig = _joy_plot(
        feature_table=_create_feature_table(data=data),
        drift_table=drift_data,
        chunk_column_name=CHUNK_KEY_COLUMN_NAME,
        drift_column_name=drift_column_name,
        feature_column_name=prediction_column_name,
        x_axis_title=x_axis_title,
        post_kde_clip=clip,
        title=title,
        style='vertical',
    )
    return fig


def _create_feature_table(
    data: List[Chunk],
) -> pd.DataFrame:
    return pd.concat([chunk.data.assign(key=chunk.key) for chunk in data])