Source code for nannyml.drift.model_inputs.univariate.statistical.results

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0

"""Module containing univariate statistical drift calculation results and associated plotting implementations."""

from typing import List, Tuple

import pandas as pd
import plotly.graph_objects as go

from nannyml.chunk import Chunk
from nannyml.drift.base import DriftResult
from nannyml.exceptions import InvalidArgumentsException
from nannyml.metadata import BinaryClassificationMetadata, MulticlassClassificationMetadata
from nannyml.metadata.base import Feature, FeatureType, ModelMetadata
from nannyml.plots import CHUNK_KEY_COLUMN_NAME
from nannyml.plots._joy_plot import _joy_plot
from nannyml.plots._stacked_bar_plot import _stacked_bar_plot
from nannyml.plots._step_plot import _step_plot


class UnivariateDriftResult(DriftResult):
    """Contains the univariate statistical drift calculation results and provides additional plotting functionality."""

    # TODO: this is messing up functionality in scratch files (sets runtime class to DataFrame). Check this!
    def __repr__(self):
        """Represent the DriftResults object as the data it contains."""
        return self.data.__repr__()

    def plot(
        self,
        kind: str = 'feature_drift',
        metric: str = 'statistic',
        feature_label: str = None,
        feature_column_name: str = None,
        class_label: str = None,
        *args,
        **kwargs,
    ) -> go.Figure:
        """Renders a plot for the univariate statistical drift calculation results.

        Given either a feature label (check ``model_metadata.features``) or the actual feature column name
        and a metric (one of either ``statistic`` or ``p_value``) this function will render a line plot displaying
        the metric value for the selected feature per chunk.
        Chunks are set on a time-based X-axis by using the period containing their observations.
        Chunks of different partitions (``reference`` and ``analysis``) are represented using different colors and
        a vertical separation if the drift results contain multiple partitions.

        The different plot kinds that are available:

        - ``feature_drift``: plots drift per :class:`~nannyml.chunk.Chunk` for a single feature of a chunked data set.
        - ``prediction_drift``: plots drift per :class:`~nannyml.chunk.Chunk` for the predictions of a chunked
          data set.
        - ``feature_distribution``: plots feature distribution per :class:`~nannyml.chunk.Chunk`.
          Joyplot for continuous features, stacked bar charts for categorical features.
        - ``prediction_distribution``: plots the prediction distribution per :class:`~nannyml.chunk.Chunk` of a
          chunked data set as a joyplot.

        Parameters
        ----------
        kind: str, default=``feature_drift``
            The kind of plot you want to have. Value must be one of ``feature_drift``, ``prediction_drift``,
            ``feature_distribution`` or ``prediction_distribution``.
        metric : str, default=``statistic``
            The metric to plot. Value must be one of ``statistic`` or ``p_value``.
            Only used by the ``feature_drift`` and ``prediction_drift`` kinds.
        feature_label : str
            Feature label identifying a feature according to the preset model metadata. The function will raise an
            exception when no feature of that label was found in the metadata.
            Either ``feature_label`` or ``feature_column_name`` should be specified.
        feature_column_name : str
            Column name identifying a feature according to the preset model metadata. The function will raise an
            exception when no feature using that column name was found in the metadata.
            Either ``feature_column_name`` or ``feature_label`` should be specified.
        class_label: str, default=None
            The label of the class to plot the prediction distribution for.
            Only required in case of multiclass models.
        *args, **kwargs
            Only forwarded to the ``feature_drift`` plot; ignored by the other plot kinds.

        Returns
        -------
        fig: plotly.graph_objects.Figure
            A ``Figure`` object containing the requested drift plot. Can be saved to disk or shown rendered on screen
            using ``fig.show()``.

        Raises
        ------
        InvalidArgumentsException
            When ``kind`` is not one of the supported plot kinds.

        Examples
        --------
        >>> import nannyml as nml
        >>> ref_df, ana_df, _ = nml.load_synthetic_binary_classification_dataset()
        >>> metadata = nml.extract_metadata(ref_df, model_type=nml.ModelType.CLASSIFICATION_BINARY)
        >>> drift_calc = nml.UnivariateStatisticalDriftCalculator(model_metadata=metadata, chunk_period='W')
        >>> drift_calc.fit(ref_df)
        >>> drifts = drift_calc.calculate(ana_df)
        >>> # loop over all features and plot the feature drift and feature distribution for each
        >>> for f in metadata.features:
        ...     drifts.plot(kind='feature_drift', feature_label=f.label).show()
        ...     drifts.plot(kind='feature_distribution', feature_label=f.label).show()
        """
        if kind == 'feature_drift':
            feature = _get_feature(self.metadata, feature_label, feature_column_name)
            # Unpack the extra arguments; passing the tuple/dict themselves as
            # positionals was never the intent.
            return _plot_feature_drift(self.data, feature, metric, *args, **kwargs)
        elif kind == 'prediction_drift':
            return _plot_prediction_drift(self.data, self.metadata, metric, class_label)
        elif kind == 'feature_distribution':
            feature = _get_feature(self.metadata, feature_label, feature_column_name)
            return _plot_feature_distribution(
                data=self._analysis_data,
                drift_data=self.data,
                feature=feature,
            )
        elif kind == 'prediction_distribution':
            return _plot_prediction_distribution(
                data=self._analysis_data,
                drift_data=self.data,
                metadata=self.metadata,
                class_label=class_label,
            )
        else:
            raise InvalidArgumentsException(
                f"unknown plot kind '{kind}'. "
                f"Please provide one of: ['feature_drift', 'feature_distribution', "
                f"'prediction_drift', 'prediction_distribution']."
            )
def _get_feature(model_metadata: ModelMetadata, feature_label: str = None, feature_column_name: str = None) -> Feature:
    """Look up a feature in the model metadata by label or, failing that, by column name.

    Parameters
    ----------
    model_metadata : ModelMetadata
        The metadata to search; its ``feature`` method is used for the lookup.
    feature_label : str, default=None
        Label of the feature. Takes precedence over ``feature_column_name`` when truthy.
    feature_column_name : str, default=None
        Column name of the feature. Used when no (truthy) label is given.

    Returns
    -------
    feature : Feature
        The matching feature.

    Raises
    ------
    InvalidArgumentsException
        When neither identifier is given or when the lookup returns ``None``.
    """
    if feature_label is None and feature_column_name is None:
        raise InvalidArgumentsException("one of 'feature_label' or 'feature_column_name' should be provided.")

    feature = (
        model_metadata.feature(feature=feature_label)
        if feature_label
        else model_metadata.feature(column=feature_column_name)
    )

    if feature is None:
        raise InvalidArgumentsException(f'could not find a feature {feature_label or feature_column_name}')

    return feature


def _get_drift_column_names_for_feature(feature_column_name: str, feature_type: FeatureType, metric: str) -> Tuple:
    """Derive the drift result column names and plot labels for a column and metric.

    Parameters
    ----------
    feature_column_name : str
        Name of the column the drift results were calculated for.
    feature_type : FeatureType
        Determines which test statistic column is used when ``metric`` is ``statistic``.
    metric : str
        One of ``statistic`` or ``p_value``.

    Returns
    -------
    Tuple
        ``(metric_column_name, metric_label, threshold_column_name, drift_column_name, title)``.
        ``threshold_column_name`` is ``None`` for the ``statistic`` metric.

    Raises
    ------
    InvalidArgumentsException
        When ``metric`` is not one of the supported values.
    """
    metric_column_name, metric_label, threshold_column_name = None, None, None

    if metric == 'statistic':
        if feature_type == FeatureType.CATEGORICAL:
            metric_column_name = f'{feature_column_name}_chi2'
            metric_label = 'Chi-square statistic'
        elif feature_type == FeatureType.CONTINUOUS:
            metric_column_name = f'{feature_column_name}_dstat'
            metric_label = 'KS statistic'
        threshold_column_name = None
    elif metric == 'p_value':
        metric_column_name = f'{feature_column_name}_p_value'
        metric_label = 'P-value'
        threshold_column_name = f'{feature_column_name}_threshold'
    else:
        # Fail early with a clear message instead of passing ``None`` column
        # names and a "None for ..." title on to the plotting code.
        raise InvalidArgumentsException(
            f"unknown metric '{metric}'. Please provide one of: ['statistic', 'p_value']."
        )

    drift_column_name = f'{feature_column_name}_alert'
    title = f'{metric_label} for {feature_column_name}'

    return metric_column_name, metric_label, threshold_column_name, drift_column_name, title


def _drift_step_plot(data: pd.DataFrame, column_name: str, feature_type: FeatureType, metric: str) -> go.Figure:
    """Render the step plot shared by the feature drift and prediction drift plots."""
    (
        metric_column_name,
        metric_label,
        threshold_column_name,
        drift_column_name,
        title,
    ) = _get_drift_column_names_for_feature(column_name, feature_type, metric)

    # NOTE(review): ``DataFrame.value_counts()`` counts distinct *rows*, so this is
    # true whenever the results hold more than one distinct row. Presumably the
    # intent was to count distinct partitions - confirm against the result schema.
    plot_partition_separator = len(data.value_counts()) > 1

    return _step_plot(
        table=data,
        metric_column_name=metric_column_name,
        chunk_column_name=CHUNK_KEY_COLUMN_NAME,
        drift_column_name=drift_column_name,
        threshold_column_name=threshold_column_name,
        hover_labels=['Chunk', metric_label, 'Target data'],
        title=title,
        y_axis_title=metric_label,
        v_line_separating_analysis_period=plot_partition_separator,
        statistically_significant_column_name=drift_column_name,
    )


def _plot_feature_drift(data: pd.DataFrame, feature: Feature, metric: str = 'statistic', *args, **kwargs) -> go.Figure:
    """Renders a line plot for a chosen metric of the statistical drift calculation results of a feature.

    Additional positional and keyword arguments are accepted but not used.
    """
    return _drift_step_plot(data, feature.column_name, feature.feature_type, metric)


def _plot_prediction_drift(
    data: pd.DataFrame, metadata: ModelMetadata, metric: str = 'statistic', class_label: str = None
) -> go.Figure:
    """Renders a line plot of the drift metric for the model's predicted probabilities.

    Raises
    ------
    InvalidArgumentsException
        For multiclass metadata, when ``class_label`` is missing or unknown.
    NotImplementedError
        When the metadata is neither binary nor multiclass classification metadata.
    """
    if isinstance(metadata, BinaryClassificationMetadata):
        prediction_column_name = metadata.predicted_probability_column_name
    elif isinstance(metadata, MulticlassClassificationMetadata):
        if not class_label:
            raise InvalidArgumentsException("value for 'class_label' must be set when plotting for multiclass models")
        if class_label not in metadata.predicted_probabilities_column_names:
            raise InvalidArgumentsException(f"no classes found named '{class_label}'. Please review the given value.")
        prediction_column_name = metadata.predicted_probabilities_column_names[class_label]
    else:
        raise NotImplementedError

    # Predicted probabilities are treated as a continuous column.
    return _drift_step_plot(data, prediction_column_name, FeatureType.CONTINUOUS, metric)


def _plot_feature_distribution(data: List[Chunk], drift_data: pd.DataFrame, feature: Feature) -> go.Figure:
    """Plots the data distribution and associated drift for each chunk of a given feature.

    Dispatches on the feature type: continuous and categorical features each have their own plot.

    Raises
    ------
    InvalidArgumentsException
        When the feature type is neither continuous nor categorical.
    """
    if feature.feature_type is FeatureType.CONTINUOUS:
        return _plot_continuous_feature_distribution(data, drift_data, feature)
    elif feature.feature_type is FeatureType.CATEGORICAL:
        return _plot_categorical_feature_distribution(data, drift_data, feature)

    # Previously fell through and returned ``None`` despite the return annotation.
    raise InvalidArgumentsException(
        f"cannot plot the distribution of feature '{feature.column_name}': "
        f"unsupported feature type '{feature.feature_type}'."
    )


def _plot_continuous_feature_distribution(data: List[Chunk], drift_data: pd.DataFrame, feature: Feature) -> go.Figure:
    """Plots the data distribution and associated drift for each chunk of a given continuous feature."""
    feature_column_name = feature.column_name
    x_axis_title = f'{feature_column_name}'
    drift_column_name = f'{feature_column_name}_alert'
    title = f'Distribution over time for {feature.label}'

    fig = _joy_plot(
        feature_table=_create_feature_table(data=data),
        drift_table=drift_data,
        chunk_column_name=CHUNK_KEY_COLUMN_NAME,
        drift_column_name=drift_column_name,
        feature_column_name=feature_column_name,
        x_axis_title=x_axis_title,
        title=title,
        style='vertical',
    )
    return fig


def _plot_categorical_feature_distribution(data: List[Chunk], drift_data: pd.DataFrame, feature: Feature) -> go.Figure:
    """Plots the data distribution and associated drift for each chunk of a given categorical feature."""
    feature_column_name = feature.column_name
    yaxis_title = f'{feature_column_name}'
    drift_column_name = f'{feature_column_name}_alert'
    title = f'Distribution over time for {feature.label}'

    fig = _stacked_bar_plot(
        feature_table=_create_feature_table(data=data),
        drift_table=drift_data,
        chunk_column_name=CHUNK_KEY_COLUMN_NAME,
        drift_column_name=drift_column_name,
        feature_column_name=feature_column_name,
        yaxis_title=yaxis_title,
        title=title,
    )
    return fig


def _plot_prediction_distribution(
    data: List[Chunk], drift_data: pd.DataFrame, metadata: ModelMetadata, class_label: str = None
) -> go.Figure:
    """Plots the data distribution and associated drift for each chunk of the model predictions.

    Parameters
    ----------
    data : List[Chunk]
        The chunks holding the original model inputs and outputs
    drift_data : pd.DataFrame
        The results of the drift calculation
    metadata: ModelMetadata
        The metadata for the monitored model
    class_label: str, default=None
        The label of the class to plot the prediction distribution for. Only required in case of multiclass models.

    Returns
    -------
    fig: plotly.graph_objects.Figure
        A visualization of the data distribution and drift using joy-plots.

    Raises
    ------
    InvalidArgumentsException
        For multiclass metadata, when ``class_label`` is missing or unknown.
    NotImplementedError
        When the metadata is neither binary nor multiclass classification metadata.
    """
    if isinstance(metadata, BinaryClassificationMetadata):
        predicted_probability_column_name = metadata.predicted_probability_column_name
    elif isinstance(metadata, MulticlassClassificationMetadata):
        if not class_label:
            raise InvalidArgumentsException("value for 'class_label' must be set when plotting for multiclass models")
        # Same validation as the prediction drift plot, instead of a bare KeyError.
        if class_label not in metadata.predicted_probabilities_column_names:
            raise InvalidArgumentsException(f"no classes found named '{class_label}'. Please review the given value.")
        predicted_probability_column_name = metadata.predicted_probabilities_column_names[class_label]
    else:
        raise NotImplementedError

    x_axis_title = f'{predicted_probability_column_name}'
    drift_column_name = f'{predicted_probability_column_name}_alert'
    title = f'Distribution over time for {predicted_probability_column_name}'

    fig = _joy_plot(
        feature_table=_create_feature_table(data=data),
        drift_table=drift_data,
        chunk_column_name=CHUNK_KEY_COLUMN_NAME,
        drift_column_name=drift_column_name,
        feature_column_name=predicted_probability_column_name,
        x_axis_title=x_axis_title,
        post_kde_clip=(0, 1),  # clip the density estimate to the [0, 1] range
        title=title,
        style='vertical',
    )
    return fig


def _create_feature_table(
    data: List[Chunk],
) -> pd.DataFrame:
    """Concatenate the data of all chunks into one frame, adding each chunk's key as a ``key`` column."""
    return pd.concat([chunk.data.assign(key=chunk.key) for chunk in data])