# Author: Niels Nuyttens <niels@nannyml.com>
#
# License: Apache Software License 2.0
"""Module containing univariate statistical drift calculation results and associated plotting implementations."""
from typing import List, Optional, Tuple
import pandas as pd
import plotly.graph_objects as go
from nannyml._typing import ProblemType
from nannyml.base import AbstractCalculator, AbstractCalculatorResult, _column_is_categorical, _column_is_continuous
from nannyml.chunk import Chunk
from nannyml.exceptions import InvalidArgumentsException
from nannyml.plots import CHUNK_KEY_COLUMN_NAME
from nannyml.plots._joy_plot import _joy_plot
from nannyml.plots._stacked_bar_plot import _stacked_bar_plot
from nannyml.plots._step_plot import _step_plot
"""Contains the results of the model output statistical drift calculation and provides plotting functionality."""
class UnivariateDriftResult(AbstractCalculatorResult):
"""Contains the results of the model output statistical drift calculation and provides plotting functionality."""
def __init__(self, results_data: pd.DataFrame, calculator: AbstractCalculator):
super().__init__(results_data)
from .calculator import StatisticalOutputDriftCalculator
if not isinstance(calculator, StatisticalOutputDriftCalculator):
raise RuntimeError(
f"{calculator.__class__.__name__} is not an instance of type " f"UnivariateStatisticalDriftCalculator"
)
self.calculator = calculator
@property
def calculator_name(self) -> str:
return 'univariate_statistical_output_drift'
def plot(
self,
kind: str = 'prediction_drift',
metric: str = 'statistic',
class_label: Optional[str] = None,
plot_reference: bool = False,
*args,
**kwargs,
) -> Optional[go.Figure]:
"""Renders plots for metrics returned by the univariate statistical drift calculator.
For both the model predictions and the model output scores you can render the statistic value or p-value
as a step plot, or create a distribution plot. For multiclass use cases a ``class_label`` parameter is
required when rendering model output (score) plots.
Select a plot using the ``kind`` parameter:
- ``prediction_drift``
plots the drift metric per :class:`~nannyml.chunk.Chunk` for the model predictions ``y_pred``.
- ``prediction_distribution``
plots the distribution per :class:`~nannyml.chunk.Chunk` for the model predictions ``y_pred``.
- ``score_drift``
plots the drift metric per :class:`~nannyml.chunk.Chunk` for the model outputs ``y_pred_proba``.
- ``score_distribution``
plots the distribution per :class:`~nannyml.chunk.Chunk` for the model outputs ``y_pred_proba``.
Parameters
----------
kind: str, default=``prediction_drift``
The kind of plot to render. Allowed values are ``prediction_drift``,
``prediction_distribution``, ``score_drift`` and ``score_distribution``.
metric: str, default=``statistic``
The metric to plot. Allowed values are ``statistic`` and ``p_value``.
Not applicable when plotting distributions.
class_label: str, default=None
The label of the class to plot the prediction distribution for.
Only required for multiclass use cases.
plot_reference: bool, default=False
Whether to include the reference period in the plot.
Returns
-------
fig: :class:`plotly.graph_objs._figure.Figure`
A :class:`~plotly.graph_objs._figure.Figure` object containing the requested drift plot.
Can be saved to disk using the :meth:`~plotly.graph_objs._figure.Figure.write_image` method
or shown rendered on screen using the :meth:`~plotly.graph_objs._figure.Figure.show` method.
Examples
--------
>>> import nannyml as nml
>>>
>>> reference_df, analysis_df, _ = nml.load_synthetic_binary_classification_dataset()
>>>
>>> calc = nml.StatisticalOutputDriftCalculator(
>>> y_pred_proba='y_pred_proba',
>>> y_pred='y_pred',
>>> timestamp_column_name='timestamp'
>>> )
>>> calc.fit(reference_df)
>>> results = calc.calculate(analysis_df)
>>>
>>> print(results.data) # check the numbers
key start_index ... y_pred_proba_alert y_pred_proba_threshold
0 [0:4999] 0 ... True 0.05
1 [5000:9999] 5000 ... False 0.05
2 [10000:14999] 10000 ... False 0.05
3 [15000:19999] 15000 ... False 0.05
4 [20000:24999] 20000 ... False 0.05
5 [25000:29999] 25000 ... True 0.05
6 [30000:34999] 30000 ... True 0.05
7 [35000:39999] 35000 ... True 0.05
8 [40000:44999] 40000 ... True 0.05
9 [45000:49999] 45000 ... True 0.05
>>>
>>> results.plot(kind='score_drift', plot_reference=True).show()
>>> results.plot(kind='score_distribution', plot_reference=True).show()
>>> results.plot(kind='prediction_drift', plot_reference=True).show()
>>> results.plot(kind='prediction_distribution', plot_reference=True).show()
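For multiclass use cases, pass the class label of the score column to plot (the label shown here is
hypothetical; use one of the keys configured in ``y_pred_proba``):
>>> results.plot(kind='score_drift', class_label='class_1', plot_reference=True).show()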
"""
if kind == 'prediction_drift':
return _plot_prediction_drift(
self.data,
self.calculator,
self.calculator.y_pred,
plot_reference,
metric,
)
elif kind == 'prediction_distribution':
return _plot_prediction_distribution(
data=self.calculator.previous_analysis_data,
drift_data=self.data,
calculator=self.calculator,
plot_reference=plot_reference,
)
elif kind == 'score_drift':
return _plot_score_drift(self.data, self.calculator, plot_reference, metric, class_label)
elif kind == 'score_distribution':
return _plot_score_distribution(
data=self.calculator.previous_analysis_data,
drift_data=self.data,
calculator=self.calculator,
plot_reference=plot_reference,
class_label=class_label,
)
else:
raise InvalidArgumentsException(
f"unknown plot kind '{kind}'. "
"Please provide on of: ['prediction_drift', 'prediction_distribution', 'score_drift',"
"'score_distribution']."
)
# @property
# def plots(self) -> Dict[str, go.Figure]:
# plots: Dict[str, go.Figure] = {}
#
# if isinstance(self.metadata, BinaryClassificationMetadata):
# prediction_column_name = self.metadata.predicted_probability_column_name
# plots[f'{prediction_column_name}_drift_statistic'] = _plot_prediction_drift(
# self.data, self.metadata, 'statistic'
# )
# plots[f'{prediction_column_name}_drift_p_value'] = _plot_prediction_drift(
# self.data, self.metadata, 'p_value'
# )
# plots[f'{prediction_column_name}_distribution'] = _plot_prediction_distribution(
# data=self._analysis_data, drift_data=self.data, metadata=self.metadata
# )
# elif isinstance(self.metadata, MulticlassClassificationMetadata):
# for class_label, prediction_column_name in self.metadata.predicted_probabilities_column_names.items():
# plots[f'{prediction_column_name}_drift_statistic'] = _plot_prediction_drift(
# self.data, self.metadata, 'statistic', class_label
# )
# plots[f'{prediction_column_name}_drift_p_value'] = _plot_prediction_drift(
# self.data, self.metadata, 'p_value', class_label
# )
# plots[f'{prediction_column_name}_distribution'] = _plot_prediction_distribution(
# data=self._analysis_data, drift_data=self.data, metadata=self.metadata, class_label=class_label
# )
# elif isinstance(self.metadata, RegressionMetadata):
# prediction_column_name = self.metadata.prediction_column_name
# plots[f'{prediction_column_name}_drift_statistic'] = _plot_prediction_drift(
# self.data, self.metadata, 'statistic'
# )
# plots[f'{prediction_column_name}_drift_p_value'] = _plot_prediction_drift(
# self.data, self.metadata, 'p_value'
# )
# plots[f'{prediction_column_name}_distribution'] = _plot_prediction_distribution(
# data=self._analysis_data, drift_data=self.data, metadata=self.metadata
# )
#
# return plots
def _get_drift_column_names_for_feature(feature_column_name: str, feature_type: str, metric: str) -> Tuple:
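"""Maps a column name, feature type and metric onto the corresponding result column names and plot labels.
Returns a tuple ``(metric_column_name, metric_label, threshold_column_name, drift_column_name, title)``.
Illustrative example for a continuous column using the KS statistic:
>>> _get_drift_column_names_for_feature('y_pred_proba', 'continuous', 'statistic')
('y_pred_proba_dstat', 'KS statistic', None, 'y_pred_proba_alert', 'KS statistic for y_pred_proba')
"""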
metric_column_name, metric_label, threshold_column_name = None, None, None
if metric == 'statistic':
if feature_type == 'categorical':
metric_column_name = f'{feature_column_name}_chi2'
metric_label = 'Chi-square statistic'
elif feature_type == 'continuous':
metric_column_name = f'{feature_column_name}_dstat'
metric_label = 'KS statistic'
threshold_column_name = None
elif metric == 'p_value':
metric_column_name = f'{feature_column_name}_p_value'
metric_label = 'P-value'
threshold_column_name = f'{feature_column_name}_threshold'
drift_column_name = f'{feature_column_name}_alert'
title = f'{metric_label} for {feature_column_name}'
return metric_column_name, metric_label, threshold_column_name, drift_column_name, title
def _plot_prediction_drift(
data: pd.DataFrame,
calculator,
y_pred: str,
plot_reference: bool,
metric: str = 'statistic',
) -> go.Figure:
"""Renders a line plot of the drift metric for a given feature."""
(
metric_column_name,
metric_label,
threshold_column_name,
drift_column_name,
title,
) = _get_drift_column_names_for_feature(
y_pred,
'continuous' if calculator.problem_type == ProblemType.REGRESSION else 'categorical',
metric,
)
plot_period_separator = plot_reference
data['period'] = 'analysis'
if plot_reference:
reference_results = calculator.previous_reference_results.copy()
reference_results['period'] = 'reference'
data = pd.concat([reference_results, data], ignore_index=True)
fig = _step_plot(
table=data,
metric_column_name=metric_column_name,
chunk_column_name=CHUNK_KEY_COLUMN_NAME,
drift_column_name=drift_column_name,
lower_threshold_column_name=threshold_column_name,
hover_labels=['Chunk', metric_label, 'Target data'],
title=title,
y_axis_title=metric_label,
v_line_separating_analysis_period=plot_period_separator,
statistically_significant_column_name=drift_column_name,
)
return fig
def _plot_prediction_distribution(
data: pd.DataFrame,
drift_data: pd.DataFrame,
calculator,
plot_reference: bool,
) -> go.Figure:
"""Plots the data distribution and associated drift for each chunk of the model predictions.
Parameters
----------
data : pd.DataFrame
The original model inputs and outputs
drift_data : pd.DataFrame
The results of the drift calculation
calculator:
The calculator that produced these results
plot_reference: bool
Whether to include the reference period in the plot
Returns
-------
fig: plotly.graph_objects.Figure
A visualization of the data distribution and drift, rendered as a stacked bar plot for classification
predictions or a joy plot for regression predictions.
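Examples
--------
Illustrative sketch only (assumes a fitted ``StatisticalOutputDriftCalculator`` named ``calc`` and its
drift results ``results``, as produced in the ``UnivariateDriftResult.plot`` example above); in normal
use you would call ``results.plot(kind='prediction_distribution')`` instead of this private helper.
>>> fig = _plot_prediction_distribution(
>>>     data=calc.previous_analysis_data,
>>>     drift_data=results.data,
>>>     calculator=calc,
>>>     plot_reference=True,
>>> )
>>> fig.show()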
"""
clip: Optional[Tuple[int, int]] = None
if calculator.problem_type in [ProblemType.CLASSIFICATION_BINARY, ProblemType.CLASSIFICATION_MULTICLASS]:
clip = (0, 1)
prediction_column_name = calculator.y_pred
axis_title = f'{prediction_column_name}'
drift_column_name = f'{prediction_column_name}_alert'
title = f'Distribution over time for {prediction_column_name}'
drift_data['period'] = 'analysis'
data['period'] = 'analysis'
feature_table = _create_feature_table(calculator.chunker.split(data, calculator.timestamp_column_name))
if plot_reference:
reference_drift = calculator.previous_reference_results
if reference_drift is None:
raise RuntimeError(
f"could not plot distribution for '{prediction_column_name}': "
f"calculator is missing reference results\n{calculator}"
)
reference_drift = reference_drift.copy()
reference_drift['period'] = 'reference'
drift_data = pd.concat([reference_drift, drift_data], ignore_index=True)
reference_feature_table = _create_feature_table(
calculator.chunker.split(calculator.previous_reference_data, calculator.timestamp_column_name)
)
feature_table = pd.concat([reference_feature_table, feature_table], ignore_index=True)
if calculator.problem_type in [ProblemType.CLASSIFICATION_BINARY, ProblemType.CLASSIFICATION_MULTICLASS]:
fig = _stacked_bar_plot(
feature_table=feature_table,
drift_table=drift_data,
chunk_column_name='key',
drift_column_name=drift_column_name,
feature_column_name=prediction_column_name,
yaxis_title=axis_title,
title=title,
)
elif calculator.problem_type == ProblemType.REGRESSION:
fig = _joy_plot(
feature_table=feature_table,
drift_table=drift_data,
chunk_column_name=CHUNK_KEY_COLUMN_NAME,
drift_column_name=drift_column_name,
feature_column_name=prediction_column_name,
x_axis_title=axis_title,
post_kde_clip=clip,
title=title,
style='vertical',
)
else:
raise RuntimeError(
f"dtype '{data[prediction_column_name].dtype}' is not supported yet.\nPlease convert to one of "
f"the following dtypes: ['object', 'string', 'category', 'bool'] for categorical data\n"
f"or ['float64', 'int64'] for continuous data."
)
return fig
def _plot_score_drift(
data: pd.DataFrame,
calculator,
plot_reference: bool,
metric: str = 'statistic',
class_label: Optional[str] = None,
) -> go.Figure:
"""Renders a line plot of the drift metric for a given feature."""
if calculator.problem_type == ProblemType.REGRESSION:
raise InvalidArgumentsException(
"plot of kind 'score_drift' don't support "
"regression problems. Please use the 'prediction_distribution' plot."
)
# Multiclass: a class label is required to select the matching score column
if calculator.problem_type == ProblemType.CLASSIFICATION_MULTICLASS:
if class_label is None:
raise InvalidArgumentsException(
"a class label is required when plotting multiclass model"
"outputs.\nPlease provide one using the 'class_label' parameter."
)
if class_label not in calculator.y_pred_proba:
raise InvalidArgumentsException(
f"class label '{class_label}' was not found in configured "
f"model outputs {calculator.y_pred_proba}.\n"
f"Please provide a value that is present in the model outputs."
)
output_column_name = calculator.y_pred_proba[class_label]
elif calculator.problem_type == ProblemType.CLASSIFICATION_BINARY:
output_column_name = calculator.y_pred_proba
else:
raise InvalidArgumentsException(
f"parameter 'y_pred_proba' is of type '{type(calculator.y_pred_proba)}' "
"but should be of type 'Union[str, Dict[str, str].'"
)
(
metric_column_name,
metric_label,
threshold_column_name,
drift_column_name,
title,
) = _get_drift_column_names_for_feature(
output_column_name,
'continuous' if _column_is_continuous(calculator.previous_analysis_data[output_column_name]) else 'categorical',
metric,
)
plot_period_separator = plot_reference
data['period'] = 'analysis'
if plot_reference:
reference_results = calculator.previous_reference_results.copy()
reference_results['period'] = 'reference'
data = pd.concat([reference_results, data], ignore_index=True)
fig = _step_plot(
table=data,
metric_column_name=metric_column_name,
chunk_column_name=CHUNK_KEY_COLUMN_NAME,
drift_column_name=drift_column_name,
lower_threshold_column_name=threshold_column_name,
hover_labels=['Chunk', metric_label, 'Target data'],
title=title,
y_axis_title=metric_label,
v_line_separating_analysis_period=plot_period_separator,
statistically_significant_column_name=drift_column_name,
)
return fig
def _plot_score_distribution(
data: pd.DataFrame, drift_data: pd.DataFrame, calculator, plot_reference: bool, class_label: Optional[str] = None
) -> go.Figure:
"""Plots the data distribution and associated drift for each chunk of the model predictions.
Parameters
----------
data : pd.DataFrame
The original model inputs and outputs
drift_data : pd.DataFrame
The results of the drift calculation
calculator:
The calculator that produced these results
plot_reference: bool
Whether to include the reference period in the plot
class_label: str, default=None
The label of the class to plot the prediction distribution for. Only required in case of multiclass models.
Returns
-------
fig: plotly.graph_objects.Figure
A visualization of the data distribution and drift, rendered as a stacked bar plot for categorical scores
or a joy plot for continuous scores.
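Examples
--------
Illustrative sketch only (assumes a fitted ``StatisticalOutputDriftCalculator`` named ``calc`` and its
drift results ``results``; pass ``class_label`` as well for multiclass models); in normal use you would
call ``results.plot(kind='score_distribution')`` instead of this private helper.
>>> fig = _plot_score_distribution(
>>>     data=calc.previous_analysis_data,
>>>     drift_data=results.data,
>>>     calculator=calc,
>>>     plot_reference=True,
>>> )
>>> fig.show()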
"""
if calculator.problem_type == ProblemType.REGRESSION:
raise InvalidArgumentsException(
"plot of kind 'score_distribution' don't support "
"regression problems. Please use the 'prediction_distribution' plot."
)
clip: Optional[Tuple[int, int]] = None
# Multiclass: a class label is required to select the matching score column
if calculator.problem_type == ProblemType.CLASSIFICATION_MULTICLASS:
if class_label is None:
raise InvalidArgumentsException(
"a class label is required when plotting multiclass model"
"outputs.\nPlease provide one using the 'class_label' parameter."
)
if class_label not in calculator.y_pred_proba:
raise InvalidArgumentsException(
f"class label '{class_label}' was not found in configured "
f"model outputs {calculator.y_pred_proba}.\n"
f"Please provide a value that is present in the model outputs."
)
output_column_name = calculator.y_pred_proba[class_label]
clip = (0, 1)
# elif isinstance(calculator.y_pred_proba, str):
elif calculator.problem_type == ProblemType.CLASSIFICATION_BINARY:
output_column_name = calculator.y_pred_proba
clip = (0, 1)
else:
raise InvalidArgumentsException(
f"parameter 'y_pred_proba' is of type '{type(calculator.y_pred_proba)}' "
"but should be of type 'Union[str, Dict[str, str].'"
)
axis_title = f'{output_column_name}'
drift_column_name = f'{output_column_name}_alert'
title = f'Distribution over time for {output_column_name}'
drift_data['period'] = 'analysis'
data['period'] = 'analysis'
feature_table = _create_feature_table(calculator.chunker.split(data, calculator.timestamp_column_name))
if plot_reference:
reference_drift = calculator.previous_reference_results
if reference_drift is None:
raise RuntimeError(
f"could not plot score distribution for '{output_column_name}': "
f"calculator is missing reference results\n{calculator}"
)
reference_drift = reference_drift.copy()
reference_drift['period'] = 'reference'
drift_data = pd.concat([reference_drift, drift_data], ignore_index=True)
reference_feature_table = _create_feature_table(
calculator.chunker.split(calculator.previous_reference_data, calculator.timestamp_column_name)
)
feature_table = pd.concat([reference_feature_table, feature_table], ignore_index=True)
if _column_is_categorical(data[output_column_name]):
fig = _stacked_bar_plot(
feature_table=feature_table,
drift_table=drift_data,
chunk_column_name='key',
drift_column_name=drift_column_name,
feature_column_name=output_column_name,
yaxis_title=axis_title,
title=title,
)
elif _column_is_continuous(data[output_column_name]):
fig = _joy_plot(
feature_table=feature_table,
drift_table=drift_data,
chunk_column_name=CHUNK_KEY_COLUMN_NAME,
drift_column_name=drift_column_name,
feature_column_name=output_column_name,
x_axis_title=axis_title,
post_kde_clip=clip,
title=title,
style='vertical',
)
else:
raise RuntimeError(
f"dtype '{data[output_column_name].dtype}' is not supported yet.\nPlease convert to one of "
f"the following dtypes: ['object', 'string', 'category', 'bool'] for categorical data\n"
f"or ['float64', 'int64'] for continuous data."
)
return fig
def _create_feature_table(
data: List[Chunk],
) -> pd.DataFrame:
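"""Concatenates the data of all chunks into a single frame, tagging each row with its chunk key.
Illustrative sketch: for two chunks with keys '[0:4999]' and '[5000:9999]' the result contains all rows
of both chunks plus an extra 'key' column holding the originating chunk key.
"""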
return pd.concat([chunk.data.assign(key=chunk.key) for chunk in data])