# Source code for nannyml.plots.blueprints.distributions

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0
import math
from typing import Any, Dict, Optional, Union

import numpy as np
import pandas as pd

from nannyml.chunk import Chunker
from nannyml.exceptions import InvalidArgumentsException
from nannyml.plots import Colors
from nannyml.plots.components import Figure, Hover
from nannyml.plots.components.joy_plot import alert as joy_alert
from nannyml.plots.components.joy_plot import calculate_chunk_distributions, joy
from nannyml.plots.components.stacked_bar_plot import alert as stacked_bar_alert
from nannyml.plots.components.stacked_bar_plot import calculate_value_counts, stacked_bar
from nannyml.plots.util import is_time_based_x_axis


# Intended to work only with Univariate Drift Result for now...
def plot_distributions(
    result,
    reference_data: pd.DataFrame,  # TODO: move distribution calculations to calculator run
    analysis_data: pd.DataFrame,  # TODO: move distribution calculations to calculator run
    chunker: Chunker,  # TODO: move distribution calculations to calculator run
    title: Optional[str] = 'Column distributions',
    figure: Optional[Figure] = None,
    x_axis_time_title: str = 'Time',
    x_axis_chunk_title: str = 'Chunk',
    y_axis_title: str = 'Values',
    figure_args: Optional[Dict[str, Any]] = None,
    subplot_title_format: str = '<b>{display_names[0]}</b> distribution (alerts for <b>{display_names[1]})</b>',
    number_of_columns: Optional[int] = None,
) -> Figure:
    """Plot one distribution subplot per key of a univariate drift result.

    Every key of ``result`` yields a ``(column_name, method)`` pair. Pairs that
    are categorical are drawn as stacked bars, pairs that are continuous are
    drawn as a joy plot. Subplots are laid out row by row on a grid that is
    ``number_of_columns`` wide.

    Parameters
    ----------
    result:
        A ``nannyml.drift.univariate.Result`` instance.
    reference_data: pd.DataFrame
        Reference data, indexed by column name (and by the result's timestamp
        column name when the x-axis is time based).
    analysis_data: pd.DataFrame
        Analysis data, indexed the same way as ``reference_data``.
    chunker: Chunker
        Forwarded to the distribution / value-count calculations.
    title: Optional[str]
        Title of the figure that is created when ``figure`` is ``None``.
    figure: Optional[Figure]
        Existing figure to draw on. When ``None`` a new one is created.
    x_axis_time_title: str
        X-axis title used for a new figure when the result's chunk dates are time based.
    x_axis_chunk_title: str
        X-axis title used for a new figure otherwise.
    y_axis_title: str
        Y-axis title used for a new figure.
    figure_args: Optional[Dict[str, Any]]
        Extra keyword arguments passed to ``Figure`` when a new figure is created.
    subplot_title_format: str
        Format string for the subplot titles; it is formatted with
        ``display_names=key.display_names`` for every key.
    number_of_columns: Optional[int]
        Width of the subplot grid. Defaults to a single column.

    Returns
    -------
    Figure
        The figure holding all subplots.

    Raises
    ------
    InvalidArgumentsException
        When ``result`` is not a univariate drift ``Result``, when the result
        has no keys to plot, when ``number_of_columns`` is smaller than 1, or
        when a key is neither categorical nor continuous.
    """
    # Imported here rather than at module level, as in the original code.
    from nannyml.drift.univariate import Result

    # Explicit check instead of `assert`: assertions are stripped under `python -O`.
    if not isinstance(result, Result):
        raise InvalidArgumentsException(
            f"can not plot distributions for a result of type '{type(result).__name__}', "
            "expected a univariate drift 'Result'."
        )

    keys = result.keys()
    number_of_plots = len(keys)
    if number_of_plots == 0:
        # Previously this surfaced as a ZeroDivisionError when computing the grid size.
        raise InvalidArgumentsException("can not plot distributions since the result contains nothing to plot.")

    if number_of_columns is None:
        number_of_columns = 1
    elif number_of_columns < 1:
        raise InvalidArgumentsException(
            f"'number_of_columns' should be at least 1 but got '{number_of_columns}'."
        )
    number_of_rows = math.ceil(number_of_plots / number_of_columns)

    if figure_args is None:
        figure_args = {}

    if figure is None:
        figure = Figure(
            **dict(
                title=title,
                x_axis_title=x_axis_time_title
                if is_time_based_x_axis(result.chunk_start_dates, result.chunk_end_dates)
                else x_axis_chunk_title,
                y_axis_title=y_axis_title,
                legend=dict(traceorder="grouped", itemclick=False, itemdoubleclick=False),
                height=number_of_plots * 500 / number_of_columns,
                subplot_args=dict(
                    cols=number_of_columns,
                    rows=number_of_rows,
                    subplot_titles=[subplot_title_format.format(display_names=key.display_names) for key in keys],
                ),
                **figure_args,
            )
        )

    reference_result = result.filter(period='reference')
    analysis_result = result.filter(period='analysis')

    # None of the values below depend on the key being plotted, so compute them once.
    analysis_chunk_start_dates = analysis_result.chunk_start_dates
    analysis_chunk_end_dates = analysis_result.chunk_end_dates
    x_axis_is_time_based = is_time_based_x_axis(analysis_chunk_start_dates, analysis_chunk_end_dates)

    if x_axis_is_time_based:
        reference_timestamps = reference_data[result.timestamp_column_name]
        analysis_timestamps = analysis_data[result.timestamp_column_name]
    else:
        reference_timestamps = None
        analysis_timestamps = None

    for idx, key in enumerate(keys):
        # Fill the grid row by row; subplot coordinates are 1-based.
        row_offset, col_offset = divmod(idx, number_of_columns)
        column_name, method = key.properties

        # Pick the plotting helper first so that an unsupported key fails with
        # InvalidArgumentsException before any data column is looked up.
        extra_kwargs: Dict[str, Any] = {}
        if column_name in result.categorical_column_names and method in result.categorical_method_names:
            plot_column = _plot_stacked_bar
            extra_kwargs['column_name'] = column_name  # only the stacked bar helper takes the column name
        elif column_name in result.continuous_column_names and method in result.continuous_method_names:
            plot_column = _plot_joyplot
        else:
            raise InvalidArgumentsException(
                f"can not plot column '{column_name}' since the column is not in "
                f"the continuous or categorical columns lists."
            )

        figure = plot_column(
            figure=figure,
            row=row_offset + 1,
            col=col_offset + 1,
            chunker=chunker,
            metric_display_name=method,
            reference_data=reference_data[column_name],
            reference_data_timestamps=reference_timestamps,
            reference_alerts=reference_result.alerts(key),
            reference_chunk_keys=reference_result.chunk_keys,
            reference_chunk_periods=reference_result.chunk_periods,
            reference_chunk_indices=reference_result.chunk_indices,
            reference_chunk_start_dates=reference_result.chunk_start_dates,
            reference_chunk_end_dates=reference_result.chunk_end_dates,
            analysis_data=analysis_data[column_name],
            analysis_data_timestamps=analysis_timestamps,
            analysis_alerts=analysis_result.alerts(key),
            analysis_chunk_keys=analysis_result.chunk_keys,
            analysis_chunk_periods=analysis_result.chunk_periods,
            analysis_chunk_indices=analysis_result.chunk_indices,
            analysis_chunk_start_dates=analysis_chunk_start_dates,
            analysis_chunk_end_dates=analysis_chunk_end_dates,
            **extra_kwargs,
        )

    return figure
def _plot_joyplot(
    figure: Figure,
    metric_display_name: str,
    reference_data: Union[np.ndarray, pd.Series],
    reference_data_timestamps: Union[np.ndarray, pd.Series],
    analysis_data: Union[np.ndarray, pd.Series],
    analysis_data_timestamps: Union[np.ndarray, pd.Series],
    chunker: Chunker,
    reference_alerts: Optional[Union[np.ndarray, pd.Series]] = None,
    reference_chunk_keys: Optional[Union[np.ndarray, pd.Series]] = None,
    reference_chunk_periods: Optional[Union[np.ndarray, pd.Series]] = None,
    reference_chunk_indices: Optional[Union[np.ndarray, pd.Series]] = None,
    reference_chunk_start_dates: Optional[Union[np.ndarray, pd.Series]] = None,
    reference_chunk_end_dates: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_alerts: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_chunk_keys: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_chunk_periods: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_chunk_indices: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_chunk_start_dates: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_chunk_end_dates: Optional[Union[np.ndarray, pd.Series]] = None,
    row: Optional[int] = None,
    col: Optional[int] = None,
    hover: Optional[Hover] = None,
) -> Figure:
    """Draw the per-chunk distributions of a continuous column as a joy plot.

    Reference distributions (when reference chunk indices are given) are drawn
    first, followed by the analysis distributions and, when ``analysis_alerts``
    is given, the alert overlay. When ``row`` and ``col`` are both set the
    traces are added to that subplot.

    ``metric_display_name``, ``reference_alerts``, the ``*_chunk_keys`` and
    ``*_chunk_periods`` arguments and ``hover`` are accepted but not used by
    this function.

    Returns
    -------
    Figure
        The figure the traces were added to.
    """
    is_subplot = row is not None and col is not None
    subplot_args = dict(row=row, col=col) if is_subplot else None

    has_reference_results = reference_chunk_indices is not None and len(reference_chunk_indices) > 0

    if figure is None:
        figure = Figure(title='continuous distribution', x_axis_title='time', y_axis_title='value', height=500)

    if has_reference_results:
        # TODO: move distribution calculations to calculator run
        reference_distributions = calculate_chunk_distributions(
            data=reference_data,
            chunker=chunker,
            data_periods=pd.Series('reference', index=range(len(reference_data))),
            timestamps=reference_data_timestamps,
        )

        figure = joy(
            fig=figure,
            data_distributions=reference_distributions,
            chunk_indices=reference_chunk_indices,
            chunk_start_dates=reference_chunk_start_dates,
            chunk_end_dates=reference_chunk_end_dates,
            name='Reference',
            color=Colors.BLUE_SKY_CRAYOLA,
            subplot_args=subplot_args,
        )

        # Already guaranteed by `has_reference_results`; kept to narrow the Optional type.
        assert reference_chunk_indices is not None
        # Shift the analysis chunks so they are placed after the last reference chunk.
        analysis_chunk_indices = analysis_chunk_indices + (max(reference_chunk_indices) + 1)

    # TODO: move distribution calculations to calculator run
    analysis_distributions = calculate_chunk_distributions(
        data=analysis_data,
        chunker=chunker,
        data_periods=pd.Series('analysis', index=range(len(analysis_data))),
        timestamps=analysis_data_timestamps,
    )

    figure = joy(
        fig=figure,
        data_distributions=analysis_distributions,
        chunk_indices=analysis_chunk_indices,
        chunk_start_dates=analysis_chunk_start_dates,
        chunk_end_dates=analysis_chunk_end_dates,
        name='Analysis',
        color=Colors.INDIGO_PERSIAN,
        subplot_args=subplot_args,
    )

    if analysis_alerts is not None:
        figure = joy_alert(
            fig=figure,
            alerts=analysis_alerts,
            data_distributions=analysis_distributions,
            color=Colors.RED_IMPERIAL,
            name='Alerts',
            chunk_indices=analysis_chunk_indices,
            chunk_start_dates=analysis_chunk_start_dates,
            chunk_end_dates=analysis_chunk_end_dates,
            subplot_args=subplot_args,
        )

    return figure


def _plot_stacked_bar(
    figure: Figure,
    column_name: str,
    metric_display_name: str,
    reference_data: Union[np.ndarray, pd.Series],
    reference_data_timestamps: Union[np.ndarray, pd.Series],
    analysis_data: Union[np.ndarray, pd.Series],
    analysis_data_timestamps: Union[np.ndarray, pd.Series],
    chunker: Chunker,
    reference_alerts: Optional[Union[np.ndarray, pd.Series]] = None,
    reference_chunk_keys: Optional[Union[np.ndarray, pd.Series]] = None,
    reference_chunk_periods: Optional[Union[np.ndarray, pd.Series]] = None,
    reference_chunk_indices: Optional[Union[np.ndarray, pd.Series]] = None,
    reference_chunk_start_dates: Optional[Union[np.ndarray, pd.Series]] = None,
    reference_chunk_end_dates: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_alerts: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_chunk_keys: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_chunk_periods: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_chunk_indices: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_chunk_start_dates: Optional[Union[np.ndarray, pd.Series]] = None,
    analysis_chunk_end_dates: Optional[Union[np.ndarray, pd.Series]] = None,
    row: Optional[int] = None,
    col: Optional[int] = None,
    hover: Optional[Hover] = None,
) -> Figure:
    """Draw the per-chunk value counts of a categorical column as stacked bars.

    Reference bars (when reference chunk indices are given) are drawn first,
    followed by the analysis bars and, when ``analysis_alerts`` is given, the
    alert overlay. Value counts are computed with at most 5 categories and
    missing values labelled ``'Missing'``. When ``row`` and ``col`` are both
    set the traces are added to that subplot.

    ``metric_display_name``, ``reference_alerts``, the ``*_chunk_keys`` and
    ``*_chunk_periods`` arguments and ``hover`` are accepted but not used by
    this function.

    Returns
    -------
    Figure
        The figure the traces were added to.
    """
    is_subplot = row is not None and col is not None
    subplot_args = dict(row=row, col=col) if is_subplot else None

    has_reference_results = reference_chunk_indices is not None and len(reference_chunk_indices) > 0

    if figure is None:
        # NOTE(review): the title reads 'continuous distribution' although this helper plots
        # categorical data -- looks like a copy-paste from _plot_joyplot; confirm before changing.
        figure = Figure(title='continuous distribution', x_axis_title='time', y_axis_title='value', height=500)

    # Restyle the axes of the targeted subplot and re-apply the figure-level axis titles.
    figure.update_xaxes(
        dict(mirror=False, showline=False),
        overwrite=True,
        matches='x',
        title=figure.layout.xaxis.title,
        row=row,
        col=col,
    )
    figure.update_yaxes(
        dict(mirror=False, showline=False), overwrite=True, title=figure.layout.yaxis.title, row=row, col=col
    )

    if has_reference_results:
        reference_value_counts = calculate_value_counts(
            data=reference_data,
            chunker=chunker,
            timestamps=reference_data_timestamps,
            max_number_of_categories=5,
            missing_category_label='Missing',
        )

        figure = stacked_bar(
            figure=figure,
            stacked_bar_table=reference_value_counts,
            color=Colors.BLUE_SKY_CRAYOLA,
            chunk_indices=reference_chunk_indices,
            chunk_start_dates=reference_chunk_start_dates,
            chunk_end_dates=reference_chunk_end_dates,
            annotation='Reference',
            showlegend=True,
            legendgrouptitle_text=f'<b>{column_name}</b>',
            legendgroup=column_name,
            subplot_args=subplot_args,
        )

        # Already guaranteed by `has_reference_results`; kept to narrow the Optional type.
        assert reference_chunk_indices is not None
        # Shift the analysis chunks so they are placed after the last reference chunk.
        analysis_chunk_indices = analysis_chunk_indices + (max(reference_chunk_indices) + 1)

    analysis_value_counts = calculate_value_counts(
        data=analysis_data,
        chunker=chunker,
        timestamps=analysis_data_timestamps,
        max_number_of_categories=5,
        missing_category_label='Missing',
    )

    figure = stacked_bar(
        figure=figure,
        stacked_bar_table=analysis_value_counts,
        color=Colors.INDIGO_PERSIAN,
        chunk_indices=analysis_chunk_indices,
        chunk_start_dates=analysis_chunk_start_dates,
        chunk_end_dates=analysis_chunk_end_dates,
        annotation='Analysis',
        showlegend=False,
        legendgroup=column_name,
        subplot_args=subplot_args,
    )

    # `analysis_alerts` defaults to None; skip the overlay in that case, as _plot_joyplot does.
    if analysis_alerts is not None:
        figure = stacked_bar_alert(
            figure=figure,
            alerts=analysis_alerts,
            stacked_bar_table=analysis_value_counts,
            color=Colors.RED_IMPERIAL,
            chunk_indices=analysis_chunk_indices,
            chunk_start_dates=analysis_chunk_start_dates,
            chunk_end_dates=analysis_chunk_end_dates,
            showlegend=True,
            legendgroup=column_name,
            subplot_args=subplot_args,
        )

    return figure