# Author: Niels Nuyttens <niels@nannyml.com>
#
# License: Apache Software License 2.0
"""Module containing ways to rank drifting features."""
import abc
import logging
from typing import Any, Callable, Dict, Optional, Type

import pandas as pd

from nannyml.drift.univariate.result import Result
from nannyml.exceptions import InvalidArgumentsException
class Ranking(abc.ABC):
    """Class that abstracts ranking features by impact on model performance.

    Subclasses are expected to override :meth:`rank`; the implementation in this base class
    only raises ``NotImplementedError``. Instances are callable: calling one delegates to
    :meth:`rank`.
    """

    def rank(
        self,
        drift_calculation_result: 'Result',
        only_drifting: bool = False,
    ) -> pd.DataFrame:
        """Ranks the features within a drift calculation according to impact on model performance.

        Parameters
        ----------
        drift_calculation_result : nannyml.drift.univariate.result.Result
            The drift calculation results.
        only_drifting : bool, default=False
            Omits non-drifting features from the ranking if True.

        Returns
        -------
        feature_ranking: pd.DataFrame
            A DataFrame containing at least a feature name and a rank per row.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError

    def __call__(self, *args, **kwargs):
        """Ranks features by forwarding all positional and keyword arguments to :meth:`rank`."""
        # Must go through ``rank``: invoking ``self(...)`` here would re-enter ``__call__``
        # and recurse without end.
        return self.rank(*args, **kwargs)
class Ranker:
    """Factory class to easily access Ranking implementations."""

    # Maps a registration key to the Ranking class (not an instance) registered under it.
    registry: Dict[str, Type['Ranking']] = {}

    @classmethod
    def _logger(cls) -> logging.Logger:
        """Returns the logger named after this module."""
        return logging.getLogger(__name__)

    @classmethod
    def register(cls, key: str) -> Callable[[Type['Ranking']], Type['Ranking']]:
        """Adds a Ranking to the registry using the provided key.

        Just use the decorator above any :class:`~nannyml.drift.ranking.Ranking` subclass to have it automatically
        registered. Registering a key that is already present logs a warning and overwrites the previous entry.

        Parameters
        ----------
        key : str
            The key under which the decorated class is stored in the registry.

        Returns
        -------
        decorator: Callable
            A class decorator that stores the class in the registry and returns it unchanged.

        Examples
        --------
        >>> @Ranker.register('alert_count')
        ... class AlertCountRanking(Ranking):
        ...     pass
        >>>
        >>> # Use the Ranking
        >>> ranker = nml.Ranker.by('alert_count')
        >>> ranked_features = ranker.rank(results, only_drifting=False)
        """

        def inner_wrapper(wrapped_class: Type['Ranking']) -> Type['Ranking']:
            if key in cls.registry:
                cls._logger().warning(f"re-registering Ranking for key='{key}'")
            cls.registry[key] = wrapped_class
            return wrapped_class

        return inner_wrapper

    @classmethod
    def by(cls, key: Optional[str] = 'alert_count', ranking_args: Optional[Dict[str, Any]] = None) -> 'Ranking':
        """Returns a Ranking subclass instance given a key value.

        If the provided key equals ``None``, then a new instance of the Ranking registered under the default
        key ``'alert_count'`` will be returned.

        If a non-existent key is provided an ``InvalidArgumentsException`` is raised.

        Parameters
        ----------
        key : str, default='alert_count'
            The key used to retrieve a Ranking from the registry. ``None`` is treated as the default key.
        ranking_args: Dict[str, Any], default=None
            A dictionary of arguments that will be passed to the Ranking during creation.

        Returns
        -------
        ranking: Ranking
            A new instance of a specific Ranking subclass.

        Raises
        ------
        InvalidArgumentsException
            When no Ranking has been registered under the requested key.

        Examples
        --------
        >>> ranking = Ranker.by('alert_count')
        """
        if key is None:
            # Honour the documented contract: ``None`` selects the default ranking.
            key = 'alert_count'

        if ranking_args is None:
            ranking_args = {}

        if key not in cls.registry:
            raise InvalidArgumentsException(
                f"ranking {key} unknown. " f"Please provide one of the following: {cls.registry.keys()}"
            )

        ranking_class = cls.registry[key]
        return ranking_class(**ranking_args)
@Ranker.register('alert_count')
class AlertCountRanking(Ranking):
    """Ranks features by the number of drift 'alerts' they've caused."""

    # Not referenced by ``rank`` below, which selects on the ``'alert'`` column-level label instead.
    ALERT_COLUMN_SUFFIX = '_alert'

    def rank(
        self,
        drift_calculation_result: Result,
        only_drifting: bool = False,
    ) -> pd.DataFrame:
        """Counts the alerts raised for each feature and orders the features by that count.

        Alerts are summed over the ``'analysis'`` period only, across every entry on the second column level
        of the result. Features are then sorted by alert count and, for equal counts, by name, both in
        descending order.

        Parameters
        ----------
        drift_calculation_result : nannyml.drift.univariate.result.Result
            The drift calculation results. Requires alert columns to be present: they are selected as the
            columns whose third column-level label equals ``'alert'``. The top-level ``'chunk'`` group is
            excluded.
        only_drifting : bool, default=False
            Omits features without alerts from the ranking results.

        Returns
        -------
        feature_ranking: pd.DataFrame
            A DataFrame with the columns ``number_of_alerts``, ``column_name`` and ``rank`` (the highest rank
            starts at 1, second-highest rank is 2, etc.). Ranks are assigned before the ``only_drifting``
            filter is applied.

        Raises
        ------
        InvalidArgumentsException
            When the drift calculation result holds no data.

        Examples
        --------
        >>> import nannyml as nml
        >>> from IPython.display import display
        >>>
        >>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
        >>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
        >>> target_df = nml.load_synthetic_binary_classification_dataset()[2]
        >>>
        >>> display(reference_df.head())
        >>>
        >>> column_names = [
        >>>     col for col in reference_df.columns if col not in ['timestamp', 'y_pred_proba', 'period',
        >>>                                                        'y_pred', 'repaid']]
        >>>
        >>> calc = nml.UnivariateStatisticalDriftCalculator(column_names=column_names,
        >>>                                                 timestamp_column_name='timestamp')
        >>>
        >>> calc.fit(reference_df)
        >>>
        >>> results = calc.calculate(analysis_df.merge(target_df, on='identifier'))
        >>>
        >>> ranker = nml.Ranker.by('alert_count')
        >>> ranked_features = ranker.rank(results, only_drifting=False)
        >>> display(ranked_features)  # illustrative output
                          column_name  number_of_alerts  rank
        0                  identifier                10     1
        1        distance_from_office                 5     2
        2                salary_range                 5     3
        3  public_transportation_cost                 5     4
        4            wfh_prev_workday                 5     5
        5                      tenure                 2     6
        6         gas_price_per_litre                 0     7
        7                     workday                 0     8
        8            work_home_actual                 0     9
        """
        if drift_calculation_result.data.empty:
            raise InvalidArgumentsException('drift results contain no data to use for ranking')

        # Every top-level column group except the chunk metadata is treated as a feature.
        top_level_labels = set(drift_calculation_result.data.columns.get_level_values(0))
        feature_labels = list(top_level_labels - {'chunk'})

        # Sum the alert flags over the analysis period; one total per (feature, second level) pair.
        analysis_frame = drift_calculation_result.filter(period='analysis').to_df()
        alert_totals = analysis_frame.loc[:, (feature_labels, slice(None), 'alert')].sum()
        per_feature_entries = alert_totals.reset_index()[['level_0', 0]]

        # Collapse the second column level so that each feature ends up with a single alert count.
        ranking = per_feature_entries.groupby('level_0').sum()
        ranking.columns = ['number_of_alerts']
        ranking['column_name'] = ranking.index

        ordered = ranking.sort_values(['number_of_alerts', 'column_name'], ascending=False).reset_index(drop=True)
        ordered['rank'] = ordered.index + 1

        if not only_drifting:
            return ordered
        return ordered.loc[ordered['number_of_alerts'] != 0, :]