Source code for nannyml.drift.ranking

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0

"""Module containing ways to rank drifting features."""

import abc
import logging
from typing import Any, Callable, Dict, Optional, Type

import pandas as pd

from nannyml.drift.model_inputs.univariate.statistical import UnivariateStatisticalDriftCalculatorResult
from nannyml.exceptions import InvalidArgumentsException


class Ranking(abc.ABC):
    """Class that abstracts ranking features by impact on model performance."""

    def rank(
        self,
        drift_calculation_result: UnivariateStatisticalDriftCalculatorResult,
        only_drifting: bool = False,
    ) -> pd.DataFrame:
        """Ranks the features within a drift calculation according to impact on model performance.

        Parameters
        ----------
        drift_calculation_result : UnivariateStatisticalDriftCalculatorResult
            The drift calculation results.
        only_drifting : bool
            Omits non-drifting features from the ranking if True.

        Returns
        -------
        feature_ranking: pd.DataFrame
            A DataFrame containing at least a feature name and a rank per row.

        Raises
        ------
        NotImplementedError
            Always, in this base class: subclasses are expected to override this method.
        """
        raise NotImplementedError

    def __call__(self, *args, **kwargs) -> pd.DataFrame:
        """Makes a ranking instance callable as a shorthand for :meth:`rank`.

        All positional and keyword arguments are forwarded unchanged to :meth:`rank`
        and its result is returned.
        """
        # Delegate to ``rank``. Invoking ``self(...)`` here would re-enter ``__call__``
        # and recurse until a RecursionError is raised.
        return self.rank(*args, **kwargs)
class Ranker:
    """Factory class to easily access Ranking implementations."""

    # Maps a registration key to the Ranking subclass (the class itself, not an instance)
    # that was registered under it.
    registry: Dict[str, Type[Ranking]] = {}

    @classmethod
    def _logger(cls) -> logging.Logger:
        """Returns the logger named after this module."""
        return logging.getLogger(__name__)

    @classmethod
    def register(cls, key: str) -> Callable:
        """Adds a Ranking to the registry using the provided key.

        Just use the decorator above any :class:`~nannyml.drift.ranking.Ranking` subclass to have it automatically
        registered. Registering a key that is already present overwrites the previous entry and logs a warning.

        Parameters
        ----------
        key : str
            The key under which the decorated class is stored in the registry.

        Returns
        -------
        decorator: Callable
            A class decorator that stores the decorated class in the registry and returns it unchanged.

        Examples
        --------
        >>> @Ranker.register('alert_count')
        >>> class AlertCountRanking(Ranking):
        >>>     pass
        >>>
        >>> # Use the Ranking
        >>> ranker = nml.Ranker.by('alert_count')
        >>> ranked_features = ranker.rank(results, only_drifting=False)
        """

        def inner_wrapper(wrapped_class: Type[Ranking]) -> Type[Ranking]:
            if key in cls.registry:
                cls._logger().warning(f"re-registering Ranking for key='{key}'")
            cls.registry[key] = wrapped_class
            return wrapped_class

        return inner_wrapper

    @classmethod
    def by(cls, key: Optional[str] = 'alert_count', ranking_args: Optional[Dict[str, Any]] = None) -> Ranking:
        """Returns a Ranking subclass instance given a key value.

        If the provided key equals ``None``, then a new instance of the default Ranking (the one registered
        under ``'alert_count'``) will be returned.

        If a non-existent key is provided an ``InvalidArgumentsException`` is raised.

        Parameters
        ----------
        key : str, default='alert_count'
            The key used to retrieve a Ranking. ``None`` selects the default key.
        ranking_args: Dict[str, Any], default=None
            A dictionary of arguments that will be passed to the Ranking during creation.
            ``None`` is treated as an empty dictionary.

        Returns
        -------
        ranking: Ranking
            A new instance of a specific Ranking subclass.

        Raises
        ------
        InvalidArgumentsException
            When no Ranking has been registered under the requested key.

        Examples
        --------
        >>> ranking = Ranker.by('alert_count')
        """
        # Honour the documented contract: an explicit ``None`` key means "use the default ranking".
        if key is None:
            key = 'alert_count'

        if ranking_args is None:
            ranking_args = {}

        if key not in cls.registry:
            raise InvalidArgumentsException(
                f"ranking {key} unknown. " f"Please provide one of the following: {cls.registry.keys()}"
            )

        ranking_class = cls.registry[key]
        return ranking_class(**ranking_args)
@Ranker.register('alert_count')
class AlertCountRanking(Ranking):
    """Ranks features by the number of drift 'alerts' they've caused."""

    # Suffix appended to a feature name to obtain the name of its alert column in the drift results.
    ALERT_COLUMN_SUFFIX = '_alert'

    def rank(
        self,
        drift_calculation_result: UnivariateStatisticalDriftCalculatorResult,
        only_drifting: bool = False,
    ) -> pd.DataFrame:
        """Compares the number of alerts for each feature and ranks them accordingly.

        Parameters
        ----------
        drift_calculation_result : UnivariateStatisticalDriftCalculatorResult
            The drift calculation results. Its ``data`` must contain one alert column per feature listed in
            ``calculator.feature_column_names``, named as the feature name followed by ALERT_COLUMN_SUFFIX,
            currently equal to ``'_alert'``.
        only_drifting : bool, default=False
            Omits features without alerts from the ranking results.

        Returns
        -------
        feature_ranking: pd.DataFrame
            A DataFrame with the columns ``feature``, ``number_of_alerts`` and ``rank``, sorted by descending
            number of alerts (the highest rank starts at 1, second-highest rank is 2, etc.). Features with an
            equal number of alerts still receive distinct, consecutive ranks.

        Raises
        ------
        InvalidArgumentsException
            When the drift results contain no data.

        Examples
        --------
        >>> import nannyml as nml
        >>> from IPython.display import display
        >>>
        >>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
        >>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
        >>> target_df = nml.load_synthetic_binary_classification_dataset()[2]
        >>>
        >>> display(reference_df.head())
        >>>
        >>> feature_column_names = [
        >>>     col for col in reference_df.columns if col not in ['timestamp', 'y_pred_proba', 'period',
        >>>                                                        'y_pred', 'repaid']]
        >>>
        >>> calc = nml.UnivariateStatisticalDriftCalculator(feature_column_names=feature_column_names,
        >>>                                                 timestamp_column_name='timestamp')
        >>>
        >>> calc.fit(reference_df)
        >>>
        >>> results = calc.calculate(analysis_df.merge(target_df, on='identifier'))
        >>>
        >>> ranker = nml.Ranker.by('alert_count')
        >>> ranked_features = ranker.rank(results, only_drifting=False)
        >>> display(ranked_features)
                              feature  number_of_alerts  rank
        0                  identifier                10     1
        1        distance_from_office                 5     2
        2                salary_range                 5     3
        3  public_transportation_cost                 5     4
        4            wfh_prev_workday                 5     5
        5                      tenure                 2     6
        6         gas_price_per_litre                 0     7
        7                     workday                 0     8
        8            work_home_actual                 0     9
        """
        if drift_calculation_result.data.empty:
            raise InvalidArgumentsException('drift results contain no data to use for ranking')

        feature_names = list(drift_calculation_result.calculator.feature_column_names)
        alert_column_names = [f'{name}{self.ALERT_COLUMN_SUFFIX}' for name in feature_names]

        # Summing the selected columns yields one alert count per alert column, in selection order.
        alert_counts = drift_calculation_result.data[alert_column_names].sum()

        # Pair each count with the feature name it was derived from, rather than stripping the suffix back
        # out of the column name: a textual replace would also remove the suffix where it occurs inside a
        # feature name (e.g. 'fire_alert_count') and so report a wrong feature.
        ranking = pd.DataFrame({'feature': feature_names, 'number_of_alerts': alert_counts.to_numpy()})

        ranking = ranking.sort_values('number_of_alerts', ascending=False, ignore_index=True)
        # The index was reset by the sort, so position + 1 is the 1-based rank.
        ranking['rank'] = ranking.index + 1

        if only_drifting:
            ranking = ranking.loc[ranking['number_of_alerts'] != 0, :]
        return ranking