# Source code for nannyml.drift.ranking

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0

"""Module containing ways to rank drifting features."""

import abc
import logging
from typing import Any, Callable, Dict, Optional, Type

import pandas as pd

from nannyml.drift.univariate.result import Result
from nannyml.exceptions import InvalidArgumentsException


class Ranking(abc.ABC):
    """Class that abstracts ranking features by impact on model performance.

    Subclasses provide the actual ranking logic by overriding :meth:`rank`. Instances are also
    callable: calling an instance forwards every argument to :meth:`rank`.
    """

    def rank(
        self,
        drift_calculation_result: Result,
        only_drifting: bool = False,
    ) -> pd.DataFrame:
        """Ranks the features within a drift calculation according to impact on model performance.

        Parameters
        ----------
        drift_calculation_result : nannyml.drift.univariate.result.Result
            The drift calculation results.
        only_drifting : bool
            Omits non-drifting features from the ranking if True.

        Returns
        -------
        feature_ranking: pd.DataFrame
            A DataFrame containing at least a feature name and a rank per row.

        Raises
        ------
        NotImplementedError
            Always, in this base class. Subclasses are expected to override this method.

        """
        raise NotImplementedError

    def __call__(self, *args, **kwargs) -> pd.DataFrame:
        """Ranks features by delegating to :meth:`rank`.

        Parameters
        ----------
        *args
            Positional arguments, forwarded unchanged to :meth:`rank`.
        **kwargs
            Keyword arguments, forwarded unchanged to :meth:`rank`.

        Returns
        -------
        feature_ranking: pd.DataFrame
            Whatever :meth:`rank` returns for the given arguments.
        """
        # Delegate to ``rank`` explicitly. Calling ``self(...)`` here would re-enter ``__call__``
        # and recurse until a RecursionError is raised.
        return self.rank(*args, **kwargs)


class Ranker:
    """Factory class to easily access Ranking implementations."""

    # Maps a registration key to the Ranking subclass (the class itself, not an instance)
    # that was stored for it by ``register``.
    registry: Dict[str, Type[Ranking]] = {}

    @classmethod
    def _logger(cls) -> logging.Logger:
        """Returns the logger named after this module."""
        return logging.getLogger(__name__)

    @classmethod
    def register(cls, key: str) -> Callable[[Type[Ranking]], Type[Ranking]]:
        """Adds a Ranking to the registry using the provided key.

        Just use the decorator above any :class:`~nannyml.drift.ranking.Ranking` subclass to have it automatically
        registered. Registering a key that is already present logs a warning and overwrites the existing entry.

        Parameters
        ----------
        key : str
            The key under which the decorated class is stored in the registry.

        Returns
        -------
        decorator: Callable
            A class decorator that stores the decorated class in the registry and returns it unchanged.

        Examples
        --------
        >>> @Ranker.register('alert_count')
        >>> class AlertCountRanking(Ranking):
        >>>     pass
        >>>
        >>> # Use the Ranking
        >>> ranker = nml.Ranker.by('alert_count')
        >>> ranked_features = ranker.rank(results, only_drifting=False)
        """

        def inner_wrapper(wrapped_class: Type[Ranking]) -> Type[Ranking]:
            if key in cls.registry:
                cls._logger().warning(f"re-registering Ranking for key='{key}'")
            cls.registry[key] = wrapped_class

            return wrapped_class

        return inner_wrapper

    @classmethod
    def by(cls, key: Optional[str] = 'alert_count', ranking_args: Optional[Dict[str, Any]] = None) -> Ranking:
        """Returns a Ranking subclass instance given a key value.

        If the provided key equals ``None``, then a new instance of the Ranking registered under the
        default key ``'alert_count'`` will be returned.

        If a non-existent key is provided an ``InvalidArgumentsException`` is raised.

        Parameters
        ----------
        key : str, default='alert_count'
            The key used to look up a Ranking class in the registry. ``None`` is treated as the default key.
        ranking_args: Dict[str, Any], default=None
            A dictionary of keyword arguments that will be passed to the Ranking during creation.
            ``None`` is treated as an empty dictionary.

        Returns
        -------
        ranking: Ranking
            A new instance of a specific Ranking subclass.

        Raises
        ------
        InvalidArgumentsException
            When no Ranking has been registered for the given key.

        Examples
        --------
        >>> ranking = Ranker.by('alert_count')
        """
        # Honour the documented contract: an explicit ``None`` key selects the default ranking
        # instead of being rejected as an unknown key.
        if key is None:
            key = 'alert_count'

        if ranking_args is None:
            ranking_args = {}

        if key not in cls.registry:
            # ``list(...)`` keeps the message readable (no ``dict_keys([...])`` wrapper).
            raise InvalidArgumentsException(
                f"ranking {key} unknown. " f"Please provide one of the following: {list(cls.registry)}"
            )

        ranking_class = cls.registry[key]
        return ranking_class(**ranking_args)


@Ranker.register('alert_count')
class AlertCountRanking(Ranking):
    """Ranking that orders features by the number of drift alerts raised for them."""

    # NOTE: ``rank`` below selects alert columns through the literal ``'alert'`` column level and
    # does not read this constant; it is kept because it is part of the class's public attributes.
    ALERT_COLUMN_SUFFIX = '_alert'

    def rank(
        self,
        drift_calculation_result: Result,
        only_drifting: bool = False,
    ) -> pd.DataFrame:
        """Counts the alerts per feature over the analysis period and ranks features by that count.

        Parameters
        ----------
        drift_calculation_result : nannyml.drift.univariate.result.Result
            The drift calculation results. Its columns are expected to be a three-level index whose
            last level contains ``'alert'`` entries; every first-level entry except ``'chunk'`` is
            treated as a feature.
        only_drifting : bool, default=False
            When True, features with zero alerts are left out of the returned ranking.

        Returns
        -------
        feature_ranking: pd.DataFrame
            A DataFrame with the columns ``number_of_alerts``, ``column_name`` and ``rank``, sorted by
            alert count in descending order (ties are ordered by column name, also descending).
            The feature with the most alerts gets rank 1, the next one rank 2, etc.

        Raises
        ------
        InvalidArgumentsException
            When the drift calculation result holds no data.

        Examples
        --------
        >>> import nannyml as nml
        >>> from IPython.display import display
        >>>
        >>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
        >>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
        >>> target_df = nml.load_synthetic_binary_classification_dataset()[2]
        >>>
        >>> column_names = [
        >>>     col for col in reference_df.columns if col not in ['timestamp', 'y_pred_proba', 'period',
        >>>                                                        'y_pred', 'repaid']]
        >>>
        >>> calc = nml.UnivariateStatisticalDriftCalculator(column_names=column_names,
        >>>                                                 timestamp_column_name='timestamp')
        >>> calc.fit(reference_df)
        >>> results = calc.calculate(analysis_df.merge(target_df, on='identifier'))
        >>>
        >>> ranker = nml.Ranker.by('alert_count')
        >>> ranked_features = ranker.rank(results, only_drifting=False)
        >>> display(ranked_features)
        """
        results_data = drift_calculation_result.data
        if results_data.empty:
            raise InvalidArgumentsException('drift results contain no data to use for ranking')

        # Every first-level column entry except the 'chunk' group.
        top_level_groups = set(results_data.columns.get_level_values(0))
        feature_groups = list(top_level_groups - {'chunk'})

        # Restrict to the analysis period and keep only the 'alert' columns of each feature.
        analysis_frame = drift_calculation_result.filter(period='analysis').to_df()
        alert_flags = analysis_frame.loc[:, (feature_groups, slice(None), 'alert')]

        # Sum per alert column, then flatten the column index so the first level ('level_0')
        # and the sums (0) become ordinary columns that can be grouped per feature.
        alert_totals = alert_flags.sum().reset_index()[['level_0', 0]]
        per_feature = alert_totals.groupby('level_0').sum()
        per_feature.columns = ['number_of_alerts']
        per_feature['column_name'] = per_feature.index

        ordered = per_feature.sort_values(['number_of_alerts', 'column_name'], ascending=False)
        ordered = ordered.reset_index(drop=True)
        # Positions are zero-based after the index reset; ranks start at 1.
        ordered['rank'] = ordered.index + 1

        if not only_drifting:
            return ordered
        return ordered.loc[ordered['number_of_alerts'] != 0, :]