Source code for nannyml.drift.ranking

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0

"""Module containing ways to rank drifting features."""

import abc
from typing import Dict, Optional

import pandas as pd

from nannyml.drift.base import DriftResult
from nannyml.exceptions import InvalidArgumentsException
from nannyml.metadata.base import ModelMetadata


[docs]class Ranking(abc.ABC):
    """Used to rank drifting features according to impact."""

[docs]    def rank(
        self,
        drift_calculation_result: DriftResult,
        model_metadata: ModelMetadata,
        only_drifting: bool = False,
    ) -> pd.DataFrame:
        """Ranks the features within a drift calculation according to impact.

        Parameters
        ----------
        drift_calculation_result : pd.DataFrame
            The drift calculation results.
        model_metadata: ModelMetadata
            Metadata describing the monitored model.
        only_drifting : bool
            Omits non-drifting features from the ranking if True.

        Returns
        -------
        feature_ranking: pd.DataFrame
            A DataFrame containing at least a feature name and a rank per row.

        """
        raise NotImplementedError


[docs]class AlertCountRanking(Ranking):
    """Ranks drifting features by the number of 'alerts' they've caused."""

    ALERT_COLUMN_SUFFIX = '_alert'

[docs]    def rank(
        self,
        drift_calculation_result: DriftResult,
        model_metadata: ModelMetadata,
        only_drifting: bool = False,
    ) -> pd.DataFrame:
        """Compares the number of alerts for each feature and uses that for ranking.

        Parameters
        ----------
        drift_calculation_result : pd.DataFrame
            The drift calculation results. Requires alert columns to be present. These are recognized and parsed
            using the ALERT_COLUMN_SUFFIX pattern, currently equal to ``'_alert'``.
        model_metadata: ModelMetadata
            Metadata describing the monitored model, used to check what the features are and exclude predictions
            from ranking results.
        only_drifting : bool
            Omits features without alerts from the ranking results.

        Returns
        -------
        feature_ranking: pd.DataFrame
            A DataFrame containing the feature names and their ranks (the highest rank starts at 1,
            second-highest rank is 2, etc.)

        Examples
        --------
        >>> import nannyml as nml
        >>> reference_df, analysis_df, target_df = nml.load_synthetic_binary_classification_dataset()
        >>> metadata = nml.extract_metadata(reference_df)
        >>> metadata.target_column_name = 'work_home_actual'
        >>> calc = nml.UnivariateStatisticalDriftCalculator(metadata, chunk_size=5000)
        >>> calc.fit(reference_df)
        >>> drift = calc.calculate(analysis_df)
        >>>
        >>> ranked = Ranker.by('alert_count').rank(drift, metadata)
        >>> ranked
        """
        if drift_calculation_result.data.empty:
            raise InvalidArgumentsException('drift results contain no data to use for ranking')

        alert_column_names = [f'{feature.column_name}{self.ALERT_COLUMN_SUFFIX}' for feature in model_metadata.features]

        if len(alert_column_names) == 0:
            raise InvalidArgumentsException('drift results are not statistical drift results.')

        if (
            len(list(filter(lambda col: col.endswith(self.ALERT_COLUMN_SUFFIX), drift_calculation_result.data.columns)))
            == 0
        ):
            raise InvalidArgumentsException('drift results are not statistical drift results.')

        ranking = pd.DataFrame(drift_calculation_result.data[alert_column_names].sum()).reset_index()
        ranking.columns = ['feature', 'number_of_alerts']
        ranking['feature'] = ranking['feature'].str.replace(self.ALERT_COLUMN_SUFFIX, '')
        ranking = ranking.sort_values('number_of_alerts', ascending=False, ignore_index=True)
        ranking['rank'] = ranking.index + 1
        if only_drifting:
            ranking = ranking.loc[ranking['number_of_alerts'] != 0, :]
        return ranking


[docs]class Ranker:
    """Factory class to easily access Ranking implementations."""

    _rankings: Dict[str, Ranking] = {'alert_count': AlertCountRanking()}

[docs]    @classmethod
    def register_ranking(cls, key: str, ranking: Ranking):
        """Registers a new calibrator to the index.

        This index associates a certain key with a Ranking instance.

        Parameters
        ----------
        key: str
            The key used to retrieve a Calibrator. When providing a key that is already in the index, the value
            will be overwritten.
        ranking: Ranking
            An instance of a Ranking subclass.

        Examples
        --------
        >>> Ranker.register_ranking('alert_count', AlertCountRanking())
        """
        cls._rankings[key] = ranking

[docs]    @classmethod
    def by(cls, key: Optional[str], **kwargs):
        """Returns a Ranking subclass instance given a key value.

        If the provided key equals ``None``, then a new instance of the default Ranking (AlertCountRanking)
        will be returned.

        If a non-existent key is provided an ``InvalidArgumentsException`` is raised.

        Parameters
        ----------
        key : str
            The key used to retrieve a Ranking. When providing a key that is already in the index, the value
            will be overwritten.

        Returns
        -------
        ranking: Ranking
            A new instance of a specific Ranking subclass.

        Examples
        --------
        >>> ranking = Ranker.by('alert_count')
        """
        default = AlertCountRanking()
        if key is None:
            return default

        if key not in cls._rankings:
            raise InvalidArgumentsException(
                f"ranking {key} unknown. " f"Please provide one of the following: {cls._rankings.keys()}"
            )

        return cls._rankings.get(key, default)