Source code for nannyml.drift.ranker

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0

"""Module containing ways to rank features according to drift.

This module allows you to rank the columns within a
:class:`~nannyml.drift.univariate.calculator.UnivariateDriftCalculator` result according to their degree of drift.

The following rankers are currently available:

- :class:`~nannyml.drift.ranker.AlertCountRanker`: ranks the features according
  to the number of drift detection alerts they cause.
- :class:`~nannyml.drift.ranker.CorrelationRanker`: ranks the features according to their correlation with changes
  in realized or estimated performance.
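
A minimal usage sketch (assuming ``univariate_results`` is an existing
:class:`~nannyml.drift.univariate.calculator.UnivariateDriftCalculator` result):

>>> import nannyml as nml
>>> ranker = nml.AlertCountRanker()
>>> ranking = ranker.rank(univariate_results.filter(methods=['jensen_shannon']), only_drifting=False)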

"""
from __future__ import annotations

import logging
from typing import Optional, Union

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

from nannyml._typing import Metric
from nannyml.data_quality.missing.result import Result as MissingValueResults
from nannyml.data_quality.unseen.result import Result as UnseenValuesResults
from nannyml.drift.univariate.result import Result as UnivariateResults
from nannyml.exceptions import InvalidArgumentsException, NotFittedException
from nannyml.performance_calculation.result import Result as PerformanceCalculationResults
from nannyml.performance_estimation.confidence_based.metrics import Metric as CBPEMetric
from nannyml.performance_estimation.confidence_based.results import Result as CBPEResults
from nannyml.performance_estimation.direct_loss_estimation.result import Result as DLEResults
from nannyml.stats.avg.result import Result as StatsAvgResults
from nannyml.stats.count import Result as StatsCountResults
from nannyml.stats.median import Result as StatsMedianResults
from nannyml.stats.std import Result as StatsStdResults
from nannyml.stats.sum import Result as StatsSumResults
from nannyml.usage_logging import UsageEvent, log_usage

RankableResult = Union[
    UnivariateResults,
    MissingValueResults,
    UnseenValuesResults,
    StatsAvgResults,
    StatsCountResults,
    StatsStdResults,
    StatsSumResults,
    StatsMedianResults,
]
PerformanceResults = Union[CBPEResults, DLEResults, PerformanceCalculationResults]

_logger = logging.getLogger(__name__)


def _validate_drift_result(rankable_result: RankableResult):
    if not isinstance(
        rankable_result,
        (
            UnivariateResults,
            MissingValueResults,
            UnseenValuesResults,
            StatsAvgResults,
            StatsCountResults,
            StatsStdResults,
            StatsSumResults,
            StatsMedianResults,
        ),
    ):
        raise InvalidArgumentsException(
            f"`rankable_result` should be one of `[UnivariateResults, MissingValueResults, "
            f"UnseenValuesResults, StatsAvgResults, StatsCountResults, StatsStdResults, "
            f"StatsSumResults, StatsMedianResults]`."
            f"\ngot {str(type(rankable_result))}"
        )

    if rankable_result.empty:
        raise InvalidArgumentsException('rankable_result contains no data to use for ranking')

    if isinstance(rankable_result, UnivariateResults):
        if len(rankable_result.categorical_method_names) > 1:
            raise InvalidArgumentsException(
                f"Only one categorical drift method should be present in the univariate results."
                f"\nFound: {rankable_result.categorical_method_names}"
            )

        if len(rankable_result.continuous_method_names) > 1:
            raise InvalidArgumentsException(
                f"Only one continuous drift method should be present in the univariate results."
                f"\nFound: {rankable_result.continuous_method_names}"
            )
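
# Note: rankers expect pre-filtered inputs. A univariate result computed with multiple drift
# methods per column type must first be narrowed down before ranking, e.g. (a sketch, assuming
# `univariate_results` holds a multi-method UnivariateDriftCalculator result):
#
#     AlertCountRanker().rank(univariate_results.filter(methods=['jensen_shannon']))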


def _validate_performance_result(performance_result: PerformanceResults):
    """Validate Inputs before performing ranking.

    Parameters
    ----------
    performance_result: Performance Estimation or Calculation results. Can be an instance of:
            nml.performance_estimation.confidence_based.results.Result,
            nml.performance_estimation.direct_loss_estimation.result.Result,
            nml.performance_calculation.result.Result
    """

    if not isinstance(performance_result, (CBPEResults, DLEResults, PerformanceCalculationResults)):
        raise InvalidArgumentsException(
            "Estimated or Realized Performance results object required for performance_results argument."
        )

    if len(performance_result.metrics) != 1:
        raise InvalidArgumentsException(
            "Just one metric should be present in performance_results used to rank CorrelationRanker."
        )


class AlertCountRanker:
    """Ranks the features according to the number of drift detection alerts they cause."""

    @log_usage(UsageEvent.RANKER_ALERT_COUNT_RUN)
    def rank(
        self,
        rankable_result: RankableResult,
        only_drifting: bool = False,
    ) -> pd.DataFrame:
        """Ranks the features according to the number of drift detection alerts they cause.

        Parameters
        ----------
        rankable_result : RankableResult
            The result of a univariate drift, data quality or simple statistics calculation.
        only_drifting : bool, default=False
            Omits features without alerts from the ranking results.

        Returns
        -------
        ranking: pd.DataFrame
            A DataFrame containing the feature names and their ranks (the highest rank starts at 1,
            second-highest rank is 2, etc.).

            Features with the same number of alerts are ranked alphanumerically on the feature name.

        Examples
        --------
        >>> import nannyml as nml
        >>> from IPython.display import display
        >>> reference_df, analysis_df, analysis_targets_df = nml.load_synthetic_car_loan_dataset()
        >>> analysis_full_df = analysis_df.merge(analysis_targets_df, left_index=True, right_index=True)
        >>> feature_column_names = [
        ...     'car_value', 'salary_range', 'debt_to_income_ratio', 'loan_length', 'repaid_loan_on_prev_car',
        ...     'size_of_downpayment', 'driver_tenure', 'y_pred_proba', 'y_pred', 'repaid'
        ... ]
        >>> univ_calc = nml.UnivariateDriftCalculator(
        ...     column_names=feature_column_names,
        ...     treat_as_categorical=['y_pred', 'repaid'],
        ...     timestamp_column_name='timestamp',
        ...     continuous_methods=['kolmogorov_smirnov', 'jensen_shannon'],
        ...     categorical_methods=['chi2', 'jensen_shannon'],
        ...     chunk_size=5000
        ... )
        >>> univ_calc.fit(reference_df)
        >>> univariate_results = univ_calc.calculate(analysis_full_df)
        >>> alert_count_ranker = nml.AlertCountRanker()
        >>> alert_count_ranked_features = alert_count_ranker.rank(
        ...     univariate_results.filter(methods=['jensen_shannon']),
        ...     only_drifting=False)
        >>> display(alert_count_ranked_features)
           number_of_alerts              column_name  rank
        0                 5             y_pred_proba     1
        1                 5             salary_range     2
        2                 5  repaid_loan_on_prev_car     3
        3                 5              loan_length     4
        4                 0                car_value     5
        5                 0                   y_pred     6
        6                 0      size_of_downpayment     7
        7                 0                   repaid     8
        8                 0            driver_tenure     9
        9                 0     debt_to_income_ratio    10
        """
        _validate_drift_result(rankable_result)

        key_list = rankable_result.keys()

        # Sum the boolean alert flags per column across all chunks, then collapse the counts of
        # multiple methods per column into a single alert count.
        ranking = (
            pd.concat([rankable_result.alerts(_key) for _key in key_list], axis=1).sum().reset_index()[['level_0', 0]]
        )
        ranking = ranking.groupby('level_0').sum()
        ranking.columns = ['number_of_alerts']
        ranking['column_name'] = ranking.index
        # Sort on the alert count first, then on the column name to break ties.
        ranking = ranking.sort_values(['number_of_alerts', 'column_name'], ascending=False)
        ranking = ranking.reset_index(drop=True)
        ranking['rank'] = ranking.index + 1
        if only_drifting:
            ranking = ranking.loc[ranking['number_of_alerts'] != 0, :]
        return ranking


class CorrelationRanker:
    """Ranks the features according to their correlation with changes in realized or estimated performance.

    Examples
    --------
    >>> import nannyml as nml
    >>> from IPython.display import display
    >>> reference_df, analysis_df, analysis_targets_df = nml.load_synthetic_car_loan_dataset()
    >>> analysis_full_df = analysis_df.merge(analysis_targets_df, left_index=True, right_index=True)
    >>> feature_column_names = [
    ...     'car_value', 'salary_range', 'debt_to_income_ratio', 'loan_length', 'repaid_loan_on_prev_car',
    ...     'size_of_downpayment', 'driver_tenure', 'y_pred_proba', 'y_pred', 'repaid'
    ... ]
    >>> univ_calc = nml.UnivariateDriftCalculator(
    ...     column_names=feature_column_names,
    ...     treat_as_categorical=['y_pred', 'repaid'],
    ...     timestamp_column_name='timestamp',
    ...     continuous_methods=['kolmogorov_smirnov', 'jensen_shannon'],
    ...     categorical_methods=['chi2', 'jensen_shannon'],
    ...     chunk_size=5000
    ... )
    >>> univ_calc.fit(reference_df)
    >>> univariate_results = univ_calc.calculate(analysis_full_df)
    >>> realized_calc = nml.PerformanceCalculator(
    ...     y_pred_proba='y_pred_proba',
    ...     y_pred='y_pred',
    ...     y_true='repaid',
    ...     timestamp_column_name='timestamp',
    ...     problem_type='classification_binary',
    ...     metrics=['roc_auc', 'recall'],
    ...     chunk_size=5000)
    >>> realized_calc.fit(reference_df)
    >>> realized_perf_results = realized_calc.calculate(analysis_full_df)
    >>> ranker2 = nml.CorrelationRanker()
    >>> # ranker fits on one metric and reference period data only
    >>> ranker2.fit(
    ...     realized_perf_results.filter(period='reference', metrics=['recall']))
    >>> # ranker ranks on one drift method and one performance metric
    >>> correlation_ranked_features2 = ranker2.rank(
    ...     univariate_results.filter(period='analysis', methods=['jensen_shannon']),
    ...     realized_perf_results.filter(period='analysis', metrics=['recall']),
    ...     only_drifting=False)
    >>> display(correlation_ranked_features2)
                   column_name  pearsonr_correlation  pearsonr_pvalue  has_drifted  rank
    0  repaid_loan_on_prev_car               0.96897      3.90719e-06         True     1
    1             y_pred_proba              0.966157      5.50918e-06         True     2
    2              loan_length              0.965298      6.08385e-06         True     3
    3                car_value              0.963623      7.33185e-06         True     4
    4             salary_range              0.963456      7.46561e-06         True     5
    5      size_of_downpayment              0.308948         0.385072        False     6
    6     debt_to_income_ratio              0.307373         0.387627        False     7
    7                   y_pred             -0.357571         0.310383        False     8
    8                   repaid             -0.395842         0.257495        False     9
    9            driver_tenure             -0.575807        0.0815202        False    10
    """

    def __init__(self) -> None:
        """Creates a new CorrelationRanker instance."""
        super().__init__()

        self.metric: Metric
        self.mean_reference_performance: Optional[float] = None
        self.absolute_performance_change: Optional[float] = None

        self._is_fitted: bool = False

    @log_usage(UsageEvent.RANKER_CORRELATION_FIT)
    def fit(
        self,
        reference_performance_calculation_result: Optional[PerformanceResults] = None,
    ) -> CorrelationRanker:
        """Calculates the average performance during the reference period.

        This value is saved in the `mean_reference_performance` property of the ranker.

        Parameters
        ----------
        reference_performance_calculation_result : Union[CBPEResults, DLEResults, PerformanceCalculationResults]
            Results from any performance calculator or estimator, e.g.
            :class:`~nannyml.performance_calculation.calculator.PerformanceCalculator`,
            :class:`~nannyml.performance_estimation.confidence_based.cbpe.CBPE` or
            :class:`~nannyml.performance_estimation.direct_loss_estimation.dle.DLE`.

        Returns
        -------
        ranker: CorrelationRanker
            The fitted CorrelationRanker instance.
        """
        if reference_performance_calculation_result is None:
            raise InvalidArgumentsException("reference performance calculation results can not be None.")
        _validate_performance_result(reference_performance_calculation_result)

        # We're expecting filtered inputs, so there should only be a single metric present.
        self.metric = reference_performance_calculation_result.metrics[0]

        # TODO: this will fail for estimated confusion matrix
        metric_column_name = self.metric.name if isinstance(self.metric, CBPEMetric) else self.metric.column_name

        # Average the metric's value over all reference chunks.
        self.mean_reference_performance = (
            reference_performance_calculation_result.to_df().loc[:, (metric_column_name, 'value')].mean()
        )
        self._is_fitted = True
        return self

    @log_usage(UsageEvent.RANKER_CORRELATION_RUN)
    def rank(
        self,
        rankable_result: RankableResult,
        performance_result: Optional[PerformanceResults] = None,
        only_drifting: bool = False,
    ) -> pd.DataFrame:
        """Ranks the features according to the correlation of their results with performance changes.

        Parameters
        ----------
        rankable_result: RankableResult
            The univariate drift, data quality or simple statistics results containing the features we want to rank.
        performance_result: PerformanceResults
            Results from any performance calculator or estimator, e.g.
            :class:`~nannyml.performance_calculation.calculator.PerformanceCalculator`,
            :class:`~nannyml.performance_estimation.confidence_based.cbpe.CBPE` or
            :class:`~nannyml.performance_estimation.direct_loss_estimation.dle.DLE`.
        only_drifting: bool, default=False
            Omits features without alerts from the ranking results.

        Returns
        -------
        ranking: pd.DataFrame
            A DataFrame containing the feature names and their ranks (the highest rank starts at 1,
            second-highest rank is 2, etc.).

            Features are sorted by the Pearson correlation between their per-chunk results and the
            absolute per-chunk performance change, highest correlation first.
        """
        if not self._is_fitted or self.metric is None:
            raise NotFittedException("trying to call 'rank()' on an unfitted Ranker. Please call 'fit()' first")

        # Perform input validations
        if performance_result is None:
            raise InvalidArgumentsException("performance results can not be None.")

        _validate_drift_result(rankable_result)
        _validate_performance_result(performance_result)

        _drift_index = rankable_result.chunk_start_indices
        _perf_index = performance_result.chunk_start_indices

        if not _drift_index.equals(_perf_index):
            raise InvalidArgumentsException(
                "Drift and performance results need to be filtered to the same data period."
            )

        # TODO: this will fail for estimated confusion matrix
        metric_column_name = self.metric.name if isinstance(self.metric, CBPEMetric) else self.metric.column_name

        # Start ranking calculations: the absolute per-chunk deviation from the mean reference performance.
        abs_perf_change = np.abs(
            performance_result.to_df().loc[:, (metric_column_name, 'value')].to_numpy()
            - self.mean_reference_performance
        )

        self.absolute_performance_change = abs_perf_change

        feature_names = []
        correlations = []
        pvalues = []
        has_drifted = []
        for _key in rankable_result.keys():
            values = rankable_result.values(_key)
            if values is None or values.empty:
                _logger.info(f"skipped ranking empty rankable values for key '{_key}'")
                continue
            feature_names.append(_key.display_names[0])

            # Remove chunks where either the feature values or the performance change are NaN
            feature_nan, perf_nan = np.isnan(values.to_numpy()), np.isnan(abs_perf_change)
            filtered_values = values[~(feature_nan | perf_nan)]
            filtered_perf_change = abs_perf_change[~(feature_nan | perf_nan)]

            correlation, pvalue = pearsonr(filtered_values.to_numpy().ravel(), filtered_perf_change)
            correlations.append(correlation)
            pvalues.append(pvalue)

            alerts = rankable_result.alerts(_key)
            has_drifted.append(alerts.any() if alerts is not None else False)

        ranked = pd.DataFrame(
            {
                'column_name': feature_names,
                'pearsonr_correlation': correlations,
                'pearsonr_pvalue': pvalues,
                'has_drifted': has_drifted,
            }
        )

        # we want the first row to be the most impactful feature
        ranked.sort_values('pearsonr_correlation', ascending=False, inplace=True)
        ranked.reset_index(drop=True, inplace=True)
        ranked['rank'] = ranked.index + 1
        if only_drifting:
            ranked = ranked.loc[ranked['has_drifted'], :].reset_index(drop=True)
        return ranked
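

# Illustration of the CorrelationRanker's core computation (a minimal sketch, not part of the
# nannyml API; all numbers below are hypothetical): the Pearson correlation between a feature's
# per-chunk drift values and the absolute change of performance versus the reference mean.
if __name__ == "__main__":
    drift_values = np.array([0.01, 0.02, 0.15, 0.40, 0.42])  # per-chunk drift method values
    performance = np.array([0.91, 0.90, 0.84, 0.75, 0.74])  # per-chunk realized performance
    mean_reference_performance = 0.90  # assumed mean performance over the reference period
    abs_perf_change = np.abs(performance - mean_reference_performance)
    correlation, pvalue = pearsonr(drift_values, abs_perf_change)
    # a correlation close to 1 means the feature's drift tracks the performance degradation
    print(f"pearsonr correlation: {correlation:.3f}, p-value: {pvalue:.3f}")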