# Source code for nannyml.thresholds

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0
from __future__ import annotations

import abc
import logging
from typing import Any, Callable, ClassVar, Dict, Optional, Tuple, Type, Union

import numpy as np

from nannyml.exceptions import InvalidArgumentsException, ThresholdException


class Threshold(abc.ABC):
    """A base class used to calculate lower and upper threshold values given one or multiple arrays.

    Any subclass should implement the abstract `thresholds` method.
    It takes an array or list of arrays and converts them into lower and upper threshold values, represented
    as a tuple of optional floats.

    A `None` threshold value is interpreted as if there is no upper or lower threshold.
    One or both values might be `None`.

    Subclasses register themselves by passing a `type` keyword in their class statement, e.g.
    `class MyThreshold(Threshold, type="my_threshold")`. That key is what `parse_object` looks up.
    """

    # Maps the `type` key given in a subclass' class statement to that subclass.
    # Filled by `__init_subclass__`, read by `parse_object`.
    _registry: ClassVar[Dict[str, Type["Threshold"]]] = {}

    @property
    def _logger(self):
        # Instances do not carry a `__name__` attribute, so the module name is used for the logger.
        return logging.getLogger(__name__)

    def __str__(self):
        # Reuse the repr: calling `self.__str__()` here would recurse without end.
        return repr(self)

    def __repr__(self):
        return self.__class__.__name__ + str(vars(self))

    def __init_subclass__(cls, type: str, **kwargs) -> None:
        """Register every subclass in the class registry under the given `type` key."""
        super().__init_subclass__(**kwargs)
        Threshold._registry[type] = cls

    @abc.abstractmethod
    def thresholds(self, data: np.ndarray, **kwargs) -> Tuple[Optional[float], Optional[float]]:
        """Returns lower and upper threshold values when given one or more np.ndarray instances.

        Parameters:
            data: np.ndarray
                An array of values used to calculate the thresholds on. This will most often represent a metric
                calculated on one or more sets of data, e.g. a list of F1 scores of multiple data chunks.
            kwargs: Dict[str, Any]
                Optional keyword arguments passed to the implementing subclass.

        Returns:
            lower, upper: Tuple[Optional[float], Optional[float]]
                The lower and upper threshold values. One or both might be `None`.
        """

    @classmethod
    def parse_object(cls, object: Dict[str, Any]) -> "Threshold":
        """Parse object as :class:`Threshold`

        Parameters:
            object: Dict[str, Any]
                A dictionary holding a `type` entry that selects the registered subclass. All remaining entries
                are passed as keyword arguments to the constructor of that subclass.
                The dictionary itself is not modified.

        Returns:
            threshold: Threshold
                A new instance of the subclass registered for the given `type`.

        Raises:
            InvalidArgumentsException: raised when `type` is missing or does not match a registered subclass.
        """
        # Work on a shallow copy so the caller's dictionary keeps its 'type' entry.
        params = dict(object)
        threshold_type = params.pop('type', '')

        try:
            threshold_cls = cls._registry[threshold_type]
        except KeyError:
            accepted_values = ', '.join(map(repr, cls._registry))
            raise InvalidArgumentsException(
                f"Expected one of {accepted_values} for threshold type, but received '{threshold_type}'"
            )

        return threshold_cls(**params)


class ConstantThreshold(Threshold, type="constant"):
    """A `Threshold` implementation that returns a constant lower and/or upper threshold value.

    Attributes:
        lower: Optional[float]
            The constant lower threshold value. Defaults to `None`, meaning there is no lower threshold.
        upper: Optional[float]
            The constant upper threshold value. Defaults to `None`, meaning there is no upper threshold.

    Raises:
        InvalidArgumentsException: raised when an argument was given using an incorrect type
        ThresholdException: raised when the ConstantThreshold could not be created using the given argument values

    Examples:
        >>> data = np.array(range(10))
        >>> t = ConstantThreshold(lower=None, upper=0.1)
        >>> lower, upper = t.thresholds(data)
        >>> print(lower, upper)
        None 0.1
    """

    def __init__(self, lower: Optional[Union[float, int]] = None, upper: Optional[Union[float, int]] = None):
        """Creates a new ConstantThreshold instance.

        Args:
            lower: Optional[Union[float, int]], default=None
                The constant lower threshold value. Defaults to `None`, meaning there is no lower threshold.
            upper: Optional[Union[float, int]], default=None
                The constant upper threshold value. Defaults to `None`, meaning there is no upper threshold.

        Raises:
            InvalidArgumentsException: raised when an argument was given using an incorrect type
            ThresholdException: raised when the ConstantThreshold could not be created using the given argument values
        """
        self._validate_inputs(lower, upper)

        self.lower = lower
        self.upper = upper

    def thresholds(self, data: np.ndarray, **kwargs) -> Tuple[Optional[float], Optional[float]]:
        """Returns the configured constant lower and upper threshold values.

        The `data` argument is accepted to satisfy the `Threshold` interface but is not used.
        """
        return self.lower, self.upper

    @staticmethod
    def _validate_inputs(lower: Optional[Union[float, int]] = None, upper: Optional[Union[float, int]] = None):
        """Checks the types of both values and that `lower` is strictly less than `upper` when both are given."""
        # `bool` is a subclass of `int` and would pass the isinstance check, so it is rejected explicitly.
        if isinstance(lower, bool) or (lower is not None and not isinstance(lower, (float, int))):
            raise InvalidArgumentsException(
                f"expected type of 'lower' to be 'float', 'int' or None but got '{type(lower).__name__}'"
            )

        if isinstance(upper, bool) or (upper is not None and not isinstance(upper, (float, int))):
            raise InvalidArgumentsException(
                f"expected type of 'upper' to be 'float', 'int' or None but got '{type(upper).__name__}'"
            )

        # explicit None check is required due to special interpretation of the value 0.0 as False
        if lower is not None and upper is not None and lower >= upper:
            raise ThresholdException(f"lower threshold {lower} must be less than upper threshold {upper}")


class StandardDeviationThreshold(Threshold, type="standard_deviation"):
    """A Thresholder that offsets the mean of an array by a multiple of the standard deviation of the array values.

    This thresholder will take the aggregate of an array of values, the mean by default and add or subtract an offset
    to get the upper and lower threshold values.
    This offset is calculated as a multiplier, by default 3, times the standard deviation of the given array.

    Attributes:
        std_lower_multiplier: Optional[Union[float, int]]
            Multiplier applied to the standard deviation to form the lower offset. `None` disables the lower threshold.
        std_upper_multiplier: Optional[Union[float, int]]
            Multiplier applied to the standard deviation to form the upper offset. `None` disables the upper threshold.
        offset_from: Callable[[np.ndarray], Any]
            The function aggregating the input array into the value the offsets are applied to.

    Examples:
        >>> data = np.array(range(10))
        >>> t = StandardDeviationThreshold()
        >>> lower, upper = t.thresholds(data)
        >>> print(lower, upper)
        -4.116843969807043 13.116843969807043
    """

    def __init__(
        self,
        std_lower_multiplier: Optional[Union[float, int]] = 3,
        std_upper_multiplier: Optional[Union[float, int]] = 3,
        offset_from: Callable[[np.ndarray], Any] = np.nanmean,
    ):
        """Creates a new StandardDeviationThreshold instance.

        Args:
            std_lower_multiplier: float, default=3
                The number the standard deviation of the input array will be multiplied with to form the lower offset.
                This value will be subtracted from the aggregate of the input array.
                Defaults to 3.
            std_upper_multiplier: float, default=3
                The number the standard deviation of the input array will be multiplied with to form the upper offset.
                This value will be added to the aggregate of the input array.
                Defaults to 3.
            offset_from: Callable[[np.ndarray], Any], default=np.nanmean
                A function that will be applied to the input array to aggregate it into a single value.
                Adding the upper offset to this value will yield the upper threshold, subtracting the lower offset
                will yield the lower threshold.

        Raises:
            InvalidArgumentsException: raised when a multiplier is not a float, an int or None.
            ThresholdException: raised when a multiplier is negative.
        """

        self._validate_inputs(std_lower_multiplier, std_upper_multiplier)

        self.std_lower_multiplier = std_lower_multiplier
        self.std_upper_multiplier = std_upper_multiplier
        self.offset_from = offset_from

    def thresholds(self, data: np.ndarray, **kwargs) -> Tuple[Optional[float], Optional[float]]:
        """Returns the aggregate of `data` minus/plus the configured multiples of its standard deviation.

        The standard deviation is computed with `np.nanstd`. A side whose multiplier is `None` yields `None`.
        """
        aggregate = self.offset_from(data)
        std = np.nanstd(data)

        lower_threshold = None
        if self.std_lower_multiplier is not None:
            lower_threshold = aggregate - std * self.std_lower_multiplier

        upper_threshold = None
        if self.std_upper_multiplier is not None:
            upper_threshold = aggregate + std * self.std_upper_multiplier

        return lower_threshold, upper_threshold

    @staticmethod
    def _validate_multiplier(name: str, value: Optional[Union[float, int]]) -> None:
        """Checks a single multiplier: it must be a float, an int or None and must not be negative."""
        # `bool` is a subclass of `int` and would pass the isinstance check, so it is rejected explicitly.
        if isinstance(value, bool) or (value is not None and not isinstance(value, (float, int))):
            raise InvalidArgumentsException(
                f"expected type of '{name}' to be 'float', 'int' or None but got '{type(value).__name__}'"
            )

        # Only negative values are rejected; a multiplier of 0 is accepted.
        if value is not None and value < 0:
            raise ThresholdException(f"'{name}' should be greater than 0 but got value {value}")

    @staticmethod
    def _validate_inputs(
        std_lower_multiplier: Optional[Union[float, int]] = 3, std_upper_multiplier: Optional[Union[float, int]] = 3
    ):
        """Validates the lower multiplier first and then the upper multiplier."""
        StandardDeviationThreshold._validate_multiplier('std_lower_multiplier', std_lower_multiplier)
        StandardDeviationThreshold._validate_multiplier('std_upper_multiplier', std_upper_multiplier)


def calculate_threshold_values(
    threshold: "Threshold",
    data: np.ndarray,
    lower_threshold_value_limit: Optional[float] = None,
    upper_threshold_value_limit: Optional[float] = None,
    override_using_none: bool = False,
    logger: Optional[logging.Logger] = None,
    metric_name: Optional[str] = None,
) -> Tuple[Optional[float], Optional[float]]:
    """Calculate lower and upper threshold values with respect to the provided Threshold and value limits.

    Parameters:
        threshold: Threshold
            The Threshold instance that determines how the lower and upper threshold values will be calculated.
        data: np.ndarray
            The data used by the Threshold instance to calculate the lower and upper threshold values.
            This will often be the values of a drift detection method or performance metric on chunks of reference data.
        lower_threshold_value_limit: Optional[float], default=None
            An optional value that serves as a limit for the lower threshold value. Any calculated lower threshold
            value that ends up at or below this limit will be replaced by this limit value.
            The limit is often a theoretical constraint enforced by a specific drift detection method or performance
            metric.
        upper_threshold_value_limit: Optional[float], default=None
            An optional value that serves as a limit for the upper threshold value. Any calculated upper threshold
            value that ends up at or above this limit will be replaced by this limit value.
            The limit is often a theoretical constraint enforced by a specific drift detection method or performance
            metric.
        override_using_none: bool, default=False
            When set to True use None to override threshold values that exceed value limits.
            This will prevent them from being rendered on plots.
        logger: Optional[logging.Logger], default=None
            An optional Logger instance. When provided a warning will be logged when a calculated threshold value
            gets overridden by a threshold value limit.
        metric_name: Optional[str], default=None
            When provided the metric name will be included within any log messages for additional clarity.

    Returns:
        lower, upper: Tuple[Optional[float], Optional[float]]
            The lower and upper threshold values after applying the limits. One or both might be `None`.
    """

    lower_threshold_value, upper_threshold_value = threshold.thresholds(data)

    # Shared prefix for both log messages; empty when no metric name was given.
    log_prefix = metric_name + ' ' if metric_name else ''

    lower_exceeds_limit = (
        lower_threshold_value_limit is not None
        and lower_threshold_value is not None
        and lower_threshold_value <= lower_threshold_value_limit
    )
    if lower_exceeds_limit:
        override_value = None if override_using_none else lower_threshold_value_limit
        if logger:
            logger.warning(
                f"{log_prefix}lower threshold value {lower_threshold_value} "
                f"overridden by lower threshold value limit {override_value}"
            )
        lower_threshold_value = override_value

    upper_exceeds_limit = (
        upper_threshold_value_limit is not None
        and upper_threshold_value is not None
        and upper_threshold_value >= upper_threshold_value_limit
    )
    if upper_exceeds_limit:
        override_value = None if override_using_none else upper_threshold_value_limit
        if logger:
            logger.warning(
                f"{log_prefix}upper threshold value {upper_threshold_value} "
                f"overridden by upper threshold value limit {override_value}"
            )
        upper_threshold_value = override_value

    return lower_threshold_value, upper_threshold_value