Source code for nannyml.performance_calculation.metrics.regression

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0
"""Performance Calculation Regression Metrics Module."""

import warnings
from typing import Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    mean_squared_log_error,
)

from nannyml._typing import ProblemType
from nannyml.base import (
    _list_missing,
    _raise_exception_for_negative_values,
    common_nan_removal,
)
from nannyml.performance_calculation.metrics.base import Metric, MetricFactory
from nannyml.sampling_error.regression import (
    mae_sampling_error,
    mae_sampling_error_components,
    mape_sampling_error,
    mape_sampling_error_components,
    mse_sampling_error,
    mse_sampling_error_components,
    msle_sampling_error,
    msle_sampling_error_components,
    rmse_sampling_error,
    rmse_sampling_error_components,
    rmsle_sampling_error,
    rmsle_sampling_error_components,
)
from nannyml.thresholds import Threshold


[docs]@MetricFactory.register(metric="mae", use_case=ProblemType.REGRESSION) class MAE(Metric): """Mean Absolute Error metric.""" y_pred: str def __init__( self, y_true: str, y_pred: str, threshold: Threshold, y_pred_proba: Optional[str] = None, **kwargs, ): """Creates a new MAE instance. Parameters ---------- y_true: str The name of the column containing target values. y_pred: str The name of the column containing your model predictions. threshold: Threshold The Threshold instance that determines how the lower and upper threshold values will be calculated. y_pred_proba: Optional[str], default=None Name of the column containing your model output. """ super().__init__( name="mae", y_true=y_true, y_pred=y_pred, y_pred_proba=y_pred_proba, threshold=threshold, lower_threshold_limit=0, components=[("MAE", "mae")], ) # sampling error self._sampling_error_components: Tuple = ()
[docs] def __str__(self): """Get string representation of metric.""" return "MAE"
def _fit(self, reference_data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(reference_data.columns)) reference_data, empty = common_nan_removal( reference_data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: self._sampling_error_components = (np.nan,) else: self._sampling_error_components = mae_sampling_error_components( y_true_reference=reference_data[self.y_true], y_pred_reference=reference_data[self.y_pred], ) def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"No data or too many missing values, cannot calculate {self.display_name}. " f"Returning NaN." ) return np.nan y_true = data[self.y_true] y_pred = data[self.y_pred] return mean_absolute_error(y_true, y_pred) def _sampling_error(self, data: pd.DataFrame) -> float: data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"Too many missing values, cannot calculate {self.display_name} sampling error. " "Returning NaN." ) return np.nan else: return mae_sampling_error(self._sampling_error_components, data)
[docs]@MetricFactory.register(metric="mape", use_case=ProblemType.REGRESSION) class MAPE(Metric): """Mean Absolute Percentage Error metric.""" y_pred: str def __init__( self, y_true: str, y_pred: str, threshold: Threshold, y_pred_proba: Optional[str] = None, **kwargs, ): """Creates a new MAPE instance. Parameters ---------- y_true: str The name of the column containing target values. y_pred: str The name of the column containing your model predictions. threshold: Threshold The Threshold instance that determines how the lower and upper threshold values will be calculated. y_pred_proba: Optional[str], default=None Name of the column containing your model output. """ super().__init__( name="mape", y_true=y_true, y_pred=y_pred, y_pred_proba=y_pred_proba, threshold=threshold, lower_threshold_limit=0, components=[("MAPE", "mape")], ) # sampling error self._sampling_error_components: Tuple = ()
[docs] def __str__(self): """Get string representation of metric.""" return "MAPE"
def _fit(self, reference_data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(reference_data.columns)) reference_data, empty = common_nan_removal( reference_data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: self._sampling_error_components = (np.nan,) else: self._sampling_error_components = mape_sampling_error_components( y_true_reference=reference_data[self.y_true], y_pred_reference=reference_data[self.y_pred], ) def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"No data or too many missing values, cannot calculate {self.display_name}. " f"Returning NaN." ) return np.nan y_true = data[self.y_true] y_pred = data[self.y_pred] return mean_absolute_percentage_error(y_true, y_pred) def _sampling_error(self, data: pd.DataFrame) -> float: data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"Too many missing values, cannot calculate {self.display_name} sampling error. " "Returning NaN." ) return np.nan else: return mape_sampling_error(self._sampling_error_components, data)
[docs]@MetricFactory.register(metric="mse", use_case=ProblemType.REGRESSION) class MSE(Metric): """Mean Squared Error metric.""" y_pred: str def __init__( self, y_true: str, y_pred: str, threshold: Threshold, y_pred_proba: Optional[str] = None, **kwargs, ): """Creates a new MSE instance. Parameters ---------- y_true: str The name of the column containing target values. y_pred: str The name of the column containing your model predictions. threshold: Threshold The Threshold instance that determines how the lower and upper threshold values will be calculated. y_pred_proba: Optional[str], default=None Name of the column containing your model output. """ super().__init__( name="mse", y_true=y_true, y_pred=y_pred, y_pred_proba=y_pred_proba, threshold=threshold, lower_threshold_limit=0, components=[("MSE", "mse")], ) # sampling error self._sampling_error_components: Tuple = ()
[docs] def __str__(self): """Get string representation of metric.""" return "MSE"
def _fit(self, reference_data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(reference_data.columns)) reference_data, empty = common_nan_removal( reference_data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: self._sampling_error_components = (np.nan,) else: self._sampling_error_components = mse_sampling_error_components( y_true_reference=reference_data[self.y_true], y_pred_reference=reference_data[self.y_pred], ) def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"No data or too many missing values, cannot calculate {self.display_name}. " f"Returning NaN." ) return np.nan y_true = data[self.y_true] y_pred = data[self.y_pred] return mean_squared_error(y_true, y_pred) def _sampling_error(self, data: pd.DataFrame) -> float: data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"Too many missing values, cannot calculate {self.display_name} sampling error. " "Returning NaN." ) return np.nan else: return mse_sampling_error(self._sampling_error_components, data)
[docs]@MetricFactory.register(metric="msle", use_case=ProblemType.REGRESSION) class MSLE(Metric): """Mean Squared Logarithmic Error metric.""" y_pred: str def __init__( self, y_true: str, y_pred: str, threshold: Threshold, y_pred_proba: Optional[str] = None, **kwargs, ): """Creates a new MSLE instance. Parameters ---------- y_true: str The name of the column containing target values. y_pred: str The name of the column containing your model predictions. threshold: Threshold The Threshold instance that determines how the lower and upper threshold values will be calculated. y_pred_proba: Optional[str], default=None Name of the column containing your model output. """ super().__init__( name="msle", y_true=y_true, y_pred=y_pred, y_pred_proba=y_pred_proba, threshold=threshold, lower_threshold_limit=0, components=[("MSLE", "msle")], ) # sampling error self._sampling_error_components: Tuple = ()
[docs] def __str__(self): """Get string representation of metric.""" return "MSLE"
def _fit(self, reference_data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(reference_data.columns)) reference_data, empty = common_nan_removal( reference_data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: self._sampling_error_components = (np.nan,) else: self._sampling_error_components = msle_sampling_error_components( y_true_reference=reference_data[self.y_true], y_pred_reference=reference_data[self.y_pred], ) def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"No data or too many missing values, cannot calculate {self.display_name}. " f"Returning NaN." ) return np.nan y_true = data[self.y_true] y_pred = data[self.y_pred] # TODO: include option to drop negative values as well? _raise_exception_for_negative_values(y_true) _raise_exception_for_negative_values(y_pred) return mean_squared_log_error(y_true, y_pred) def _sampling_error(self, data: pd.DataFrame) -> float: data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"Too many missing values, cannot calculate {self.display_name} sampling error. " "Returning NaN." ) return np.nan else: return msle_sampling_error(self._sampling_error_components, data)
[docs]@MetricFactory.register(metric="rmse", use_case=ProblemType.REGRESSION) class RMSE(Metric): """Root Mean Squared Error metric.""" y_pred: str def __init__( self, y_true: str, y_pred: str, threshold: Threshold, y_pred_proba: Optional[str] = None, **kwargs, ): """Creates a new RMSE instance. Parameters ---------- y_true: str The name of the column containing target values. y_pred: str The name of the column containing your model predictions. threshold: Threshold The Threshold instance that determines how the lower and upper threshold values will be calculated. y_pred_proba: Optional[str], default=None Name of the column containing your model output. """ super().__init__( name="rmse", y_true=y_true, y_pred=y_pred, y_pred_proba=y_pred_proba, threshold=threshold, lower_threshold_limit=0, components=[("RMSE", "rmse")], ) # sampling error self._sampling_error_components: Tuple = ()
[docs] def __str__(self): """Get string representation of metric.""" return "RMSE"
def _fit(self, reference_data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(reference_data.columns)) reference_data, empty = common_nan_removal( reference_data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: self._sampling_error_components = (np.nan,) else: self._sampling_error_components = rmse_sampling_error_components( y_true_reference=reference_data[self.y_true], y_pred_reference=reference_data[self.y_pred], ) def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"No data or too many missing values, cannot calculate {self.display_name}. " f"Returning NaN." ) return np.nan y_true = data[self.y_true] y_pred = data[self.y_pred] # Deal with breaking API change in sklearn 1.4 # https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.root_mean_squared_error.html try: from sklearn.metrics import root_mean_squared_error return root_mean_squared_error(y_true, y_pred) except ImportError: from sklearn.metrics import mean_squared_error return mean_squared_error(y_true, y_pred, squared=False) def _sampling_error(self, data: pd.DataFrame) -> float: data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"Too many missing values, cannot calculate {self.display_name} sampling error. " "Returning NaN." ) return np.nan else: return rmse_sampling_error(self._sampling_error_components, data)
[docs]@MetricFactory.register(metric="rmsle", use_case=ProblemType.REGRESSION) class RMSLE(Metric): """Root Mean Squared Logarithmic Error metric.""" y_pred: str def __init__( self, y_true: str, y_pred: str, threshold: Threshold, y_pred_proba: Optional[str] = None, **kwargs, ): """Creates a new RMSLE instance. Parameters ---------- y_true: str The name of the column containing target values. y_pred: str The name of the column containing your model predictions. threshold: Threshold The Threshold instance that determines how the lower and upper threshold values will be calculated. y_pred_proba: Optional[str], default=None Name of the column containing your model output. """ super().__init__( name="rmsle", y_true=y_true, y_pred=y_pred, y_pred_proba=y_pred_proba, threshold=threshold, lower_threshold_limit=0, components=[("RMSLE", "rmsle")], ) # sampling error self._sampling_error_components: Tuple = ()
[docs] def __str__(self): """Get string representation of metric.""" return "RMSLE"
def _fit(self, reference_data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(reference_data.columns)) reference_data, empty = common_nan_removal( reference_data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: self._sampling_error_components = (np.nan,) else: self._sampling_error_components = rmsle_sampling_error_components( y_true_reference=reference_data[self.y_true], y_pred_reference=reference_data[self.y_pred], ) def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"No data or too many missing values, cannot calculate {self.display_name}. " f"Returning NaN." ) return np.nan y_true = data[self.y_true] y_pred = data[self.y_pred] # TODO: include option to drop negative values as well? _raise_exception_for_negative_values(y_true) _raise_exception_for_negative_values(y_pred) # Deal with breaking API change in sklearn 1.4 # https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.root_mean_squared_log_error.html try: from sklearn.metrics import root_mean_squared_log_error return root_mean_squared_log_error(y_true, y_pred) except ImportError: from sklearn.metrics import mean_squared_log_error return mean_squared_log_error(y_true, y_pred, squared=False) def _sampling_error(self, data: pd.DataFrame) -> float: data, empty = common_nan_removal( data[[self.y_true, self.y_pred]], [self.y_true, self.y_pred] ) if empty: warnings.warn( f"Too many missing values, cannot calculate {self.display_name} sampling error. " "Returning NaN." ) return np.nan else: return rmsle_sampling_error(self._sampling_error_components, data)