Source code for nannyml.performance_estimation.base

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0

"""Module containing base classes for performance estimation."""
from __future__ import annotations

import abc
from typing import List

import pandas as pd
import plotly.graph_objects as go

from nannyml.chunk import Chunker, CountBasedChunker, DefaultChunker, PeriodBasedChunker, SizeBasedChunker
from nannyml.metadata import ModelMetadata


class PerformanceEstimatorResult(abc.ABC):
    """Holds the output of a performance estimation run and offers extra functionality on top of it."""

    def __init__(self, estimated_data: pd.DataFrame, model_metadata: ModelMetadata):
        """Creates a new PerformanceEstimatorResult instance.

        Parameters
        ----------
        estimated_data: pd.DataFrame
            The results of the
            :meth:`~nannyml.performance_estimation.base.PerformanceEstimator.estimate` call.
            A deep copy is stored in ``self.data``.
        model_metadata: ModelMetadata
            The metadata describing the monitored model. Stored as-is in ``self.metadata``.
        """
        # Deep copy so that later edits to the caller's DataFrame do not leak into this result.
        snapshot = estimated_data.copy(deep=True)
        self.data = snapshot
        self.metadata = model_metadata

    def plot(self, *args, **kwargs) -> go.Figure:
        """Plot the performance estimation results.

        The base implementation does not produce a figure.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError
class PerformanceEstimator(abc.ABC):
    """Base class for performance estimators."""

    def __init__(
        self,
        model_metadata: ModelMetadata,
        features: List[str] = None,
        chunk_size: int = None,
        chunk_number: int = None,
        chunk_period: str = None,
        chunker: Chunker = None,
    ):
        """Creates a new instance of a performance estimator.

        Parameters
        ----------
        model_metadata: ModelMetadata
            Metadata describing the monitored model. Stored as ``self.model_metadata``.
        features: List[str]
            An optional list of feature column names, stored as ``self.selected_features``.
            When omitted or empty, it defaults to the ``column_name`` of every entry in
            ``model_metadata.features``.
        chunk_size: int
            Splits the data into chunks containing `chunk_size` observations.
            Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
        chunk_number: int
            Splits the data into `chunk_number` pieces.
            Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
        chunk_period: str
            Splits the data according to the given period.
            Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
        chunker : Chunker
            The `Chunker` used to split the data sets into a list of chunks.

        Notes
        -----
        The chunker is resolved in this order: an explicit `chunker`, then `chunk_size`,
        `chunk_number`, `chunk_period`, and finally a `DefaultChunker`. Falsy values for the
        three `chunk_*` arguments (``None``, ``0``, ``""``) are treated as not given.
        """
        self.model_metadata = model_metadata

        # `or` keeps the caller's list when it is non-empty and only then falls back to all features.
        self.selected_features = features or [feature.column_name for feature in self.model_metadata.features]

        # An explicitly supplied chunker always wins over the convenience arguments.
        if chunker is not None:
            self.chunker = chunker  # type: ignore
        elif chunk_size:
            self.chunker = SizeBasedChunker(chunk_size=chunk_size)  # type: ignore
        elif chunk_number:
            self.chunker = CountBasedChunker(chunk_count=chunk_number)  # type: ignore
        elif chunk_period:
            self.chunker = PeriodBasedChunker(offset=chunk_period)  # type: ignore
        else:
            self.chunker = DefaultChunker()  # type: ignore

    def fit(self, reference_data: pd.DataFrame) -> PerformanceEstimator:
        """Fits the estimator on a reference data set.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError

    def estimate(self, data: pd.DataFrame) -> PerformanceEstimatorResult:
        """Estimate performance given a data set lacking ground truth.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError