# Source code for nannyml.drift.target.target_distribution.calculator

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0

"""Module for target distribution monitoring."""
from __future__ import annotations

import warnings
from typing import Dict

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

from nannyml.chunk import Chunker, CountBasedChunker, DefaultChunker, PeriodBasedChunker, SizeBasedChunker
from nannyml.drift.target.target_distribution.result import TargetDistributionResult
from nannyml.exceptions import InvalidArgumentsException
from nannyml.metadata.base import (
    NML_METADATA_COLUMNS,
    NML_METADATA_PARTITION_COLUMN_NAME,
    NML_METADATA_TARGET_COLUMN_NAME,
    ModelMetadata,
)
from nannyml.preprocessing import preprocess


class TargetDistributionCalculator:
    """Calculates target distribution for a given dataset."""

    def __init__(
        self,
        model_metadata: ModelMetadata,
        chunk_size: int | None = None,
        chunk_number: int | None = None,
        chunk_period: str | None = None,
        chunker: Chunker | None = None,
    ):
        """Constructs a new TargetDistributionCalculator.

        Parameters
        ----------
        model_metadata: ModelMetadata
            Metadata for the model whose data is to be processed.
        chunk_size: int, default=None
            Splits the data into chunks containing `chunks_size` observations.
            Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
        chunk_number: int, default=None
            Splits the data into `chunk_number` pieces.
            Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
        chunk_period: str, default=None
            Splits the data according to the given period.
            Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
        chunker : Chunker, default=None
            The `Chunker` used to split the data sets into a lists of chunks.
            Takes precedence over the `chunk_size`/`chunk_number`/`chunk_period` shortcuts.

        Examples
        --------
        >>> import nannyml as nml
        >>> ref_df, ana_df, _ = nml.load_synthetic_binary_classification_dataset()
        >>> metadata = nml.extract_metadata(ref_df, model_type=nml.ModelType.CLASSIFICATION_BINARY)
        >>> # Create a calculator that will chunk by week
        >>> target_distribution_calc = nml.TargetDistributionCalculator(model_metadata=metadata, chunk_period='W')
        """
        self.metadata = model_metadata

        if chunker is None:
            # Note:
            # minimum chunk size is only needed if a chunker with a user specified
            # minimum chunk size is not provided
            if chunk_size:
                self.chunker = SizeBasedChunker(chunk_size=chunk_size)  # type: ignore
            elif chunk_number:
                self.chunker = CountBasedChunker(chunk_count=chunk_number)  # type: ignore
            elif chunk_period:
                self.chunker = PeriodBasedChunker(offset=chunk_period)  # type: ignore
            else:
                self.chunker = DefaultChunker()  # type: ignore
        else:
            self.chunker = chunker  # type: ignore

        # Reference target values, populated by `fit`.
        self._reference_targets: pd.Series = None  # type: ignore

        # TODO: determine better min_chunk_size for target distribution
        self._minimum_chunk_size = 300

    def fit(self, reference_data: pd.DataFrame) -> TargetDistributionCalculator:
        """Fits the calculator to reference data.

        During fitting the reference target data is validated and stored for later use.

        Parameters
        ----------
        reference_data: pd.DataFrame
            Reference data for the model, must contain the configured target column.

        Returns
        -------
        calculator: TargetDistributionCalculator
            The fitted calculator (fluent interface).

        Raises
        ------
        InvalidArgumentsException
            When `reference_data` is empty or lacks the target column.

        Examples
        --------
        >>> import nannyml as nml
        >>> ref_df, ana_df, _ = nml.load_synthetic_binary_classification_dataset()
        >>> metadata = nml.extract_metadata(ref_df, model_type=nml.ModelType.CLASSIFICATION_BINARY)
        >>> target_distribution_calc = nml.TargetDistributionCalculator(model_metadata=metadata, chunk_period='W')
        >>> # fit the calculator on reference data
        >>> target_distribution_calc.fit(ref_df)
        """
        if reference_data.empty:
            raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

        if self.metadata.target_column_name not in reference_data.columns:
            raise InvalidArgumentsException(
                f"data does not contain target data column '{self.metadata.target_column_name}'."
            )

        self._reference_targets = preprocess(data=reference_data, metadata=self.metadata, reference=True)[
            NML_METADATA_TARGET_COLUMN_NAME
        ]

        return self

    def calculate(self, data: pd.DataFrame):
        """Calculates the target distribution of a binary classifier.

        Requires fitting the calculator on reference data first.

        Parameters
        ----------
        data: pd.DataFrame
            Data for the model, i.e. model inputs, predictions and targets.

        Returns
        -------
        result: TargetDistributionResult
            Per-chunk target distribution metrics and drift statistics.

        Raises
        ------
        InvalidArgumentsException
            When `data` is empty or lacks the target column.

        Examples
        --------
        >>> import nannyml as nml
        >>> ref_df, ana_df, _ = nml.load_synthetic_binary_classification_dataset()
        >>> metadata = nml.extract_metadata(ref_df, model_type=nml.ModelType.CLASSIFICATION_BINARY)
        >>> target_distribution_calc = nml.TargetDistributionCalculator(model_metadata=metadata, chunk_period='W')
        >>> target_distribution_calc.fit(ref_df)
        >>> # calculate target distribution
        >>> target_distribution = target_distribution_calc.calculate(ana_df)
        """
        if data.empty:
            raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

        if self.metadata.target_column_name not in data.columns:
            raise InvalidArgumentsException(
                f"data does not contain target data column '{self.metadata.target_column_name}'."
            )

        # Preprocess data
        data = preprocess(data=data, metadata=self.metadata)

        # Flag rows with a missing target so the per-chunk missing rate can be computed.
        data['NML_TARGET_INCOMPLETE'] = data[NML_METADATA_TARGET_COLUMN_NAME].isna().astype(np.int16)

        # Generate chunks
        features_and_metadata = NML_METADATA_COLUMNS + ['NML_TARGET_INCOMPLETE']
        chunks = self.chunker.split(data, columns=features_and_metadata, minimum_chunk_size=self._minimum_chunk_size)

        # Construct result frame
        res = pd.DataFrame.from_records(
            [
                {
                    'key': chunk.key,
                    'start_index': chunk.start_index,
                    'end_index': chunk.end_index,
                    'start_date': chunk.start_datetime,
                    'end_date': chunk.end_datetime,
                    # chunks straddling the reference/analysis boundary are treated as analysis
                    'partition': 'analysis' if chunk.is_transition else chunk.partition,
                    'targets_missing_rate': (
                        chunk.data['NML_TARGET_INCOMPLETE'].sum() / chunk.data['NML_TARGET_INCOMPLETE'].count()
                    ),
                    **_calculate_target_drift_for_chunk(self._reference_targets, chunk.data),
                }
                for chunk in chunks
            ]
        )

        return TargetDistributionResult(target_distribution=res, model_metadata=self.metadata)
def _calculate_target_drift_for_chunk(reference_targets: pd.Series, data: pd.DataFrame) -> Dict:
    """Computes drift statistics for a single chunk's targets versus the reference targets.

    Parameters
    ----------
    reference_targets: pd.Series
        Target values from the reference period.
    data: pd.DataFrame
        The chunk's (preprocessed) data, containing the metadata target and partition columns.

    Returns
    -------
    drift: Dict
        Keys: ``metric_target_drift`` (mean of the chunk targets, or ``np.nan`` when the targets
        are non-binary or non-numerical), ``statistical_target_drift`` and ``p_value``
        (chi-squared contingency test), ``thresholds``, ``alert`` and ``significant``.
    """
    targets = data[NML_METADATA_TARGET_COLUMN_NAME]

    # Chi-squared test on the reference-vs-chunk contingency table of target value counts.
    # Categories missing from one side get a zero count via fillna(0).
    statistic, p_value, _, _ = chi2_contingency(
        pd.concat([reference_targets.value_counts(), targets.value_counts()], axis=1).fillna(0)
    )

    _ALERT_THRESHOLD_P_VALUE = 0.05

    # Only raise alerts for chunks that (at least partially) belong to the analysis period.
    is_analysis = 'analysis' in set(data[NML_METADATA_PARTITION_COLUMN_NAME].unique())

    # NOTE: the original local was named `is_binary_targets`, but the condition is true
    # exactly when the targets are NOT binary — renamed to match its meaning.
    has_non_binary_targets = targets.nunique() > 2
    if has_non_binary_targets:
        warnings.warn(
            f"the target column contains {targets.nunique()} unique values. "
            "NannyML cannot provide a value for 'metric_target_drift' "
            "when there are more than 2 unique values. "
            "All 'metric_target_drift' values will be set to np.NAN"
        )

    has_string_targets = targets.dtype == 'object' or targets.dtype == 'string'
    if has_string_targets:
        warnings.warn(
            "the target column contains non-numerical values. NannyML cannot provide a value for "
            "'metric_target_drift'."
            "All 'metric_target_drift' values will be set to np.NAN"
        )

    return {
        # np.NAN alias was removed in NumPy 2.0; np.nan is the canonical spelling
        'metric_target_drift': targets.mean() if not (has_non_binary_targets or has_string_targets) else np.nan,
        'statistical_target_drift': statistic,
        'p_value': p_value,
        'thresholds': _ALERT_THRESHOLD_P_VALUE,
        'alert': (p_value < _ALERT_THRESHOLD_P_VALUE) and is_analysis,
        'significant': p_value < _ALERT_THRESHOLD_P_VALUE,
    }