Source code for nannyml.metadata.regression

#  Author:   Niels Nuyttens  <niels@nannyml.com>
#
#  License: Apache Software License 2.0
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd

from nannyml.metadata.base import ModelMetadata, ModelType, _check_for_nan, _extract_features
from nannyml.metadata.feature import FeatureType

NML_METADATA_PREDICTION_COLUMN_NAME = 'nml_meta_prediction'


class RegressionMetadata(ModelMetadata):
    """Model metadata for regression models.

    Extends :class:`ModelMetadata` with the name of the column that holds the
    model's predicted value, and makes that column part of the dictionary /
    DataFrame representations, the enriched data and the completeness check.
    """

    def __init__(self, prediction_column_name: str = None, *args, **kwargs):
        """Create a new metadata instance for a regression model.

        Parameters
        ----------
        prediction_column_name : str, optional
            Name of the column containing the model predictions. Defaults to ``None``.
        *args, **kwargs
            Forwarded unchanged to ``ModelMetadata.__init__`` after the model type.
        """
        super().__init__(ModelType.REGRESSION, *args, **kwargs)
        self._prediction_column_name = prediction_column_name

    @property
    def prediction_column_name(self):  # noqa: D102
        return self._prediction_column_name

    @prediction_column_name.setter
    def prediction_column_name(self, column_name: str):  # noqa: D102
        self._prediction_column_name = column_name
        # Delegates to the inherited helper so the prediction column is not
        # also kept around as a model feature.
        self._remove_from_features(column_name)

    @property
    def metadata_columns(self):
        """Return the list of metadata column names defined by this class."""
        return [NML_METADATA_PREDICTION_COLUMN_NAME]

    def to_dict(self) -> Dict[str, Any]:
        """Return the parent's dictionary representation plus ``prediction_column_name``."""
        res = super().to_dict()
        res['prediction_column_name'] = self.prediction_column_name
        return res

    def to_df(self) -> pd.DataFrame:
        """Return the parent's DataFrame representation plus a row for the prediction column.

        The extra row carries the keys ``label``, ``column_name``, ``type`` and
        ``description``. The result has a fresh, zero-based index.
        """
        res = super().to_df()
        prediction_row = pd.DataFrame(
            [
                {
                    'label': 'prediction_column_name',
                    'column_name': self.prediction_column_name,
                    'type': FeatureType.CONTINUOUS.value,
                    'description': 'predicted value',
                }
            ]
        )
        # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` with
        # ``ignore_index=True`` yields the same rows with a fresh RangeIndex.
        return pd.concat([res, prediction_row], ignore_index=True)

    def enrich(self, data: pd.DataFrame) -> pd.DataFrame:
        """Add the prediction metadata column to the parent's enriched copy of ``data``.

        The column named by ``NML_METADATA_PREDICTION_COLUMN_NAME`` is filled
        from ``data[self.prediction_column_name]`` when that column is present
        in ``data`` and with NaN otherwise.
        """
        df = super().enrich(data)
        if self.prediction_column_name in data.columns:
            df[NML_METADATA_PREDICTION_COLUMN_NAME] = data[self.prediction_column_name]
        else:
            # ``np.nan`` is the canonical spelling; the ``np.NAN`` alias is gone in NumPy 2.0.
            df[NML_METADATA_PREDICTION_COLUMN_NAME] = np.nan
        return df

    def is_complete(self) -> Tuple[bool, List[str]]:
        """Check whether all required metadata, including the prediction column, is set.

        Returns
        -------
        Tuple[bool, List[str]]
            The parent's result, with the flag forced to ``False`` and
            ``'prediction_column_name'`` appended to the list of missing
            properties when no prediction column name has been set.
        """
        ok, missing = super().is_complete()
        if self.prediction_column_name is None:
            ok = False
            missing.append('prediction_column_name')
        return ok, missing

    def extract(self, data: pd.DataFrame, model_name: str = None, exclude_columns: List[str] = None):
        """Derive metadata from ``data``, guessing the prediction column by its name.

        Parameters
        ----------
        data : pd.DataFrame
            The data to inspect.
        model_name : str, optional
            Forwarded to the parent ``extract``.
        exclude_columns : List[str], optional
            Columns that must not be treated as features. The list is not modified.

        Returns
        -------
        This instance, or ``None`` when the parent ``extract`` returns ``None``.
        """
        if super().extract(data, model_name, exclude_columns) is None:
            return None

        predictions = _guess_predictions(data)
        _check_for_nan(data, predictions)
        # When several columns look like predictions, the first match wins.
        self.prediction_column_name = predictions[0] if predictions else None  # type: ignore

        # Work on a copy: extending ``exclude_columns`` in place would leak the
        # prediction column into the caller's list.
        not_feature_cols = list(exclude_columns) if exclude_columns else []
        if self.prediction_column_name:
            not_feature_cols.append(self.prediction_column_name)

        self.features = _extract_features(data, exclude_columns=not_feature_cols)

        return self
def _guess_predictions(data: pd.DataFrame) -> List[str]:
    """Return the columns of ``data`` whose name looks like a prediction column.

    A column matches when its label is one of ``'p'``, ``'pred'``,
    ``'prediction'``, ``'out'``, ``'output'`` or ``'y_pred'``.

    Parameters
    ----------
    data : pd.DataFrame
        The data whose column labels are inspected. Column values are never read.

    Returns
    -------
    List[str]
        Matching column labels in column order, one entry per matching column.
    """
    candidate_names = ('p', 'pred', 'prediction', 'out', 'output', 'y_pred')

    # Compare the labels directly instead of materializing ``data[col]`` just to
    # read ``.name``: it avoids a per-column Series lookup and keeps working when
    # the frame has duplicate column labels (``data[col]`` is then a DataFrame,
    # which has no ``name`` attribute).
    return [col for col in data.columns if col in candidate_names]