Source code for nannyml.sampling_error.summary_stats

#  Author:   Nikolaos Perrakis  <nikos@nannyml.com>
#
#  License: Apache Software License 2.0

import warnings
from logging import getLogger
from typing import Tuple

import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde, moment

logger = getLogger(__name__)


[docs]def summary_stats_std_sampling_error_components(col: pd.Series) -> Tuple: """ Calculate sampling error components for Summary Stats Standard Deviation using reference data. Parameters ---------- col: pd.Series column for which we are calculating sampling error components Returns ------- (std, moment_4th): Tuple[np.ndarray] """ std = col.std() moment_4th = moment(col.to_numpy(), 4) return (std, moment_4th)
[docs]def summary_stats_std_sampling_error(sampling_error_components, col) -> float: """ Calculate sampling error for Summary Stats Standard Deviation using reference data. Standard Error of Standard Deviation, https://stats.stackexchange.com/a/157305 CR Rao (1973) Linear Statistical Inference and its Applications 2nd Ed, John Wiley & Sons, NY Parameters ---------- sampling_error_components: a set of parameters that were derived from reference data. col: the (analysis) column you want to calculate sampling error for. Returns ------- sampling_error: float """ _std = sampling_error_components[0] _mu4 = sampling_error_components[1] _size = col.shape[0] err_var_parenthesis_part = _mu4 - ((_size - 3) * (_std**4) / (_size - 1)) if not (np.isfinite(err_var_parenthesis_part) and err_var_parenthesis_part >= 0): logger.debug( "Summary Stats sampling error calculation imputed to nan because of non finite positive parenthesis factor." ) return np.nan err_var = np.sqrt((1 / _size) * err_var_parenthesis_part) return (1 / (2 * _std)) * err_var
[docs]def summary_stats_median_sampling_error_components(col: pd.Series) -> Tuple: """ Calculate sampling error components for Summary Stats Median using reference data. Parameters ---------- col: pd.Series column for which we are calculating sampling error components Returns ------- (median, pdf(median): Tuple[np.ndarray] """ median = col.median() try: kernel = gaussian_kde(col) fmedian = kernel.evaluate(median)[0] except np.linalg.LinAlgError as ex: logger.warning("Suppressing LinAlgError in summary_stats_median_sampling_error_components: %r", ex) warnings.warn(f"Suppressing LinAlgError in summary_stats_median_sampling_error_components: {ex}") fmedian = np.inf return (median, fmedian)
[docs]def summary_stats_median_sampling_error(sampling_error_components, col) -> float: """ Calculate sampling error for Summary Stats Median using reference data. Using Asymptotic variance formula from https://stats.stackexchange.com/a/61759 https://en.wikipedia.org/wiki/Median#Sampling_distribution Parameters ---------- sampling_error_components : a set of parameters that were derived from reference data. col : the (analysis) column you want to calculate sampling error for. Returns ------- sampling_error: float """ fmedian = sampling_error_components[1] _size = col.shape[0] err = np.sqrt(1 / (4 * _size * (fmedian**2))) return err