Source code for nannyml.datasets.datasets

#  Author:  Nikolaos Perrakis  <nikos@nannyml.com>
#
#  License: Apache Software License 2.0

"""Utility module offering curated datasets for quick experimentation."""

from importlib import resources

from pandas import DataFrame, read_csv, read_parquet

DATA_MODULE = "nannyml.datasets.data"


[docs]def load_csv_file_to_df(local_file: str) -> DataFrame:
    """Loads a data file from within the NannyML package.

    Parameters
    ----------
    local_file : str, required
        string with the name of the data file to be loaded.

    Returns
    -------
    df: pd.DataFrame
        A DataFrame containing the requested data
    """
    with resources.path(DATA_MODULE, local_file) as data:
        return read_csv(data)


[docs]def load_pq_file_to_df(local_file: str) -> DataFrame:
    """Loads a data file from within the NannyML package.

    Parameters
    ----------
    local_file : str, required
        string with the name of the data file to be loaded.

    Returns
    -------
    df: pd.DataFrame
        A DataFrame containing the requested data
    """
    with resources.path(DATA_MODULE, local_file) as data:
        return read_parquet(data)


[docs]def load_synthetic_binary_classification_dataset():
    """Loads the synthetic binary classification dataset provided for testing the NannyML package.

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of synthetic binary classification dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of synthetic binary classification dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of synthetic binary
        classification dataset

    Examples
    --------
    >>> from nannyml.datasets import load_synthetic_binary_classification_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_synthetic_binary_classification_dataset()

    """
    reference = load_csv_file_to_df('synthetic_sample_reference.csv')
    analysis = load_csv_file_to_df('synthetic_sample_analysis.csv')
    analysis_gt = load_csv_file_to_df('synthetic_sample_analysis_gt.csv')

    return reference, analysis, analysis_gt


[docs]def load_titanic_dataset():
    """Loads the titanic the NannyML package.

    The dataset has been created by combining two sources, the kaggle dataset[1] and the
    data world dataset[2]. Note that we have made the reference period align with the kaggle
    train set and the analysis period align with the kaggle test set.


    [1]: https://www.kaggle.com/competitions/titanic/data
    [2]: https://data.world/nrippner/titanic-disaster-dataset


    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of the titanic dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of the titanic dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of
        the titanic dataset

    Examples
    --------
    >>> from nannyml.datasets import load_titanic_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_titanic_dataset()

    """
    reference = load_csv_file_to_df('titanic_reference.csv')
    analysis = load_csv_file_to_df('titanic_analysis.csv')
    analysis_gt = load_csv_file_to_df('titanic_target.csv')

    return reference, analysis, analysis_gt


[docs]def load_synthetic_multiclass_classification_dataset():
    """Loads the synthetic multiclass classification dataset provided for testing the NannyML package.

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of synthetic multiclass classification dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of synthetic multiclass classification dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of synthetic
        multiclass classification dataset

    Examples
    --------
    >>> from nannyml.datasets import load_synthetic_multiclass_classification_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_synthetic_multiclass_classification_dataset()

    """
    reference = load_csv_file_to_df('mc_reference.csv')
    analysis = load_csv_file_to_df('mc_analysis.csv')
    analysis_gt = load_csv_file_to_df('mc_analysis_gt.csv')

    return reference, analysis, analysis_gt


[docs]def load_modified_california_housing_dataset():
    """Loads the modified california housing dataset provided for testing the NannyML package.

    This dataset has been altered to represent a binary classification problem over time.
    More information about the dataset can be found at:
    :ref:`dataset-california`

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of modified california housing dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of modified california housing dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of modified california housing dataset

    Examples
    --------
    >>> from nannyml.datasets import load_modified_california_housing_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_modified_california_housing_dataset()
    """
    reference = load_csv_file_to_df('california_housing_reference.csv')
    analysis = load_csv_file_to_df('california_housing_analysis.csv')
    analysis_gt = load_csv_file_to_df('california_housing_analysis_gt.csv')

    return reference, analysis, analysis_gt


[docs]def load_synthetic_car_loan_dataset():
    """Loads the synthetic car loan binary classification dataset provided for testing the NannyML package.

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of synthetic binary classification dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of synthetic binary classification dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of synthetic binary
        classification dataset

    Examples
    --------
    >>> from nannyml.datasets import load_synthetic_car_loan_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_synthetic_car_loan_dataset()

    """
    reference = load_csv_file_to_df('synthetic_car_loan_reference.csv')
    analysis = load_csv_file_to_df('synthetic_car_loan_analysis.csv')
    analysis_gt = load_csv_file_to_df('synthetic_car_loan_analysis_target.csv')

    return reference, analysis, analysis_gt


[docs]def load_synthetic_car_loan_data_quality_dataset():
    """Loads the synthetic car loan binary classification dataset that contains missing values
    provided for testing the NannyML package.

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of
        synthetic car loan binary classification dataset that contains missing values
    analysis : pd.DataFrame
        A DataFrame containing analysis period of
        synthetic car loan binary classification dataset that contains missing values
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of
        synthetic car loan binary classification dataset that contains missing values

    Examples
    --------
    >>> from nannyml.datasets import load_synthetic_car_loan_w_missing_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_synthetic_car_loan_w_missing_dataset()

    """
    reference = load_csv_file_to_df('synthetic_car_loan_dq_reference.csv')
    analysis = load_csv_file_to_df('synthetic_car_loan_dq_analysis.csv')
    analysis_gt = load_csv_file_to_df('synthetic_car_loan_analysis_target.csv')

    return reference, analysis, analysis_gt


[docs]def load_synthetic_car_price_dataset():
    """Loads the synthetic car price dataset provided for testing the NannyML package on regression problems.

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of synthetic car price dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of synthetic car price dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of synthetic car price dataset

    Examples
    --------
    >>> from nannyml.datasets import load_synthetic_car_price_dataset
    >>> reference, analysis, analysis_tgt = load_synthetic_car_price_dataset()

    """

    reference = load_csv_file_to_df('regression_synthetic_reference.csv')
    analysis = load_csv_file_to_df('regression_synthetic_analysis.csv')
    analysis_tgt = load_csv_file_to_df('regression_synthetic_analysis_targets.csv')

    return reference, analysis, analysis_tgt


[docs]def load_us_census_ma_employment_data():
    """Loads the real-world binary classification dataset - predicting whether an individual is employed.


    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of synthetic car price dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of synthetic car price dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of synthetic car price dataset

    Examples
    --------
    >>> from nannyml.datasets import load_us_census_ma_employment_reference_and_analysis_data
    >>> reference, analysis, analysis_tgt = load_us_census_ma_employment_reference_and_analysis_data()

    """

    reference = load_pq_file_to_df('employment_MA_reference.pq')
    analysis = load_pq_file_to_df('employment_MA_analysis.pq')
    analysis_tgt = load_pq_file_to_df('employment_MA_analysis_target.pq')

    return reference, analysis, analysis_tgt