Source code for nannyml.datasets.datasets

#  Author:  Nikolaos Perrakis  <nikos@nannyml.com>
#
#  License: Apache Software License 2.0

"""Utility module offering curated datasets for quick experimentation."""

from importlib import resources

from pandas import DataFrame, read_csv, read_parquet

DATA_MODULE = "nannyml.datasets.data"


def load_csv_file_to_df(local_file: str) -> DataFrame:
    """Read a CSV file shipped inside the NannyML package into a DataFrame.

    Parameters
    ----------
    local_file : str, required
        Name of the data file to load. It is looked up inside the package
        named by ``DATA_MODULE``.

    Returns
    -------
    df: pd.DataFrame
        A DataFrame containing the requested data
    """
    packaged_resource = resources.path(DATA_MODULE, local_file)
    # Parse while the context is still open: the path is only guaranteed to
    # exist for the duration of the ``with`` block.
    with packaged_resource as csv_path:
        frame = read_csv(csv_path)
    return frame
def load_pq_file_to_df(local_file: str) -> DataFrame:
    """Read a parquet file shipped inside the NannyML package into a DataFrame.

    Parameters
    ----------
    local_file : str, required
        Name of the data file to load. It is looked up inside the package
        named by ``DATA_MODULE``.

    Returns
    -------
    df: pd.DataFrame
        A DataFrame containing the requested data
    """
    packaged_resource = resources.path(DATA_MODULE, local_file)
    # Parse while the context is still open: the path is only guaranteed to
    # exist for the duration of the ``with`` block.
    with packaged_resource as parquet_path:
        frame = read_parquet(parquet_path)
    return frame
def load_synthetic_binary_classification_dataset():
    """Load the synthetic binary classification dataset bundled with NannyML for testing.

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of synthetic binary classification dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of synthetic binary classification dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of synthetic binary
        classification dataset

    Examples
    --------
    >>> from nannyml.datasets import load_synthetic_binary_classification_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_synthetic_binary_classification_dataset()
    """
    # Order matters: reference, analysis, analysis targets.
    file_names = (
        'synthetic_sample_reference.csv',
        'synthetic_sample_analysis.csv',
        'synthetic_sample_analysis_gt.csv',
    )
    reference, analysis, analysis_gt = (load_csv_file_to_df(name) for name in file_names)
    return reference, analysis, analysis_gt
def load_titanic_dataset():
    """Load the titanic dataset bundled with the NannyML package.

    The dataset has been created by combining two sources, the kaggle dataset[1]
    and the data world dataset[2]. Note that the reference period is aligned with
    the kaggle train set and the analysis period with the kaggle test set.

    [1]: https://www.kaggle.com/competitions/titanic/data
    [2]: https://data.world/nrippner/titanic-disaster-dataset

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of the titanic dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of the titanic dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of the titanic dataset

    Examples
    --------
    >>> from nannyml.datasets import load_titanic_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_titanic_dataset()
    """
    # Tuple elements are evaluated left to right, so the files are read in the
    # order reference, analysis, targets.
    return (
        load_csv_file_to_df('titanic_reference.csv'),
        load_csv_file_to_df('titanic_analysis.csv'),
        load_csv_file_to_df('titanic_target.csv'),
    )
def load_synthetic_multiclass_classification_dataset():
    """Load the synthetic multiclass classification dataset bundled with NannyML for testing.

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of synthetic multiclass classification dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of synthetic multiclass classification dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of synthetic
        multiclass classification dataset

    Examples
    --------
    >>> from nannyml.datasets import load_synthetic_multiclass_classification_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_synthetic_multiclass_classification_dataset()
    """
    # Order matters: reference, analysis, analysis targets.
    file_names = ('mc_reference.csv', 'mc_analysis.csv', 'mc_analysis_gt.csv')
    return tuple(load_csv_file_to_df(name) for name in file_names)
def load_modified_california_housing_dataset():
    """Load the modified california housing dataset bundled with NannyML for testing.

    This dataset has been altered to represent a binary classification problem over time.
    More information about the dataset can be found at:
    :ref:`dataset-california`

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of modified california housing dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of modified california housing dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of modified
        california housing dataset

    Examples
    --------
    >>> from nannyml.datasets import load_modified_california_housing_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_modified_california_housing_dataset()
    """
    reference_df = load_csv_file_to_df('california_housing_reference.csv')
    analysis_df = load_csv_file_to_df('california_housing_analysis.csv')
    analysis_targets_df = load_csv_file_to_df('california_housing_analysis_gt.csv')

    return reference_df, analysis_df, analysis_targets_df
def load_synthetic_car_loan_dataset():
    """Load the synthetic car loan binary classification dataset bundled with NannyML for testing.

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of synthetic car loan binary classification dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of synthetic car loan binary classification dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of synthetic car loan
        binary classification dataset

    Examples
    --------
    >>> from nannyml.datasets import load_synthetic_car_loan_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_synthetic_car_loan_dataset()
    """
    # Order matters: reference, analysis, analysis targets.
    file_names = (
        'synthetic_car_loan_reference.csv',
        'synthetic_car_loan_analysis.csv',
        'synthetic_car_loan_analysis_target.csv',
    )
    reference, analysis, analysis_gt = (load_csv_file_to_df(name) for name in file_names)
    return reference, analysis, analysis_gt
def load_synthetic_car_loan_data_quality_dataset():
    """Load the synthetic car loan binary classification dataset that contains missing values.

    The dataset is provided for testing the NannyML package.

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of synthetic car loan binary classification
        dataset that contains missing values
    analysis : pd.DataFrame
        A DataFrame containing analysis period of synthetic car loan binary classification
        dataset that contains missing values
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of synthetic car loan
        binary classification dataset that contains missing values

    Examples
    --------
    >>> from nannyml.datasets import load_synthetic_car_loan_data_quality_dataset
    >>> reference_df, analysis_df, analysis_targets_df = load_synthetic_car_loan_data_quality_dataset()
    """
    reference = load_csv_file_to_df('synthetic_car_loan_dq_reference.csv')
    analysis = load_csv_file_to_df('synthetic_car_loan_dq_analysis.csv')
    # There is no "dq"-specific target file: this is the same target file that
    # load_synthetic_car_loan_dataset loads.
    analysis_gt = load_csv_file_to_df('synthetic_car_loan_analysis_target.csv')

    return reference, analysis, analysis_gt
def load_synthetic_car_price_dataset():
    """Load the synthetic car price dataset bundled with NannyML for testing regression problems.

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of synthetic car price dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of synthetic car price dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of synthetic car price dataset

    Examples
    --------
    >>> from nannyml.datasets import load_synthetic_car_price_dataset
    >>> reference, analysis, analysis_tgt = load_synthetic_car_price_dataset()
    """
    # Tuple elements are evaluated left to right, so the files are read in the
    # order reference, analysis, targets.
    return (
        load_csv_file_to_df('regression_synthetic_reference.csv'),
        load_csv_file_to_df('regression_synthetic_analysis.csv'),
        load_csv_file_to_df('regression_synthetic_analysis_targets.csv'),
    )
def load_us_census_ma_employment_data():
    """Load the real-world binary classification dataset - predicting whether an individual is employed.

    The data is read from the parquet files ``employment_MA_*.pq`` shipped with the package.

    Returns
    -------
    reference : pd.DataFrame
        A DataFrame containing reference period of the US census MA employment dataset
    analysis : pd.DataFrame
        A DataFrame containing analysis period of the US census MA employment dataset
    analysis_tgt : pd.DataFrame
        A DataFrame containing target values for the analysis period of the US census MA
        employment dataset

    Examples
    --------
    >>> from nannyml.datasets import load_us_census_ma_employment_data
    >>> reference, analysis, analysis_tgt = load_us_census_ma_employment_data()
    """
    reference = load_pq_file_to_df('employment_MA_reference.pq')
    analysis = load_pq_file_to_df('employment_MA_analysis.pq')
    analysis_tgt = load_pq_file_to_df('employment_MA_analysis_target.pq')

    return reference, analysis, analysis_tgt