# Source code for bluemist.preprocessing.preprocessor

"""
Performs data pre-processing
"""

# Author: Shashank Agrawal
# License: MIT
# Version: 0.1.2
# Email: dew@bluemist-ai.one
# Created: Jun 22, 2022
# Last modified: June 11, 2023


import logging
import os
from logging import config
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer

from bluemist.pipeline.bluemist_pipeline import save_preprocessor
from bluemist.preprocessing import categorical_transformer, numeric_transformer

# Directory read from the BLUEMIST_PATH environment variable. The lookup is a
# plain subscript, so importing this module raises KeyError when it is unset.
BLUEMIST_PATH = os.environ["BLUEMIST_PATH"]

# Configure logging at import time from the 'logging.config' file found
# directly under BLUEMIST_PATH, then grab the shared "bluemist" logger.
config.fileConfig(BLUEMIST_PATH + '/' + 'logging.config')
logger = logging.getLogger("bluemist")

# Module-level state written by preprocess_data():
#   - initial_column_metadata_for_deployment: (column name, dtype) tuples of
#     the feature columns (target excluded)
#   - encoded_columns_for_deployment: feature names produced by the fitted
#     ColumnTransformer
#   - target_for_deployment: name of the target variable of the latest call
# Presumably read by the deployment/API generation code — verify against callers.
initial_column_metadata_for_deployment = []
encoded_columns_for_deployment = []
target_for_deployment = None


def preprocess_data(
        data,
        target_variable,
        test_size=0.25,
        data_randomizer=None,
        drop_features=None,
        numerical_features=None,
        force_numeric_conversion=True,
        categorical_features=None,
        convert_values_to_nan=None,
        data_scaling_strategy='StandardScaler',
        data_tranformation_strategy=None,
        missing_values=np.nan,
        numeric_imputer_strategy='mean',
        numeric_constant_value=None,
        categorical_imputer_strategy='most_frequent',
        categorical_constant_value=None,
        categorical_encoder='OneHotEncoder',
        drop_categories_one_hot_encoder=None,
        handle_unknown_one_hot_encoder=None):
    """
    Pre-process a dataset and split it into train and test sets.

    The function drops the requested features, decides which columns are
    numerical and which are categorical, builds a ``ColumnTransformer`` from
    the numeric and categorical transformer pipelines, fits it on the training
    split, applies it to both splits and hands the fitted preprocessor to
    ``save_preprocessor``.

    Side effects: ``data`` is modified in place (dropped columns, dtype
    conversions) and the module-level ``target_for_deployment``,
    ``initial_column_metadata_for_deployment`` and
    ``encoded_columns_for_deployment`` are updated on every call.

    Parameters
    ----------
    data : pandas dataframe
        Dataframe to be processed before passing to the ML estimator
    target_variable : str
        Target variable to be predicted
    test_size : float or int, default=0.25
        Percentage of the data to be used for testing model performance
    data_randomizer : int, default=None
        Controls the data split. Provide a value to reproduce the same split.
    drop_features : str or list, default=None
        Drops the features from the dataset
    numerical_features : list, default=None
        Bluemist AI will automatically identify numerical features from the dataset. Provide the list of
        features to override the type identified by Bluemist AI.
    force_numeric_conversion : bool, default=True
        Only used when ``numerical_features`` is provided. When True, values that cannot be converted to a
        number are coerced to NaN; when False, such values raise an error.
    categorical_features : list, default=None
        Bluemist AI will automatically identify categorical features from the dataset. Provide the list of
        features to override the type identified by Bluemist AI.
    convert_values_to_nan : str, list, default=None
        Dataset values to be converted to NumPy NaN
    data_scaling_strategy : {None, 'StandardScaler', 'MinMaxScaler', 'MaxAbsScaler', 'RobustScaler'}, default='StandardScaler'
        Scales dataset features, excluding target variable
    data_tranformation_strategy : {'box-cox', 'yeo-johnson' or None}, default=None
        Transforms the features, excluding target variable.
    missing_values : int, float, str, np.nan, None or pandas.NA, default=np.nan
        All instances of ``missing_values`` will be replaced with the user provided imputer strategy
    numeric_imputer_strategy : {'mean', 'median', 'most_frequent', 'constant'}, default='mean'
        Replaces ``missing_values`` with the strategy provided
    numeric_constant_value : str or number, default=None
        ``numeric_constant_value`` will replace the ``missing_values`` when ``numeric_imputer_strategy`` is
        passed as ``constant``
    categorical_imputer_strategy : {'most_frequent', 'constant'}, default='most_frequent'
        Replaces ``missing_values`` with the strategy provided
    categorical_constant_value : str or number, default=None
        ``categorical_constant_value`` will replace the ``missing_values`` when
        ``categorical_imputer_strategy`` is passed as ``constant``
    categorical_encoder : {'OneHotEncoder', 'OrdinalEncoder'}, default='OneHotEncoder'
        Encode categorical features
    drop_categories_one_hot_encoder : {'first', 'if_binary' or None}, default=None
        Determines strategy to drop one category per feature

        - 'first': drops the first category for each feature.
        - 'if_binary': drops the first category for features with two categories
        - None: Keeps all features and categories
    handle_unknown_one_hot_encoder : {'error', 'ignore', 'infrequent_if_exist'}, default=None
        Handles unknown category during transform. How ``None`` is interpreted is decided by the
        categorical transformer builder.

        - 'error': throws an error if category is unknown
        - 'ignore': ignores if category is unknown, output encoded column for this feature will be all zeroes
        - 'infrequent_if_exist': unknown category will be mapped to infrequent category if exists. If
          infrequent category does not exist, it will be treated as `ignore`

    Returns
    -------
    X_train : pandas dataframe
        Transformed training features, one column per encoded feature name
    X_test : pandas dataframe
        Transformed test features, same columns as ``X_train``
    y_train : numpy.ndarray
        Flattened (1-D) training target values
    y_test : numpy.ndarray
        Flattened (1-D) test target values

    Examples
    ---------
    *Data preprocessing :: Categorical Encoder*

    .. raw:: html
       :file: ../../code_samples/quickstarts/preprocessor/preprocessor_categorical.html
    """

    global target_for_deployment
    global initial_column_metadata_for_deployment
    global encoded_columns_for_deployment

    target_for_deployment = target_variable

    logger.info('Shape of the dataset :: {}'.format(data.shape))
    logger.info('Columns in the dataset :: \n{}'.format(data.columns))

    # drop features from the dataset (in place: the caller's dataframe changes)
    if drop_features is not None:
        if isinstance(drop_features, str):
            data.drop([drop_features], axis=1, inplace=True)
        elif isinstance(drop_features, list):
            data.drop(drop_features, axis=1, inplace=True)

    # auto compute numerical and categorical features
    auto_computed_numerical_features = data.select_dtypes(include='number').columns.tolist()
    auto_computed_categorical_features = data.select_dtypes(include='object').columns.tolist()

    # the target is never treated as a feature
    final_numerical_features = [
        column for column in auto_computed_numerical_features if column != target_variable
    ]
    final_categorical_features = [
        column for column in auto_computed_categorical_features if column != target_variable
    ]

    # finalize the list of numerical features: add user overrides that were not
    # auto-detected as numeric, skipping the target and names already present
    if numerical_features is not None:
        for numerical_feature in numerical_features:
            if (numerical_feature not in auto_computed_numerical_features
                    and numerical_feature not in final_numerical_features
                    and numerical_feature != target_variable):
                final_numerical_features.append(numerical_feature)

    # columns auto-detected as numeric but declared categorical by the user are
    # no longer numeric; the membership guard avoids ValueError from list.remove()
    # when the column is the target or is listed more than once
    if categorical_features is not None:
        for categorical_feature in categorical_features:
            if (categorical_feature in auto_computed_numerical_features
                    and categorical_feature in final_numerical_features):
                final_numerical_features.remove(categorical_feature)

    # finalize the list of categorical features (mirror of the logic above)
    if categorical_features is not None:
        for categorical_feature in categorical_features:
            if (categorical_feature not in auto_computed_categorical_features
                    and categorical_feature not in final_categorical_features
                    and categorical_feature != target_variable):
                final_categorical_features.append(categorical_feature)

    if numerical_features is not None:
        for numerical_feature in numerical_features:
            if (numerical_feature in auto_computed_categorical_features
                    and numerical_feature in final_categorical_features):
                final_categorical_features.remove(numerical_feature)

    # prepare final list of columns after preprocessing
    # (not read below; kept because it is forwarded to the builders via **locals())
    column_list = final_numerical_features + final_categorical_features

    logger.debug('data.dtypes before preprocessing :: \n{}'.format(data.dtypes))

    # handle non-numeric data in user provided numeric column
    if numerical_features is not None:
        numeric_conversion_strategy = 'coerce' if force_numeric_conversion else 'raise'

        # NOTE(review): pd.to_numeric is applied row by row (axis=1), which is slow
        # on large frames; a column-wise call may produce different column dtypes,
        # so confirm downstream expectations before changing it.
        data[final_numerical_features] = data[final_numerical_features].apply(
            pd.to_numeric, errors=numeric_conversion_strategy, axis=1)

    # NOTE(review): astype(str) may also turn missing values into the literal
    # string 'nan', hiding them from the categorical imputer — confirm.
    data[final_categorical_features] = data[final_categorical_features].astype(str)

    logger.debug('data.dtypes after dtype conversion :: \n{}'.format(data.dtypes))

    # Creating list of column name and datatype which will be used in generate_api.py.
    # The module-level list is reset in place first so that repeated calls do not
    # accumulate metadata from earlier datasets.
    initial_column_metadata_for_deployment.clear()
    initial_column_metadata_for_deployment.extend(
        data.drop(target_variable, axis=1).dtypes.items()
    )

    # create transformers for preprocessing pipeline
    # Every local name defined so far is forwarded as a keyword argument;
    # presumably the builders pick the options they need by name, so renaming
    # or removing locals above can break them — verify against the builders.
    num_transformer = numeric_transformer.build_numeric_transformer_pipeline(**locals())
    cat_transformer = categorical_transformer.build_categorical_transformer_pipeline(**locals())

    # create preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ("numeric_transformer", num_transformer, final_numerical_features),
            ("categorical_transformer", cat_transformer, final_categorical_features)
        ],
        verbose_feature_names_out=False
    )

    X = data.drop([target_variable], axis=1)
    y = data[[target_variable]]

    # Import the submodule explicitly: a bare ``import sklearn`` does not by
    # itself guarantee that ``sklearn.model_selection`` has been loaded.
    from sklearn.model_selection import train_test_split

    logger.debug('Splitting dataset into X_train, X_test, y_train, y_test...')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=data_randomizer)

    logger.debug('X_train.dtypes before ColumnTransformer :: \n{}'.format(X_train.dtypes))

    # fit on the training split only, then reuse the fitted transformer for the test split
    transformed_train = preprocessor.fit_transform(X_train)
    encoded_columns = preprocessor.get_feature_names_out()

    X_train = pd.DataFrame(transformed_train, columns=encoded_columns)
    X_test = pd.DataFrame(preprocessor.transform(X_test), columns=encoded_columns)

    encoded_columns_for_deployment = encoded_columns

    logger.debug('X_train Columns after ColumnTransformer :: {}'.format(encoded_columns))
    logger.debug('X_train.dtypes after ColumnTransformer :: \n{}'.format(X_train.dtypes))

    # flatten the single-column target frames into 1-D arrays
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    logger.debug('Saving preprocessor to disk...')
    save_preprocessor(preprocessor)

    return X_train, X_test, y_train, y_test