# Source code for bluemist.preprocessing.preprocessor

"""
Performs data pre-processing
"""

# Author: Shashank Agrawal
# License: MIT
# Version: 0.1.2
# Email: dew@bluemist-ai.one
# Created: Jun 22, 2022
# Last modified: June 11, 2023


import logging
import os
from logging import config

import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from bluemist.pipeline.bluemist_pipeline import save_preprocessor
from bluemist.preprocessing import categorical_transformer, numeric_transformer

# Module-level state that preprocess_data() fills in on every run so that
# later steps (see the generate_api.py note inside preprocess_data) can read it.
initial_column_metadata_for_deployment = list()
encoded_columns_for_deployment = list()
target_for_deployment = None

# Directory that holds ``logging.config``; importing this module raises
# KeyError when the BLUEMIST_PATH environment variable is not set.
BLUEMIST_PATH = os.environ["BLUEMIST_PATH"]

# Configure logging from the file shipped under BLUEMIST_PATH, then grab the
# package-wide "bluemist" logger.
config.fileConfig("/".join((BLUEMIST_PATH, "logging.config")))
logger = logging.getLogger("bluemist")


def _finalize_feature_list(auto_detected, forced_in, forced_out, target_variable):
    """Merge the auto-detected features of one kind with the user's overrides.

    ``auto_detected`` is the list of columns pandas classified as this kind,
    ``forced_in`` the user override list for this kind and ``forced_out`` the
    user override list for the *other* kind (either may be ``None``).
    The target variable is never returned as a feature, duplicates in the
    override lists are ignored, and a column is only removed when it is
    actually present, so ``list.remove`` can no longer raise ``ValueError``.
    """
    features = [feature for feature in auto_detected if feature != target_variable]

    for feature in (forced_in if forced_in is not None else ()):
        if feature != target_variable and feature not in features:
            features.append(feature)

    for feature in (forced_out if forced_out is not None else ()):
        # Only auto-detected columns are taken away; a column the user forced
        # into this kind stays, exactly as before.
        if feature in auto_detected and feature in features:
            features.remove(feature)

    return features


def preprocess_data(
        data,
        target_variable,
        test_size=0.25,
        data_randomizer=None,
        drop_features=None,
        numerical_features=None,
        force_numeric_conversion=True,
        categorical_features=None,
        convert_values_to_nan=None,
        data_scaling_strategy='StandardScaler',
        data_tranformation_strategy=None,
        missing_values=np.nan,
        numeric_imputer_strategy='mean',
        numeric_constant_value=None,
        categorical_imputer_strategy='most_frequent',
        categorical_constant_value=None,
        categorical_encoder='OneHotEncoder',
        drop_categories_one_hot_encoder=None,
        handle_unknown_one_hot_encoder=None):
    """
        Splits the dataset into train/test sets and runs the preprocessing pipeline on the features.

        The input dataframe is not modified; all work happens on a copy. Parameters that are not used
        directly by this function are forwarded as keyword arguments to the numeric and categorical
        transformer builders.

        data: pandas dataframe
            Dataframe to be processed before passing to the ML estimator
        target_variable: str
            Target variable to be predicted
        test_size: float or int, default=0.25
            Percentage of the data to be used for testing model performance
        data_randomizer: int, default=None
            Controls the data split. Provide a value to reproduce the same split.
        drop_features: str or list, default=None
            Drops the features from the dataset
        numerical_features: list, default=None
            Bluemist AI will automatically identify numerical features from the dataset. Provide the list of features to override the type identified by Bluemist AI.
        force_numeric_conversion: bool, default=True
            Gracefully converts features to numeric datatype which are provided under ``numerical_features``.
            When ``False``, a value that cannot be converted raises an error instead.
        categorical_features: list, default=None
            Bluemist AI will automatically identify categorical features from the dataset. Provide the list of features to override the type identified by Bluemist AI.
        convert_values_to_nan: str, list, default=None
            Dataset values to be converted to NumPy NaN
        data_scaling_strategy: {None, 'StandardScaler', 'MinMaxScaler', 'MaxAbsScaler', 'RobustScaler'}, default='StandardScaler'
            Scales dataset features, excluding target variable
                - 'StandardScaler'
                - 'MinMaxScaler'
                - 'MaxAbsScaler'
                - 'RobustScaler'
        data_tranformation_strategy: {'box-cox', 'yeo-johnson' or None}, default=None
            Transforms the features, excluding target variable.
                - 'box-cox'
                - 'yeo-johnson'
        missing_values: int, float, str, np.nan, None or pandas.NA, default=np.nan
            All instances of ``missing_values`` will be replaced with the user provided imputer strategy
        numeric_imputer_strategy: {'mean', 'median', 'most_frequent', 'constant'}, default='mean'
            Replaces ``missing_values`` with the strategy provided
        numeric_constant_value: str or number, default=None
            ``numeric_constant_value`` will replace the ``missing_values`` when ``numeric_imputer_strategy`` is passed as ``constant``
        categorical_imputer_strategy: {'most_frequent', 'constant'}, default='most_frequent'
            Replaces ``missing_values`` with the strategy provided
        categorical_constant_value: str or number, default=None
            ``categorical_constant_value`` will replace the ``missing_values`` when ``categorical_imputer_strategy`` is passed as ``constant``
        categorical_encoder: {'OneHotEncoder', 'OrdinalEncoder'}, default='OneHotEncoder'
            Encode categorical features
        drop_categories_one_hot_encoder: {'first', 'if_binary' or None}, default=None
            Determines strategy to drop one category per feature
                - 'first':
                    drops the first category for each feature.
                - 'if_binary':
                    drops the first category for features with two categories
                - None:
                    Keeps all features and categories
        handle_unknown_one_hot_encoder: {'error', 'ignore', 'infrequent_if_exist'}, default=None
            Handles unknown category during transform
                - 'error':
                    throws an error if category is unknown
                - 'ignore':
                    ignores if category is unknown, output encoded column for this feature will be all zeroes
                - 'infrequent_if_exist':
                    unknown category will be mapped to infrequent category if exists. If infrequent category does not exist, it
                    will be treated as `ignore`

        Returns
        -------
        X_train, X_test: pandas dataframe
            Preprocessed train and test features, with the column names produced by the preprocessing pipeline
        y_train, y_test: numpy.ndarray
            Flattened (1-D) train and test target values

        Examples
        ---------
        *Data preprocessing :: Categorical Encoder*

        .. raw:: html
           :file: ../../code_samples/quickstarts/preprocessor/preprocessor_categorical.html

    """

    global target_for_deployment
    global initial_column_metadata_for_deployment
    global encoded_columns_for_deployment

    target_for_deployment = target_variable

    logger.info('Shape of the dataset :: {}'.format(data.shape))
    logger.info('Columns in the dataset :: \n{}'.format(data.columns))

    # Work on a private frame so the caller's dataframe is left untouched and
    # the function can be re-run on the same input (an in-place drop would
    # make the second run fail because the columns are already gone).
    if drop_features is not None:
        if isinstance(drop_features, str):
            data = data.drop(columns=[drop_features])
        else:
            data = data.drop(columns=list(drop_features))
    else:
        data = data.copy()

    # auto compute numerical and categorical features
    auto_computed_numerical_features = data.select_dtypes(include='number').columns.tolist()
    auto_computed_categorical_features = data.select_dtypes(include='object').columns.tolist()

    # apply the user overrides; the target variable is never treated as a feature
    final_numerical_features = _finalize_feature_list(
        auto_computed_numerical_features, numerical_features, categorical_features, target_variable)
    final_categorical_features = _finalize_feature_list(
        auto_computed_categorical_features, categorical_features, numerical_features, target_variable)

    # Final list of feature columns. Not read again in this function, but it is
    # part of the locals forwarded to the transformer builders below.
    column_list = final_numerical_features + final_categorical_features

    logger.debug('data.dtypes before preprocessing  :: \n{}'.format(data.dtypes))

    # handle non-numeric data in user provided numeric column
    if numerical_features is not None:
        numeric_conversion_strategy = 'coerce' if force_numeric_conversion else 'raise'
        if final_numerical_features:
            # Convert column by column: one vectorised call per feature instead
            # of one Python call per row, and each column keeps its own dtype.
            data[final_numerical_features] = data[final_numerical_features].apply(
                pd.to_numeric, errors=numeric_conversion_strategy)

    if final_categorical_features:
        # NOTE(review): astype(str) also turns missing values into the literal
        # string 'nan'; confirm the categorical transformer accounts for that.
        data[final_categorical_features] = data[final_categorical_features].astype(str)
    logger.debug('data.dtypes after dtype conversion  :: \n{}'.format(data.dtypes))

    # Creating list of column name and datatype which will be used in generate_api.py.
    # The shared list is emptied first so repeated runs do not pile up stale
    # entries; clearing in place keeps existing references to the list valid.
    initial_column_metadata_for_deployment.clear()
    initial_column_metadata_for_deployment.extend(
        data.drop(target_variable, axis=1).dtypes.items())

    # Create transformers for preprocessing pipeline. Every local variable
    # defined up to this point is forwarded as a keyword argument, so the local
    # names above are part of the contract with the builders - do not rename them.
    num_transformer = numeric_transformer.build_numeric_transformer_pipeline(**locals())
    cat_transformer = categorical_transformer.build_categorical_transformer_pipeline(**locals())

    # create preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ("numeric_transformer", num_transformer, final_numerical_features),
            ("categorical_transformer", cat_transformer, final_categorical_features)
        ],
        # always produce a dense array so it can be wrapped in a DataFrame below
        sparse_threshold=0,
        verbose_feature_names_out=False
    )

    X = data.drop([target_variable], axis=1)
    y = data[[target_variable]]
    logger.debug('Splitting dataset into X_train, X_test, y_train, y_test...')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        random_state=data_randomizer)

    logger.debug('X_train.dtypes before ColumnTransformer :: \n{}'.format(X_train.dtypes))

    # Fit on the training split only, then reuse the fitted pipeline (and its
    # output column names) for the test split.
    transformed_train = preprocessor.fit_transform(X_train)
    encoded_columns_for_deployment = preprocessor.get_feature_names_out()
    X_train = pd.DataFrame(transformed_train, columns=encoded_columns_for_deployment)
    X_test = pd.DataFrame(preprocessor.transform(X_test), columns=encoded_columns_for_deployment)

    logger.debug('X_train Columns after ColumnTransformer :: {}'.format(encoded_columns_for_deployment))
    logger.debug('X_train.dtypes after ColumnTransformer :: \n{}'.format(X_train.dtypes))

    # estimators expect a 1-D target
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    logger.debug('Saving preprocessor to disk...')
    save_preprocessor(preprocessor)
    return X_train, X_test, y_train, y_test