Source code for bluemist.eda.analyze_data


__author__ = "Shashank Agrawal"
__license__ = "MIT"
__version__ = "0.1.1"
__email__ = "dew@bluemist-ai.one"

import logging
import os
from logging import config

from pandas_profiling import ProfileReport

BLUEMIST_PATH = os.getenv("BLUEMIST_PATH")
EDA_ARTIFACTS_PATH = BLUEMIST_PATH + '/' + 'artifacts/eda'

config.fileConfig(BLUEMIST_PATH + '/' + 'logging.config')
logger = logging.getLogger("bluemist")


[docs]def perform_eda(data, provider='pandas-profiling', sample_size=10000, data_randomizer=2): """ Performs Exploratory Data Analysis (EDA) data: pandas dataframe Dataframe for exploratory data analysis provider : {'pandas-profiling', 'sweetviz', 'dtale'}, default='pandas-profiling' Library provider for exploratory data analysis sample_size: str, default=10000 Number of rows to return from dataframe. ``None`` to perform eda on the complete dataset which can be slower if dataset has large number of rows and columns data_randomizer: int, default=None Controls the data split. Provide a value to reproduce the same split. Examples --------- *EDA using Pandas Profiling* .. raw:: html :file: ../../code_samples/quickstarts/eda/eda_pandas-profiling.html *EDA using SweetVIZ* .. raw:: html :file: ../../code_samples/quickstarts/eda/eda_sweetviz.html *EDA using D-TALE* .. raw:: html :file: ../../code_samples/quickstarts/eda/eda_dtale.html """ if sample_size is not None and data.shape[0] >= sample_size: data = data.sample(n=sample_size, random_state=data_randomizer) output_provider = EDA_ARTIFACTS_PATH + '/' + provider output_file = output_provider + '.html' valid_providers = ['pandas-profiling', 'sweetviz', 'dtale', 'autoviz'] if provider in valid_providers: logger.info('Peforming EDA using :: {}'.format(provider)) if provider == 'pandas-profiling': logger.info('Output file :: {}'.format(output_file)) print('Output file :: {}'.format(output_file)) print('Output file will be opened in the browser after analysis is completed !!') profile = ProfileReport(data, explorative=True) profile.to_file(output_file=output_file, silent=False) elif provider == 'sweetviz': import sweetviz as sv logger.info('Output file :: {}'.format(output_file)) print('Output file :: {}'.format(output_file)) print('Output file will be opened in the browser after analysis is completed !!') sweetviz_report = sv.analyze(data) sweetviz_report.show_html(output_file) elif provider == 'dtale': import dtale print('Opening dtale UI on the browser...') d = dtale.show(data, subprocess=False, reaper_on=True) d.open_browser() else: print('Invalid provider, valid providers are :: {}'.format(valid_providers)) logger.info('Invalid provider, valid providers are :: {}'.format(valid_providers))