# Source code for bluemist.llm.wrapper

# Author: Shashank Agrawal
# License: MIT
# Version: 0.1.3
# Email: dew@bluemist-ai.one
# Created: Jul 17, 2023
# Last modified: Oct 25, 2023

import logging
import os
import pandas as pd
from logging import config

from transformers import pipeline
from bluemist.llm.task_models import TaskModels

# Directory that contains 'logging.config'; read from the BLUEMIST_PATH
# environment variable (a KeyError is raised at import time if it is unset).
BLUEMIST_PATH = os.environ["BLUEMIST_PATH"]

# Configure logging from '<BLUEMIST_PATH>/logging.config' and grab the package logger.
config.fileConfig(f"{BLUEMIST_PATH}/logging.config")
logger = logging.getLogger("bluemist")

# Single TaskModels instance shared by every function in this module.
task_models = TaskModels()


def perform_task(task_name, input_data, question=None, min_length=30, max_length=130, do_sample=False,
                 override_models=None, limit=5, evaluate_models=True):
    """
        **Performs the task on the given dataset, evaluate the models and returns comparison metrics**

        task_name : str
            Supported tasks can be retrieved from the TaskModels class using the get_all_tasks method.
        input_data : str
            Text or information used by the model to perform specific NLP tasks.
        question : str, default=None
            Specific query or question provided as input to the model for question-answering tasks. The model uses this question to find the relevant answer within the provided context.
        min_length: number, default=30
            The minimum length of the generated summary. Defaults to 30. The summarization model ensures that the summary is at least this length.
        max_length : number, default=130
            The maximum length of the generated summary. Defaults to 130. The summarization model limits the summary to a maximum of this length.
        do_sample : boolean, default=False
            Whether to use sampling during summary generation. Defaults to False. When True, the model uses a sampling technique for token selection.
        override_models : str, list or tuple, default=None
            Provide additional models not part of the pre-configured list. They are placed ahead of the
            pre-configured models. Models rejected by ``TaskModels.is_model_supported_by_task`` are ignored
            (a warning is logged for each one).
        limit : int, default=5
            Limit the number of models to be compared. Default is 5.
        evaluate_models : boolean, default=True
            Determine if model comparison is requested. ``False`` will override `limit` as 1

        Returns
        -------
        pd.DataFrame
            One or more rows per processed model, as consolidated by ``process_models``.

        Raises
        ------
        ValueError
            If `task_name` is not among the tasks returned by ``TaskModels.get_all_tasks``.
    """

    # Check if the given task name is valid and supported by the available tasks.
    all_tasks = task_models.get_all_tasks()

    if task_name not in all_tasks:
        raise ValueError(f"Task '{task_name}' is not a valid task.")

    # Normalise override_models to a list of candidate model names.
    if isinstance(override_models, str):
        requested_models = [override_models]
    elif isinstance(override_models, (list, tuple)):
        requested_models = list(override_models)
    else:
        requested_models = []

    # Keep only the overrides accepted for this task; say so when one is dropped
    # instead of silently ignoring what the caller asked for.
    models = []
    for candidate in requested_models:
        if task_models.is_model_supported_by_task(candidate, task_name):
            models.append(candidate)
        else:
            logger.warning("Ignoring override model '%s': not supported for task '%s'", candidate, task_name)

    # Pre-configured models come after the user-supplied ones.
    models.extend(task_models.get_models_for_task(task_name, limit))

    # An override may also be part of the pre-configured list (or be repeated);
    # run each model only once, keeping the first occurrence. Membership test on
    # a list is used so no assumption is made about the entries being hashable.
    unique_models = []
    for candidate in models:
        if candidate not in unique_models:
            unique_models.append(candidate)
    models = unique_models

    if not evaluate_models:
        limit = 1

    # Truncate only for a positive limit that is within the number of available models.
    if limit is not None and 0 < limit <= len(models):
        models = models[:limit]

    # Start from an empty DataFrame; process_models appends the rows of every model to it.
    return process_models(task_name, models, pd.DataFrame(), input_data, question, min_length, max_length,
                          do_sample)


def process_models(task_name, models, results_df, input_data, question=None, min_length=30, max_length=130,
                   do_sample=False):
    """
    Process multiple models with given inputs and consolidate results.

    For every model a ``transformers`` pipeline is created for `task_name`, called with
    the arguments that task expects, and the output is appended to `results_df` through
    ``consolidate_results``. A model whose pipeline raises ``ValueError`` is skipped
    (the error is printed and logged) and processing continues with the next model.

    Args:
        task_name : str
            One of "question-answering", "document-question-answering", "summarization"
            or "sentiment-analysis".
        models : list
            A list of model names to be processed.
        results_df : pd.DataFrame
            The initial results DataFrame.
        input_data : str
            Text or information used by the model to perform specific NLP tasks. Passed as the
            ``context`` for "question-answering" and as the ``image`` for "document-question-answering".
        question : str, default=None
            Specific query or question provided as input to the model for question-answering tasks.
            Only forwarded when ``TaskModels.is_question_supported`` is true for the task.
        min_length: number, default=30
            The minimum length of the generated summary. Only used for "summarization".
        max_length : number, default=130
            The maximum length of the generated summary. Only used for "summarization".
        do_sample : boolean, default=False
            Whether to use sampling during summary generation. Only used for "summarization".

    Returns:
        pd.DataFrame
            The dataFrame containing consolidated results from all models.

    Raises:
        ValueError
            If `models` is not empty and `task_name` is not one of the tasks handled here.
    """

    handled_tasks = ("question-answering", "document-question-answering", "summarization", "sentiment-analysis")

    # Fail fast with a clear message, before any model is loaded. Previously an
    # unhandled task fell through every branch and crashed with UnboundLocalError
    # on `result`. With no models there is nothing to run, so the input is
    # returned untouched exactly as before.
    if models and task_name not in handled_tasks:
        raise ValueError(f"Task '{task_name}' is not supported by process_models.")

    for model in models:
        print('Model :: {}'.format(model))
        logger.info('Model :: %s', model)

        try:
            nlp = pipeline(task=task_name, model=model)
            input_args = {}

            if task_models.is_question_supported(task_name):
                input_args["question"] = question

            if task_name == "question-answering":
                # The pipeline receives a single dict holding question and context.
                input_args["context"] = input_data
                result = nlp(input_args)
            elif task_name == "document-question-answering":
                # The pipeline receives a single dict holding question and image.
                input_args["image"] = input_data
                result = nlp(input_args)
            elif task_name == "summarization":
                # Generation settings are passed as keyword arguments next to the text.
                input_args["min_length"] = min_length
                input_args["max_length"] = max_length
                input_args["do_sample"] = do_sample
                result = nlp(input_data, **input_args)
            else:
                # "sentiment-analysis": the text alone is the input.
                result = nlp(input_data)

            results_df = consolidate_results(result, results_df, model)
        except ValueError as e:
            # Best effort: one failing model must not abort the whole comparison.
            print('Skipping model due to the error.')
            logger.error('An error occurred: %s', e)

    return results_df


def consolidate_results(result, results_df, model):
    """
    Append one model's output to the running results DataFrame.

    The output is turned into rows (a dict becomes a single row, a list becomes one
    row per element), a leading 'model' column holding the model name is added, and
    the rows are concatenated below the existing results with a fresh 0..n-1 index.
    Any other type of `result` contributes no rows.

    Args:
        result : (dict or list):
            The result to be consolidated into the DataFrame.
        results_df : (pd.DataFrame)
            The DataFrame to which the result will be added. It is not modified in place.
        model : str
            The model name associated with the result.

    Returns:
        results_df : pd.DataFrame
            A new DataFrame made of the previous rows followed by the rows of `result`.
    """

    # Models differ in what they hand back: a bare dict or a list of records.
    if isinstance(result, dict):
        records = [result]
    elif isinstance(result, list):
        records = result
    else:
        records = None

    model_rows = pd.DataFrame() if records is None else pd.DataFrame(records)

    # Tag every row with the model that produced it, as the first column.
    model_rows.insert(0, 'model', model)

    return pd.concat([results_df, model_rows], ignore_index=True)