Add methods and constants for genai metrics (#2524)

* Added info about required packages

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* show example prediction

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* add methods and constants for genai task type

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* add missing files for genai metrics

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* update copyright information

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

---------

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>
Kartik Choudhary 2024-02-01 17:26:58 -05:00, committed by GitHub
Parent 503643512d
Commit c8ff9a265e
No key found matching this signature
GPG key ID: B5690EEEBB952194
13 changed files with 233 additions and 7 deletions

View file

@@ -122,6 +122,15 @@ class ResponsibleAIDashboard(Dashboard):
                methods=["POST"]
            )

        def get_generative_text_metrics():
            data = request.get_json(force=True)
            return jsonify(self.input.get_generative_text_metrics(data))
        self.add_url_rule(
            get_generative_text_metrics,
            '/get_generative_text_metrics',
            methods=["POST"]
        )

        if hasattr(self._service, 'socketio'):
            @self._service.socketio.on('handle_object_detection_json')
            def handle_object_detection_json(od_json):
@@ -132,3 +141,8 @@ class ResponsibleAIDashboard(Dashboard):
            def handle_question_answering_json(qa_json):
                qa_data = json.loads(qa_json['data'])
                return self.input.get_question_answering_metrics(qa_data)

            @self._service.socketio.on('handle_generative_text_json')
            def handle_generative_text_json(gt_json):
                gt_data = json.loads(gt_json['data'])
                return self.input.get_generative_text_metrics(gt_data)
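For reference, the REST route registered above takes the same two-element JSON body that get_generative_text_metrics later unpacks: a list of per-cohort row indexes followed by the cached metrics object (or None). A hypothetical manual call against a locally running dashboard might look like the sketch below; the address and the assumption that results come back under a "data" key are illustrative, not part of this change.

    import requests

    # Illustrative only: the dashboard UI normally issues this request itself.
    base_url = "http://localhost:5000"  # hypothetical local dashboard address

    payload = [
        [[0, 1, 2], [3, 4]],  # selection_indexes: row indexes for each cohort
        None,                 # generative_text_cache: previously computed metrics, if any
    ]
    resp = requests.post(f"{base_url}/get_generative_text_metrics", json=payload)
    print(resp.json()["data"])  # one metrics dictionary per cohort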

View file

@@ -171,7 +171,7 @@ class ResponsibleAIDashboardInput:
    def debug_ml(self, data):
        try:
            features = data[0]
            features = data[0]  # TODO: Remove prompt feature
            filters = data[1]
            composite_filters = data[2]
            max_depth = data[3]
@@ -484,3 +484,34 @@ class ResponsibleAIDashboardInput:
                "inner error: {}".format(e_str),
                WidgetRequestResponseConstants.data: []
            }

    def get_generative_text_metrics(self, post_data):
        """Flask endpoint function to get Model Overview metrics
        for the Generative Text scenario.

        :param post_data: List of inputs in the order
            [selection_indexes, generative_text_cache].
        :type post_data: List
        :return: JSON/dict data response
        :rtype: Dict[str, List]
        """
        try:
            selection_indexes = post_data[0]
            generative_text_cache = post_data[1]
            exp = self._analysis.compute_genai_metrics(
                selection_indexes,
                generative_text_cache
            )
            return {
                WidgetRequestResponseConstants.data: exp
            }
        except Exception as e:
            print(e)
            traceback.print_exc()
            e_str = _format_exception(e)
            return {
                WidgetRequestResponseConstants.error:
                    EXP_VIZ_ERR_MSG.format(e_str),
                WidgetRequestResponseConstants.data: []
            }

View file

@@ -18,6 +18,8 @@ class ModelTask(str, Enum):
    QUESTION_ANSWERING = 'question_answering'
    ENTAILMENT = 'entailment'
    SUMMARIZATIONS = 'summarizations'
    GENERATIVE_TEXT = 'generative_text'
    GENERATIVE_TEXT_CHAT = 'generative_text_chat'
    UNKNOWN = 'unknown'
@@ -34,3 +36,9 @@ class QuestionAnsweringFields(object):
    QUESTION = "question"
    CONTEXT = "context"
    ANSWERS = "answers"


class GenerativeTextFields(object):
    PROMPT = "prompt"
    SYS_PROMPT = "sys_prompt"
    RESPONSE = "response"
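To ground the new constants, here is a minimal sketch of a generative-text test frame keyed by GenerativeTextFields. The example rows are purely illustrative, and the exact schema expected elsewhere in the library may differ.

    import pandas as pd

    from responsibleai_text.common.constants import (GenerativeTextFields,
                                                     ModelTask)

    # Minimal illustrative test set; RESPONSE is only needed when references exist.
    test = pd.DataFrame({
        GenerativeTextFields.PROMPT: [
            "Summarize the plot of Hamlet in one sentence.",
            "Explain what a hash table is."
        ],
        GenerativeTextFields.RESPONSE: [
            "A Danish prince avenges his father's murder at great cost.",
            "A structure that maps keys to values using a hash function."
        ]
    })

    task_type = ModelTask.GENERATIVE_TEXT  # the new task type added above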

View file

@@ -12,6 +12,7 @@ import numpy as np
import pandas as pd
from ml_wrappers import wrap_model
from erroranalysis._internal.constants import ModelTask as ErrorAnalysisTask
from erroranalysis._internal.error_analyzer import ModelAnalyzer
from erroranalysis._internal.error_report import as_error_report
from responsibleai._tools.shared.state_directory_management import \
@@ -22,6 +23,7 @@ from responsibleai.managers.error_analysis_manager import \
from responsibleai.managers.error_analysis_manager import as_error_config
from responsibleai_text.common.constants import ModelTask
from responsibleai_text.utils.feature_extractors import get_text_columns
from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric
LABELS = 'labels'
@@ -83,6 +85,14 @@ class WrappedIndexPredictorModel:
            self.predictions = self.model.predict(
                self.dataset.loc[:, ['context', 'questions']])
            self.predictions = np.array(self.predictions)
        elif self.task_type == ModelTask.GENERATIVE_TEXT:
            # TODO: Decide the final metric for error analysis
            coherence = get_genai_metric(
                'coherence',
                predictions=self.model.predict(self.dataset),
                references=dataset['prompt'],
                wrapper_model=self.model)
            self.predictions = np.array(coherence['scores'])
        else:
            raise ValueError("Unknown task type: {}".format(self.task_type))
@@ -193,9 +203,17 @@ class ErrorAnalysisManager(BaseErrorAnalysisManager):
            task_type, index_classes)
        if categorical_features is None:
            categorical_features = []
        if task_type == ModelTask.GENERATIVE_TEXT:
            sup_task_type = ErrorAnalysisTask.REGRESSION
            ext_dataset = ext_dataset.copy()
            del ext_dataset['prompt']
            ext_dataset['target_score'] = 5
            target_column = 'target_score'
        else:
            sup_task_type = ErrorAnalysisTask.CLASSIFICATION
        super(ErrorAnalysisManager, self).__init__(
            index_predictor, ext_dataset, target_column,
            classes, categorical_features)
            classes, categorical_features, model_task=sup_task_type)

    @staticmethod
    def _create_index_predictor(model, dataset, target_column,

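Because coherence is rated on a 1-to-5 scale, the manager above frames generative-text error analysis as regression against a constant target of 5: a row's "error" is how far its rating falls short of perfect. The sketch below only illustrates that idea; ConstantModel is a hypothetical stand-in (not part of this change), and a real run would pass the wrapped generative model instead.

    import numpy as np
    import pandas as pd

    from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric


    class ConstantModel:
        """Hypothetical stand-in for a wrapped model; always returns "5"."""
        def predict(self, dataset):
            return ["5"] * len(dataset)


    dataset = pd.DataFrame({'prompt': ["Explain TCP slow start.",
                                       "What is gradient descent?"]})
    model = ConstantModel()

    # Per-row coherence ratings (1-5), used as the error-analysis "predictions".
    coherence = get_genai_metric(
        'coherence',
        predictions=model.predict(dataset),
        references=dataset['prompt'],
        wrapper_model=model)
    scores = np.array(coherence['scores'])

    # The regression target is a constant 5, so the residual flags weak rows.
    residual = np.full(len(dataset), 5) - scores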
View file

@@ -30,6 +30,8 @@ from responsibleai_text.managers.error_analysis_manager import \
from responsibleai_text.managers.explainer_manager import ExplainerManager
from responsibleai_text.utils.feature_extractors import (extract_features,
                                                         get_text_columns)
from responsibleai_text.utils.genai_metrics.metrics import \
    get_genai_metric_mean

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)
@@ -116,7 +118,8 @@ class RAITextInsights(RAIBaseInsights):
                 serializer: Optional[Any] = None,
                 maximum_rows_for_test: int = 5000,
                 feature_metadata: Optional[FeatureMetadata] = None,
                 text_column: Optional[Union[str, List]] = None):
                 text_column: Optional[Union[str, List]] = None,
                 eval_model: Any = None):
        """Creates an RAITextInsights object.

        :param model: The model to compute RAI insights for.
@@ -148,6 +151,10 @@ class RAITextInsights(RAIBaseInsights):
            If not provided, and there is additional feature metadata, then
            an exception will be raised.
        :type text_column: str or list[str]
        :param eval_model: The model to use for evaluation with AI-assisted
            metrics. If not provided, then the model passed in the model
            parameter will be used.
        :type eval_model: object
        """
        # drop index as this can cause issues later like when copying
        # target column below from test dataset to _ext_test_df
@@ -160,6 +167,10 @@ class RAITextInsights(RAIBaseInsights):
        self._text_column = text_column
        self._feature_metadata = feature_metadata
        self._wrapped_model = wrap_model(model, test, task_type)
        if eval_model is None:
            self._eval_model = self._wrapped_model
        else:
            self._eval_model = wrap_model(eval_model, test, task_type)
        self._validate_rai_insights_input_parameters(
            model=self._wrapped_model, test=test,
            target_column=target_column, task_type=task_type,
@@ -269,7 +280,9 @@ class RAITextInsights(RAIBaseInsights):
            target_column, axis=1)
        small_test_data = get_text_columns(small_test_data, text_column)
        small_test_data = small_test_data.iloc[0]
        if task_type != ModelTask.QUESTION_ANSWERING:
        if task_type not in [
                ModelTask.QUESTION_ANSWERING,
                ModelTask.GENERATIVE_TEXT]:
            small_test_data = small_test_data.tolist()
        # Call the model
        try:
@@ -319,7 +332,8 @@ class RAITextInsights(RAIBaseInsights):
            ModelTask.SENTIMENT_ANALYSIS.value,
            ModelTask.QUESTION_ANSWERING.value,
            ModelTask.ENTAILMENT.value,
            ModelTask.SUMMARIZATIONS.value
            ModelTask.SUMMARIZATIONS.value,
            ModelTask.GENERATIVE_TEXT.value,
        ]

        if task_type not in valid_tasks:
@@ -362,6 +376,10 @@ class RAITextInsights(RAIBaseInsights):
            if not target_columns_set.issubset(set(test.columns)):
                raise UserConfigValidationException(
                    'The list of target_column(s) should be in test data')
        elif (task_type == ModelTask.GENERATIVE_TEXT.value and
                target_column is None):
            # target column is optional for generative text
            pass
        else:
            if target_column not in list(test.columns):
                raise UserConfigValidationException(
@@ -514,6 +532,11 @@ class RAITextInsights(RAIBaseInsights):
            dataset = self.test.drop(target_column, axis=1)
        elif self.task_type == ModelTask.QUESTION_ANSWERING:
            dataset = self.test.drop([self.target_column], axis=1)
        elif self.task_type == ModelTask.GENERATIVE_TEXT:
            if self.target_column is None:
                dataset = self.test.copy()
            else:
                dataset = self.test.drop([self.target_column], axis=1)
        else:
            raise ValueError("Unknown task type: {}".format(self.task_type))
        dataset = get_text_columns(dataset, self._text_column)
@@ -853,3 +876,71 @@ class RAITextInsights(RAIBaseInsights):
            except ValueError:
                all_cohort_metrics.append([0, 0, 0, 0, 0, 0])
        return all_cohort_metrics

    def compute_genai_metrics(
            self,
            selection_indexes,
            genai_cache
    ):
        dashboard_dataset = self.get_data().dataset
        prompt_idx = dashboard_dataset.feature_names.index('prompt')
        prompts = [feat[prompt_idx] for feat in dashboard_dataset.features]
        true_y = dashboard_dataset.true_y
        predicted_y = dashboard_dataset.predicted_y

        all_cohort_metrics = []
        for cohort_indices in selection_indexes:
            cohort_metrics = dict()

            if true_y is None:
                true_y_cohort = None
            else:
                true_y_cohort = [true_y[cohort_index] for cohort_index
                                 in cohort_indices]
            predicted_y_cohort = [predicted_y[cohort_index] for cohort_index
                                  in cohort_indices]
            prompts_cohort = [prompts[cohort_index] for cohort_index
                              in cohort_indices]
            try:
                if true_y_cohort is not None:
                    exact_match = evaluate.load('exact_match')
                    cohort_metrics['exact_match'] = exact_match.compute(
                        predictions=predicted_y_cohort,
                        references=true_y_cohort)

                cohort_metrics['coherence'] = get_genai_metric_mean(
                    'coherence',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                if true_y_cohort is not None:
                    cohort_metrics['equivalence'] = get_genai_metric_mean(
                        'equivalence',
                        predictions=predicted_y_cohort,
                        references=prompts_cohort,
                        answers=true_y_cohort,
                        wrapper_model=self._eval_model)

                cohort_metrics['fluency'] = get_genai_metric_mean(
                    'fluency',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                cohort_metrics['groundedness'] = get_genai_metric_mean(
                    'groundedness',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                cohort_metrics['relevance'] = get_genai_metric_mean(
                    'relevance',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                all_cohort_metrics.append(cohort_metrics)
            except ValueError:
                all_cohort_metrics.append({})
        return all_cohort_metrics
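Putting the pieces together, construction for the new task could look roughly like the sketch below. GeneratorModel and JudgeModel are hypothetical placeholders (a real setup would wrap actual LLM endpoints), and the argument values are illustrative; only the parameter names come from the change above.

    import pandas as pd

    from responsibleai_text import RAITextInsights
    from responsibleai_text.common.constants import ModelTask


    class GeneratorModel:
        """Hypothetical wrapper around the model being analyzed."""
        def predict(self, dataset):
            return ["generated answer"] * len(dataset)


    class JudgeModel:
        """Hypothetical evaluator used for the AI-assisted metrics."""
        def predict(self, dataset):
            # Receives prompts built from the metric templates; returns 1-5 ratings.
            return ["5"] * len(dataset)


    test = pd.DataFrame({"prompt": ["Explain what a mutex is.",
                                    "Summarize the French Revolution."]})

    rai_insights = RAITextInsights(
        model=GeneratorModel(),
        test=test,
        target_column=None,                   # optional for generative text
        task_type=ModelTask.GENERATIVE_TEXT,
        eval_model=JudgeModel())              # falls back to `model` when omitted

    # One metrics dictionary per cohort; here a single cohort with both rows.
    metrics = rai_insights.compute_genai_metrics(
        selection_indexes=[[0, 1]],
        genai_cache=None)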

View file

@@ -12,7 +12,8 @@ from negspacy.termsets import termset
from tqdm import tqdm

from nlp_feature_extractors import attribute_extractors as exts
from responsibleai_text.common.constants import (ModelTask,
from responsibleai_text.common.constants import (GenerativeTextFields,
                                                 ModelTask,
                                                 QuestionAnsweringFields)

nlp = None
@@ -60,6 +61,9 @@ def extract_features(text_dataset: pd.DataFrame,
        feature_names.append(prefix + "maximum_parse_tree_depth")
        feature_names.append("question_type")
        feature_names.append("context_overlap")
    elif task_type == ModelTask.GENERATIVE_TEXT:
        start_meta_index = 0
        feature_names = base_feature_names
    else:
        raise ValueError("Unknown task type: {}".format(task_type))
    # copy over the metadata column names
@@ -96,6 +100,19 @@ def extract_features(text_dataset: pd.DataFrame,
            context_overlap = get_context_overlap(context=context,
                                                  question=question)
            extracted_features.append(context_overlap)
            # append all other metadata features
            append_metadata_values(start_meta_index, text_dataset, i,
                                   extracted_features, has_dropped_features,
                                   dropped_features, column_names)
            results.append(extracted_features)
    elif task_type == ModelTask.GENERATIVE_TEXT:
        for i, row in tqdm(text_features.iterrows(),
                           desc='feature extraction'):
            extracted_features = []
            add_extracted_features_for_sentence(
                row[GenerativeTextFields.PROMPT], extracted_features,
                task_type)

            # append all other metadata features
            append_metadata_values(start_meta_index, text_dataset, i,
                                   extracted_features, has_dropped_features,

View file

@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.
"""Contains the GenAI metrics."""

View file

@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.
"""Contains the implementation of various metrics for GenAI."""

View file

@@ -24,7 +24,7 @@ def _compute_metric(template, logger, wrapper_model, **kwargs):
    templated_ques = format_str(template, **kwargs)

    inp = pd.DataFrame({
        'questions': templated_ques,
        'prompt': templated_ques,
        'sys_prompt': _SYS_PROMPT})

    responses = wrapper_model.predict(inp)
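The rename above pins down the contract for any eval_model wrapper used by the AI-assisted metrics: predict receives a DataFrame with 'prompt' and 'sys_prompt' columns and should return one rating string per row. A minimal compatible wrapper might look like the following sketch, where call_my_llm is a hypothetical placeholder for a real LLM call, not part of the library.

    import pandas as pd


    def call_my_llm(system_prompt: str, user_prompt: str) -> str:
        """Hypothetical helper; a real implementation would call an LLM endpoint."""
        return "5"


    class EvalModelWrapper:
        """Judge wrapper matching the DataFrame contract built in _compute_metric."""
        def predict(self, inp: pd.DataFrame):
            return [call_my_llm(row['sys_prompt'], row['prompt'])
                    for _, row in inp.iterrows()]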

View file

@@ -48,6 +48,15 @@ Five stars: the predicted answer is completely similar to the correct answer
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION:
{question}

View file

@@ -47,6 +47,15 @@ Five stars: the answer has perfect fluency
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION:
{question}

View file

@@ -47,6 +47,18 @@ the CONTEXT entails.
Note the ANSWER is generated by a computer system, it can contain certain \
symbols, which should not be a negative factor in the evaluation.
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
CONTEXT:
{context}

View file

@@ -47,6 +47,15 @@ Five stars: the answer has perfect relevance
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION AND CONTEXT:
{question}