Add methods and constants for genai metrics (#2524)
* Added info about required packages
* Update responsibleaidashboard-question-answering-model-debugging.ipynb
* Show example prediction
* Update responsibleaidashboard-question-answering-model-debugging.ipynb
* Add methods and constants for genai task type
* Add missing files for genai metrics
* Update copyright information

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>
Parent: 503643512d
Commit: c8ff9a265e
@@ -122,6 +122,15 @@ class ResponsibleAIDashboard(Dashboard):
            methods=["POST"]
        )

        def get_generative_text_metrics():
            data = request.get_json(force=True)
            return jsonify(self.input.get_generative_text_metrics(data))
        self.add_url_rule(
            get_generative_text_metrics,
            '/get_generative_text_metrics',
            methods=["POST"]
        )

        if hasattr(self._service, 'socketio'):
            @self._service.socketio.on('handle_object_detection_json')
            def handle_object_detection_json(od_json):
@@ -132,3 +141,8 @@ class ResponsibleAIDashboard(Dashboard):
            def handle_question_answering_json(qa_json):
                qa_data = json.loads(qa_json['data'])
                return self.input.get_question_answering_metrics(qa_data)

            @self._service.socketio.on('handle_generative_text_json')
            def handle_generative_text_json(gt_json):
                gt_data = json.loads(gt_json['data'])
                return self.input.get_generative_text_metrics(gt_data)
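For reference, the new route takes a JSON body and returns the computed metrics. A minimal sketch of exercising it by hand follows; the host, port and payload values are illustrative and assume a dashboard is already running locally.

import requests

# Hypothetical request against the new endpoint: the first element is the
# per-cohort selection indexes, the second is the generative text cache.
payload = [
    [[0, 1, 2]],  # selection_indexes: one cohort of dataset row indices
    [],           # generative text cache placeholder
]
resp = requests.post(
    'http://localhost:5000/get_generative_text_metrics',
    json=payload)
print(resp.json())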
@@ -171,7 +171,7 @@ class ResponsibleAIDashboardInput:

    def debug_ml(self, data):
        try:
-            features = data[0]
+            features = data[0]  # TODO: Remove prompt feature
            filters = data[1]
            composite_filters = data[2]
            max_depth = data[3]
@@ -484,3 +484,34 @@ class ResponsibleAIDashboardInput:
                    "inner error: {}".format(e_str),
                WidgetRequestResponseConstants.data: []
            }

    def get_generative_text_metrics(self, post_data):
        """Flask endpoint function to get Model Overview metrics
        for the Generative Text scenario.

        :param post_data: List of inputs in the order
            [selection_indexes, generative_text_cache].
        :type post_data: List

        :return: JSON/dict data response
        :rtype: Dict[str, List]
        """
        try:
            selection_indexes = post_data[0]
            generative_text_cache = post_data[1]
            exp = self._analysis.compute_genai_metrics(
                selection_indexes,
                generative_text_cache
            )
            return {
                WidgetRequestResponseConstants.data: exp
            }
        except Exception as e:
            print(e)
            traceback.print_exc()
            e_str = _format_exception(e)
            return {
                WidgetRequestResponseConstants.error:
                    EXP_VIZ_ERR_MSG.format(e_str),
                WidgetRequestResponseConstants.data: []
            }
@@ -18,6 +18,8 @@ class ModelTask(str, Enum):
    QUESTION_ANSWERING = 'question_answering'
    ENTAILMENT = 'entailment'
    SUMMARIZATIONS = 'summarizations'
    GENERATIVE_TEXT = 'generative_text'
    GENERATIVE_TEXT_CHAT = 'generative_text_chat'
    UNKNOWN = 'unknown'

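Because ModelTask subclasses str, the new members compare equal to their string values, which is what the task-type validation later in this commit relies on. A self-contained illustration that mirrors the constants rather than importing the package:

from enum import Enum

class ModelTask(str, Enum):
    # mirror of the two new members added above
    GENERATIVE_TEXT = 'generative_text'
    GENERATIVE_TEXT_CHAT = 'generative_text_chat'

# str-valued enum members compare equal to plain strings
assert ModelTask.GENERATIVE_TEXT == 'generative_text'
assert ModelTask.GENERATIVE_TEXT.value in ['generative_text', 'generative_text_chat']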
@@ -34,3 +36,9 @@ class QuestionAnsweringFields(object):
    QUESTION = "question"
    CONTEXT = "context"
    ANSWERS = "answers"


class GenerativeTextFields(object):
    PROMPT = "prompt"
    SYS_PROMPT = "sys_prompt"
    RESPONSE = "response"
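The new field names describe the columns a generative-text dataset is expected to carry. A toy sketch of a test frame using them; the rows and the choice of a 'response' reference column are made up for illustration:

import pandas as pd

test = pd.DataFrame({
    'prompt': ['What is the capital of France?',
               'Summarize the following paragraph: ...'],
    'response': ['The capital of France is Paris.',
                 'The paragraph argues that ...'],
})
print(test[['prompt', 'response']].head())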
@@ -12,6 +12,7 @@ import numpy as np
import pandas as pd
from ml_wrappers import wrap_model

from erroranalysis._internal.constants import ModelTask as ErrorAnalysisTask
from erroranalysis._internal.error_analyzer import ModelAnalyzer
from erroranalysis._internal.error_report import as_error_report
from responsibleai._tools.shared.state_directory_management import \
@@ -22,6 +23,7 @@ from responsibleai.managers.error_analysis_manager import \
from responsibleai.managers.error_analysis_manager import as_error_config
from responsibleai_text.common.constants import ModelTask
from responsibleai_text.utils.feature_extractors import get_text_columns
from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric

LABELS = 'labels'

@@ -83,6 +85,14 @@ class WrappedIndexPredictorModel:
            self.predictions = self.model.predict(
                self.dataset.loc[:, ['context', 'questions']])
            self.predictions = np.array(self.predictions)
        elif self.task_type == ModelTask.GENERATIVE_TEXT:
            # TODO: Decide the final metric for error analysis
            coherence = get_genai_metric(
                'coherence',
                predictions=self.model.predict(self.dataset),
                references=dataset['prompt'],
                wrapper_model=self.model)
            self.predictions = np.array(coherence['scores'])
        else:
            raise ValueError("Unknown task type: {}".format(self.task_type))

@@ -193,9 +203,17 @@ class ErrorAnalysisManager(BaseErrorAnalysisManager):
            task_type, index_classes)
        if categorical_features is None:
            categorical_features = []
        if task_type == ModelTask.GENERATIVE_TEXT:
            sup_task_type = ErrorAnalysisTask.REGRESSION
            ext_dataset = ext_dataset.copy()
            del ext_dataset['prompt']
            ext_dataset['target_score'] = 5
            target_column = 'target_score'
        else:
            sup_task_type = ErrorAnalysisTask.CLASSIFICATION
        super(ErrorAnalysisManager, self).__init__(
            index_predictor, ext_dataset, target_column,
-            classes, categorical_features)
+            classes, categorical_features, model_task=sup_task_type)

    @staticmethod
    def _create_index_predictor(model, dataset, target_column,
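The generative-text branch reframes error analysis as regression: per-row coherence scores (on a 1 to 5 scale) become the predictions, and every row gets a constant target of 5, so low-coherence rows surface as high-error cohorts. A small numeric illustration with assumed scores:

import numpy as np

coherence_scores = np.array([5, 4, 2, 5])         # hypothetical per-row scores
target_score = np.full_like(coherence_scores, 5)  # mirrors ext_dataset['target_score'] = 5
pseudo_error = target_score - coherence_scores    # larger values -> worse generations
print(pseudo_error)  # [0 1 3 0]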
@@ -30,6 +30,8 @@ from responsibleai_text.managers.error_analysis_manager import \
from responsibleai_text.managers.explainer_manager import ExplainerManager
from responsibleai_text.utils.feature_extractors import (extract_features,
                                                          get_text_columns)
from responsibleai_text.utils.genai_metrics.metrics import \
    get_genai_metric_mean

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)
@@ -116,7 +118,8 @@ class RAITextInsights(RAIBaseInsights):
                 serializer: Optional[Any] = None,
                 maximum_rows_for_test: int = 5000,
                 feature_metadata: Optional[FeatureMetadata] = None,
-                 text_column: Optional[Union[str, List]] = None):
+                 text_column: Optional[Union[str, List]] = None,
+                 eval_model: Any = None):
        """Creates an RAITextInsights object.

        :param model: The model to compute RAI insights for.
|
|||
If not provided, and there is additional feature metadata, then
|
||||
an exception will be raised.
|
||||
:type text_column: str or list[str]
|
||||
:param eval_model: The model to use for evaluation with AI-assisted
|
||||
metrics. If not provided, then the model passed in the model
|
||||
parameter will be used.
|
||||
:type eval_model: object
|
||||
"""
|
||||
# drop index as this can cause issues later like when copying
|
||||
# target column below from test dataset to _ext_test_df
|
||||
|
@@ -160,6 +167,10 @@ class RAITextInsights(RAIBaseInsights):
        self._text_column = text_column
        self._feature_metadata = feature_metadata
        self._wrapped_model = wrap_model(model, test, task_type)
        if eval_model is None:
            self._eval_model = self._wrapped_model
        else:
            self._eval_model = wrap_model(eval_model, test, task_type)
        self._validate_rai_insights_input_parameters(
            model=self._wrapped_model, test=test,
            target_column=target_column, task_type=task_type,
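A construction sketch for the new eval_model argument: the task model generates the responses, while a (typically stronger) evaluator scores the AI-assisted metrics; if eval_model is omitted, the wrapped task model is reused. generation_model, evaluation_model and test_df are placeholders for real objects, not names defined in this commit, so the snippet is not runnable as written.

from responsibleai_text import RAITextInsights
from responsibleai_text.common.constants import ModelTask

rai_insights = RAITextInsights(
    model=generation_model,           # placeholder: model being assessed
    test=test_df,                     # placeholder: frame with a 'prompt' column
    target_column=None,               # optional for the generative text task
    task_type=ModelTask.GENERATIVE_TEXT,
    eval_model=evaluation_model)      # placeholder: judge model for genai metrics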
@@ -269,7 +280,9 @@ class RAITextInsights(RAIBaseInsights):
            target_column, axis=1)
        small_test_data = get_text_columns(small_test_data, text_column)
        small_test_data = small_test_data.iloc[0]
-        if task_type != ModelTask.QUESTION_ANSWERING:
+        if task_type not in [
+                ModelTask.QUESTION_ANSWERING,
+                ModelTask.GENERATIVE_TEXT]:
            small_test_data = small_test_data.tolist()
        # Call the model
        try:
@@ -319,7 +332,8 @@ class RAITextInsights(RAIBaseInsights):
            ModelTask.SENTIMENT_ANALYSIS.value,
            ModelTask.QUESTION_ANSWERING.value,
            ModelTask.ENTAILMENT.value,
-            ModelTask.SUMMARIZATIONS.value
+            ModelTask.SUMMARIZATIONS.value,
+            ModelTask.GENERATIVE_TEXT.value,
        ]

        if task_type not in valid_tasks:
@@ -362,6 +376,10 @@ class RAITextInsights(RAIBaseInsights):
            if not target_columns_set.issubset(set(test.columns)):
                raise UserConfigValidationException(
                    'The list of target_column(s) should be in test data')
        elif (task_type == ModelTask.GENERATIVE_TEXT.value and
                target_column is None):
            # target column is optional for generative text
            pass
        else:
            if target_column not in list(test.columns):
                raise UserConfigValidationException(
@@ -514,6 +532,11 @@ class RAITextInsights(RAIBaseInsights):
            dataset = self.test.drop(target_column, axis=1)
        elif self.task_type == ModelTask.QUESTION_ANSWERING:
            dataset = self.test.drop([self.target_column], axis=1)
        elif self.task_type == ModelTask.GENERATIVE_TEXT:
            if self.target_column is None:
                dataset = self.test.copy()
            else:
                dataset = self.test.drop([self.target_column], axis=1)
        else:
            raise ValueError("Unknown task type: {}".format(self.task_type))
        dataset = get_text_columns(dataset, self._text_column)
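A runnable mirror of the new generative-text branch on a toy frame: when no target column is configured the whole test set is used, otherwise the target is dropped before prediction. The column names are made up for the example.

import pandas as pd

test = pd.DataFrame({'prompt': ['Hi there', 'Explain HTTP'],
                     'reference': ['Hello!', 'HTTP is ...']})
target_column = 'reference'   # set to None to keep every column
if target_column is None:
    dataset = test.copy()
else:
    dataset = test.drop([target_column], axis=1)
print(dataset.columns.tolist())  # ['prompt']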
@@ -853,3 +876,71 @@ class RAITextInsights(RAIBaseInsights):
        except ValueError:
            all_cohort_metrics.append([0, 0, 0, 0, 0, 0])
        return all_cohort_metrics

    def compute_genai_metrics(
            self,
            selection_indexes,
            genai_cache
    ):
        dashboard_dataset = self.get_data().dataset
        prompt_idx = dashboard_dataset.feature_names.index('prompt')
        prompts = [feat[prompt_idx] for feat in dashboard_dataset.features]
        true_y = dashboard_dataset.true_y
        predicted_y = dashboard_dataset.predicted_y

        all_cohort_metrics = []
        for cohort_indices in selection_indexes:
            cohort_metrics = dict()

            if true_y is None:
                true_y_cohort = None
            else:
                true_y_cohort = [true_y[cohort_index] for cohort_index
                                 in cohort_indices]
            predicted_y_cohort = [predicted_y[cohort_index] for cohort_index
                                  in cohort_indices]
            prompts_cohort = [prompts[cohort_index] for cohort_index
                              in cohort_indices]
            try:
                if true_y_cohort is not None:
                    exact_match = evaluate.load('exact_match')
                    cohort_metrics['exact_match'] = exact_match.compute(
                        predictions=predicted_y_cohort,
                        references=true_y_cohort)

                cohort_metrics['coherence'] = get_genai_metric_mean(
                    'coherence',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                if true_y_cohort is not None:
                    cohort_metrics['equivalence'] = get_genai_metric_mean(
                        'equivalence',
                        predictions=predicted_y_cohort,
                        references=prompts_cohort,
                        answers=true_y_cohort,
                        wrapper_model=self._eval_model)

                cohort_metrics['fluency'] = get_genai_metric_mean(
                    'fluency',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                cohort_metrics['groundedness'] = get_genai_metric_mean(
                    'groundedness',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                cohort_metrics['relevance'] = get_genai_metric_mean(
                    'relevance',
                    predictions=predicted_y_cohort,
                    references=prompts_cohort,
                    wrapper_model=self._eval_model)

                all_cohort_metrics.append(cohort_metrics)
            except ValueError:
                all_cohort_metrics.append({})
        return all_cohort_metrics
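compute_genai_metrics receives one list of row indices per dashboard cohort and slices prompts and predictions accordingly before scoring. A self-contained sketch of just the slicing step, with made-up data:

prompts = ['Capital of France?', 'Colour of the sky?', 'Meaning of life?']
predicted_y = ['Paris.', 'Blue, usually.', 'Forty-two.']
selection_indexes = [[0, 1, 2], [2]]   # e.g. an "all data" cohort plus a smaller one

for cohort_indices in selection_indexes:
    # same list-comprehension pattern as the method above
    prompts_cohort = [prompts[i] for i in cohort_indices]
    predicted_y_cohort = [predicted_y[i] for i in cohort_indices]
    print(len(cohort_indices), prompts_cohort, predicted_y_cohort)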
@@ -12,7 +12,8 @@ from negspacy.termsets import termset
from tqdm import tqdm

from nlp_feature_extractors import attribute_extractors as exts
-from responsibleai_text.common.constants import (ModelTask,
+from responsibleai_text.common.constants import (GenerativeTextFields,
+                                                  ModelTask,
                                                  QuestionAnsweringFields)

nlp = None
@@ -60,6 +61,9 @@ def extract_features(text_dataset: pd.DataFrame,
        feature_names.append(prefix + "maximum_parse_tree_depth")
        feature_names.append("question_type")
        feature_names.append("context_overlap")
    elif task_type == ModelTask.GENERATIVE_TEXT:
        start_meta_index = 0
        feature_names = base_feature_names
    else:
        raise ValueError("Unknown task type: {}".format(task_type))
    # copy over the metadata column names
@@ -96,6 +100,19 @@ def extract_features(text_dataset: pd.DataFrame,
            context_overlap = get_context_overlap(context=context,
                                                  question=question)
            extracted_features.append(context_overlap)
            # append all other metadata features
            append_metadata_values(start_meta_index, text_dataset, i,
                                   extracted_features, has_dropped_features,
                                   dropped_features, column_names)
            results.append(extracted_features)
    elif task_type == ModelTask.GENERATIVE_TEXT:
        for i, row in tqdm(text_features.iterrows(),
                           desc='feature extraction'):
            extracted_features = []
            add_extracted_features_for_sentence(
                row[GenerativeTextFields.PROMPT], extracted_features,
                task_type)

            # append all other metadata features
            append_metadata_values(start_meta_index, text_dataset, i,
                                   extracted_features, has_dropped_features,
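The generative-text branch reuses the base sentence features and iterates only over the prompt column. A stand-in for that loop, where the word-count feature is an illustrative placeholder for the real extractors:

import pandas as pd

text_features = pd.DataFrame(
    {'prompt': ['Explain HTTP.', 'Write a haiku about rain.']})
results = []
for i, row in text_features.iterrows():
    # placeholder per-row feature; the package computes richer sentence features
    extracted_features = [len(row['prompt'].split())]
    results.append(extracted_features)
print(results)  # [[2], [5]]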
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Contains the GenAI metrics."""
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Contains the implementation of various metrics for GenAI."""
@@ -24,7 +24,7 @@ def _compute_metric(template, logger, wrapper_model, **kwargs):
    templated_ques = format_str(template, **kwargs)

    inp = pd.DataFrame({
-        'questions': templated_ques,
+        'prompt': templated_ques,
        'sys_prompt': _SYS_PROMPT})

    responses = wrapper_model.predict(inp)
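After this rename, the evaluation wrapper receives the templated question under a 'prompt' column alongside the shared system prompt, instead of 'questions'. A sketch of the resulting frame with placeholder text standing in for the real template and system prompt:

import pandas as pd

_SYS_PROMPT = 'You are an AI assistant...'   # placeholder for the real constant
templated_ques = ['Rate the coherence of the ANSWER to the QUESTION on a 1-5 scale ...']
inp = pd.DataFrame({
    'prompt': templated_ques,   # column was named 'questions' before this change
    'sys_prompt': _SYS_PROMPT})
print(inp.columns.tolist())     # ['prompt', 'sys_prompt']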
@@ -48,6 +48,15 @@ Five stars: the predicted answer is completely similar to the correct answer

This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars

QUESTION:
{question}
@@ -47,6 +47,15 @@ Five stars: the answer has perfect fluency

This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars

QUESTION:
{question}
@@ -47,6 +47,18 @@ the CONTEXT entails.
Note the ANSWER is generated by a computer system, it can contain certain \
symbols, which should not be a negative factor in the evaluation.

This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars

CONTEXT:
{context}

@@ -47,6 +47,15 @@ Five stars: the answer has perfect relevance

This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars

QUESTION AND CONTEXT:
{question}
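The examples added to each template steer the judge model toward emitting a bare integer between 1 and 5. A small parser illustrating the format being enforced; the regex and fallback value are assumptions, not code from this repository:

import re

def parse_rating(response, default=0):
    # accept only a lone digit 1-5, optionally surrounded by whitespace
    match = re.fullmatch(r'\s*([1-5])\s*', response)
    return int(match.group(1)) if match else default

for raw in ['5', ' 3 ', '1/5', '1.5', '3.0', '5 stars']:
    print(repr(raw), '->', parse_rating(raw))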