diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 6dac94564c8e..db792cd06f95 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -4,6 +4,7 @@ ## 1.0.0b5 (Unreleased) ### Features Added +- Added `GroundednessProEvaluator`, which is a service-based evaluator for determining response groundedness. - Groundedness detection in Non Adversarial Simulator via query/context pairs ```python import importlib.resources as pkg_resources diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json index 8e5b702580a1..637c5f184670 100644 --- a/sdk/evaluation/azure-ai-evaluation/assets.json +++ b/sdk/evaluation/azure-ai-evaluation/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/evaluation/azure-ai-evaluation", - "Tag": "python/evaluation/azure-ai-evaluation_1390701e9d" + "Tag": "python/evaluation/azure-ai-evaluation_5551827d25" } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index 87c7ac51de51..f1d59bf13b24 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -16,6 +16,7 @@ from ._evaluators._fluency import FluencyEvaluator from ._evaluators._gleu import GleuScoreEvaluator from ._evaluators._groundedness import GroundednessEvaluator +from ._evaluators._service_groundedness import GroundednessProEvaluator from ._evaluators._meteor import MeteorScoreEvaluator from ._evaluators._protected_material import ProtectedMaterialEvaluator from ._evaluators._qa import QAEvaluator @@ -40,6 +41,7 @@ "F1ScoreEvaluator", "FluencyEvaluator", "GroundednessEvaluator", + "GroundednessProEvaluator", "RelevanceEvaluator", "SimilarityEvaluator", "QAEvaluator", diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py index c2fb55ed4c2c..8b3746f7466c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py @@ -35,6 +35,7 @@ class Tasks: CONTENT_HARM = "content harm" PROTECTED_MATERIAL = "protected material" XPIA = "xpia" + GROUNDEDNESS = "groundedness" class _InternalAnnotationTasks: @@ -56,6 +57,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta): SEXUAL = "sexual" PROTECTED_MATERIAL = "protected_material" XPIA = "xpia" + GROUNDEDNESS = "generic_groundedness" class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py index 20e7580bde63..cfbed826a302 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py @@ -7,8 +7,9 @@ import re import time from ast import literal_eval -from typing import Any, Dict, List, Optional, Union, cast +from typing import Dict, List, Optional, Union, cast from urllib.parse import urlparse +from string import Template import jwt @@ -23,7 +24,6 @@ EvaluationMetrics, RAIService, Tasks, - 
_InternalAnnotationTasks, _InternalEvaluationMetrics, ) from .utils import get_harm_severity_level @@ -34,6 +34,11 @@ version = "unknown" USER_AGENT = "{}/{}".format("azure-ai-evaluation", version) +USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = { + "DEFAULT": Template("{$query}{$response}"), + Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'), +} + def get_common_headers(token: str) -> Dict: """Get common headers for the HTTP request @@ -99,7 +104,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability: ) -def generate_payload(normalized_user_text: str, metric: str) -> Dict: +def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict: """Generate the payload for the annotation request :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload. @@ -107,19 +112,18 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict: :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed in the payload. :type metric: str + :param annotation_task: The annotation task to be passed to service + :type annotation_task: str :return: The payload for the annotation request. :rtype: Dict """ include_metric = True - task = Tasks.CONTENT_HARM + task = annotation_task if metric == EvaluationMetrics.PROTECTED_MATERIAL: - task = Tasks.PROTECTED_MATERIAL include_metric = False elif metric == _InternalEvaluationMetrics.ECI: - task = _InternalAnnotationTasks.ECI include_metric = False elif metric == EvaluationMetrics.XPIA: - task = Tasks.XPIA include_metric = False return ( { @@ -135,25 +139,25 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict: ) -async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str: +async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str: """Submit request to Responsible AI service for evaluation and return operation ID - :param query: The query to evaluate. - :type query: str - :param response: The response to evaluate. - :type response: str + :param data: The data to evaluate. + :type data: dict :param metric: The evaluation metric to use. :type metric: str :param rai_svc_url: The Responsible AI service URL. :type rai_svc_url: str :param token: The Azure authentication token. :type token: str + :param annotation_task: The annotation task to use. + :type annotation_task: str :return: The operation ID. 
:rtype: str """ - user_text = f"{query}{response}" + user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**data) normalized_user_text = user_text.replace("'", '\\"') - payload = generate_payload(normalized_user_text, metric) + payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task) url = rai_svc_url + "/submitannotation" headers = get_common_headers(token) @@ -164,7 +168,6 @@ async def submit_request(query: str, response: str, metric: str, rai_svc_url: st if http_response.status_code != 202: print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text())) http_response.raise_for_status() - result = http_response.json() operation_id = result["location"].split("/")[-1] return operation_id @@ -208,7 +211,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre def parse_response( # pylint: disable=too-many-branches,too-many-statements - batch_response: List[Dict], metric_name: str + batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None ) -> Dict[str, Union[str, float]]: """Parse the annotation response from Responsible AI service for a content harm evaluation. @@ -216,11 +219,20 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements :type batch_response: List[Dict] :param metric_name: The evaluation metric to use. :type metric_name: str + :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name. + :type metric_display_name: Optional[str] :return: The parsed annotation result. :rtype: Dict[str, Union[str, float]] """ + if metric_display_name is None: + metric_display_name = metric_name + # non-numeric metrics - if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}: + if metric_name in { + EvaluationMetrics.PROTECTED_MATERIAL, + _InternalEvaluationMetrics.ECI, + EvaluationMetrics.XPIA, + }: if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]: return {} response = batch_response[0][metric_name] @@ -230,38 +242,42 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements result = {} # Use label instead of score since these are assumed to be boolean results. # Use math.nan as null value since it's ignored by aggregations rather than treated as 0. 
- result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan - result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else "" + result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan + result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else "" if metric_name == EvaluationMetrics.XPIA: # Add "manipulated_content", "intrusion" and "information_gathering" to the result # if present else set them to math.nan - result[metric_name + "_manipulated_content"] = ( + result[metric_display_name + "_manipulated_content"] = ( parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan ) - result[metric_name + "_intrusion"] = ( + result[metric_display_name + "_intrusion"] = ( parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan ) - result[metric_name + "_information_gathering"] = ( + result[metric_display_name + "_information_gathering"] = ( parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan ) return result - return _parse_content_harm_response(batch_response, metric_name) + return _parse_content_harm_response(batch_response, metric_name, metric_display_name) -def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]: +def _parse_content_harm_response( + batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None +) -> Dict[str, Union[str, float]]: """Parse the annotation response from Responsible AI service for a content harm evaluation. :param batch_response: The annotation response from Responsible AI service. :type batch_response: List[Dict] :param metric_name: The evaluation metric to use. :type metric_name: str + :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name. + :type metric_display_name: Optional[str] :return: The parsed annotation result. :rtype: Dict[str, Union[str, float]] """ # Fix the metric name if it's "hate_fairness" # Eventually we will remove this fix once the RAI service is updated - key = metric_name + key = metric_name if metric_display_name is None else metric_display_name if key == EvaluationMetrics.HATE_FAIRNESS: key = EvaluationMetrics.HATE_UNFAIRNESS @@ -283,7 +299,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) - # get content harm metric_value if "label" in harm_response: - metric_value = harm_response["label"] + metric_value = float(harm_response["label"]) elif "valid" in harm_response: metric_value = 0 if harm_response["valid"] else math.nan else: @@ -412,14 +428,17 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] async def evaluate_with_rai_service( - query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential -) -> Dict[str, Any]: + data: dict, + metric_name: str, + project_scope: AzureAIProject, + credential: TokenCredential, + annotation_task: str = Tasks.CONTENT_HARM, + metric_display_name=None, +) -> Dict[str, Union[str, float]]: """ "Evaluate the content safety of the response using Responsible AI service - :param query: The query to evaluate. - :type query: str - :param response: The response to evaluate. - :type response: str + :param data: The data to evaluate. 
+    :type data: dict
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
@@ -427,6 +446,10 @@ async def evaluate_with_rai_service(
     :param credential: The Azure authentication credential.
     :type credential: ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of the metric to use.
+    :type metric_display_name: str
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
@@ -434,11 +457,11 @@ async def evaluate_with_rai_service(
     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
     rai_svc_url = await get_rai_svc_url(project_scope, token)
-    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    await ensure_service_availability(rai_svc_url, token, annotation_task)

     # Submit annotation request and fetch result
-    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
     annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-    result = parse_response(annotation_response, metric_name)
+    result = parse_response(annotation_response, metric_name, metric_display_name)

     return result
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py
index 6b2b34549de9..61d20277e8f6 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py
@@ -24,7 +24,7 @@
 T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any])


-def get_harm_severity_level(harm_score: int) -> Union[str, float]:
+def get_harm_severity_level(harm_score: Union[float, int]) -> Union[str, float]:
     """Generate harm severity level based on harm score.

     :param harm_score: The harm score to be evaluated.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index b5aa3fcc0a59..689ce162122a 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -36,6 +36,14 @@
 TClient = TypeVar("TClient", ProxyClient, CodeClient)

+# For metrics (aggregates) whose metric names intentionally differ from their
+# originating column name, usually because the aggregation of the original value
+# means something sufficiently different.
+# Note that content safety metrics are handled separately.
+METRIC_COLUMN_NAME_REPLACEMENTS = {
+    "groundedness_pro_label": "groundedness_pro_passing_rate",
+}
+

 class __EvaluatorInfo(TypedDict):
     result: pd.DataFrame
@@ -43,6 +51,32 @@ class __EvaluatorInfo(TypedDict):
     run_summary: Dict[str, Any]

+
+def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Identify and average various metrics that need to have the metric name be replaced,
+    instead of having the metric match the originating column name.
+    :param df: The dataframe of evaluation results.
+ :type df: ~pandas.DataFrame + :return: A tuple; the first element is a list of dataframe columns that were aggregated, + and the second element is a dictionary of resultant new metric column names and their values. + :rtype: Tuple[List[str], Dict[str, float]] + """ + renamed_cols = [] + metric_columns = {} + for col in df.columns: + metric_prefix = col.split(".")[0] + metric_name = col.split(".")[1] + if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS: + renamed_cols.append(col) + new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name] + col_with_numeric_values = pd.to_numeric(df[col], errors="coerce") + metric_columns[new_col_name] = round( + list_sum(col_with_numeric_values) / col_with_numeric_values.count(), + 2, + ) + + return renamed_cols, metric_columns + + # pylint: disable=line-too-long def _aggregate_content_safety_metrics( df: pd.DataFrame, evaluators: Dict[str, Callable] @@ -146,8 +180,11 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic # Rename certain columns as defect rates if we know that's what their aggregates represent # Content safety metrics content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators) + other_renamed_cols, renamed_cols = _aggregate_other_metrics(df) handled_columns.extend(content_safety_cols) + handled_columns.extend(other_renamed_cols) defect_rates.update(cs_defect_rates) + defect_rates.update(renamed_cols) # Label-based (true/false) metrics where 'true' means 'something is wrong' label_cols, label_defect_rates = _aggregate_label_defect_metrics(df) handled_columns.extend(label_cols) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py index 23c753523f8c..28d00f7977b6 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py @@ -1,11 +1,16 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Dict, Optional, TypeVar, Union +from typing import Dict, Optional, Union from typing_extensions import override -from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics +from azure.ai.evaluation._common.constants import ( + EvaluationMetrics, + _InternalEvaluationMetrics, + Tasks, + _InternalAnnotationTasks, +) from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service from azure.ai.evaluation._common.utils import validate_azure_ai_project from azure.ai.evaluation._exceptions import EvaluationException @@ -13,7 +18,7 @@ from . import EvaluatorBase -T = TypeVar("T") +T = Union[str, float] class RaiServiceEvaluatorBase(EvaluatorBase[T]): @@ -89,10 +94,43 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]: + " This should have failed earlier." ), ) + input_data = {"query": query, "response": response} + + if "context" in self._singleton_inputs: + context = eval_input.get("context", None) + if context is None: + raise EvaluationException( + message="Not implemented", + internal_message=( + "Attempted context-based evaluation without supplying context." + + " This should have failed earlier." 
+                    ),
+                )
+            input_data["context"] = context
+
         return await evaluate_with_rai_service(
             metric_name=self._eval_metric,
-            query=query,
-            response=response,
+            data=input_data,
             project_scope=self._azure_ai_project,
             credential=self._credential,
+            annotation_task=self._get_task(),
         )
+
+    def _get_task(self):
+        """Get the annotation task for the current evaluation metric.
+        The annotation task is used by the RAI service script to determine the message format
+        of the API call, and how the output is processed, among other things.
+
+        :return: The annotation task for the evaluator's self._eval_metric value.
+        :rtype: ~azure.ai.evaluation._common.constants.Tasks
+
+        """
+        if self._eval_metric == EvaluationMetrics.GROUNDEDNESS:
+            return Tasks.GROUNDEDNESS
+        if self._eval_metric == EvaluationMetrics.XPIA:
+            return Tasks.XPIA
+        if self._eval_metric == _InternalEvaluationMetrics.ECI:
+            return _InternalAnnotationTasks.ECI
+        if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
+            return Tasks.PROTECTED_MATERIAL
+        return Tasks.CONTENT_HARM
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
index 3f56a7488684..8384024fc813 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
@@ -68,9 +68,10 @@ class ContentSafetyEvaluator(EvaluatorBase):
         }
     """

+    # TODO address 3579092 to re-enable parallel evals.
     def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs):
         super().__init__(eval_last_turn=eval_last_turn)
-        self._parallel = kwargs.pop("parallel", True)
+        self._parallel = kwargs.pop("parallel", False)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceEvaluator(credential, azure_ai_project),
             SexualEvaluator(credential, azure_ai_project),
@@ -118,6 +119,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
         response = eval_input.get("response", None)
         conversation = eval_input.get("conversation", None)
         results: Dict[str, Union[str, float]] = {}
+        # TODO fix this to not explode on empty optional inputs (PF SDK error)
         if self._parallel:
             with ThreadPoolExecutor() as executor:
                 # pylint: disable=no-value-for-parameter
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
index ae078563dbbd..551624101661 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional, Union
+from typing import Optional

 from typing_extensions import override

@@ -11,7 +11,7 @@

 @experimental
-class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
+class HateUnfairnessEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a hate-unfairness evaluator for hate unfairness score.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index ea1f057e2fcf..39e1d311063c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Optional from typing_extensions import override @@ -11,7 +11,7 @@ @experimental -class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): +class SelfHarmEvaluator(RaiServiceEvaluatorBase): """ Initialize a self harm evaluator for self harm score. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 151874535a5e..ad7e875b3910 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Optional from typing_extensions import override @@ -11,7 +11,7 @@ @experimental -class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): +class SexualEvaluator(RaiServiceEvaluatorBase): """ Initialize a sexual evaluator for sexual score. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 44a9b3abf0bf..ab03d551d214 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Optional from typing_extensions import override @@ -11,7 +11,7 @@ @experimental -class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): +class ViolenceEvaluator(RaiServiceEvaluatorBase): """ Initialize a violence evaluator for violence score. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py index cd2e6853ac29..3e6e420e9305 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py @@ -9,7 +9,7 @@ @experimental -class ECIEvaluator(RaiServiceEvaluatorBase[str]): +class ECIEvaluator(RaiServiceEvaluatorBase): """ Initialize an ECI evaluator to evaluate ECI based on the following guidelines: Detects whether ECI is present without a disclaimer in the AI system’s response. 
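To make the `annotation_task` plumbing above concrete: `rai_service.py` now selects a `string.Template` per task from `USER_TEXT_TEMPLATE_DICT` and substitutes the evaluator inputs into it before submitting the annotation request. The following standalone sketch mirrors that behavior; the literal task string `"groundedness"` matches the `Tasks.GROUNDEDNESS` value added in this diff, while the `build_user_text` helper is purely illustrative and stands in for the real `submit_request` path.

```python
from string import Template

# Mirrors USER_TEXT_TEMPLATE_DICT from rai_service.py (templates copied from the diff above).
USER_TEXT_TEMPLATE_DICT = {
    "DEFAULT": Template("{$query}{$response}"),
    "groundedness": Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
}


def build_user_text(data: dict, annotation_task: str) -> str:
    """Pick the template for the annotation task and substitute the evaluation inputs into it."""
    template = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"])
    return template.substitute(**data)


# The groundedness task sends query/response/context as a JSON-style blob:
print(build_user_text(
    {"query": "What shape has 3 sides?", "response": "A triangle", "context": "A triangle has 3 sides."},
    "groundedness",
))

# Any other task falls back to the default query/response form:
print(build_user_text({"query": "What is 2 + 2?", "response": "4"}, "content harm"))
```

Only the groundedness task needs the JSON-style blob with a `context` field; every other metric falls through to the `DEFAULT` template.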
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 36cec30a080e..60e23a9f9e44 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -59,7 +59,7 @@ def __call__( conversation=None, **kwargs, ): - """Evaluate groundedless. Accepts either a response and context a single evaluation, + """Evaluate groundedness. Accepts either a response and context a single evaluation, or a conversation for a multi-turn evaluation. If the conversation has more than one turn, the evaluator will aggregate the results of each turn. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 0ead00125c3d..92ae2a3e98c0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Optional from typing_extensions import override @@ -12,7 +12,7 @@ @experimental -class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): +class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase): """ Initialize a protected material evaluator to detect whether protected material is present in your AI system's response. Outputs True or False with AI-generated reasoning. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index e895dd9bd6dd..e34f4f8f0211 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -110,15 +110,13 @@ class RetrievalEvaluator: .. code-block:: python - chat_eval = RetrievalScoreEvaluator(model_config) + chat_eval = RetrievalEvaluator(model_config) conversation = { "messages": [ {"role": "user", "content": "What is the value of 2 + 2?"}, - {"role": "assistant", "content": "2 + 2 = 4", "context": { - "citations": [ - {"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"} - ] - } + { + "role": "assistant", "content": "2 + 2 = 4", + "context": "From 'math_doc.md': Information about additions: 1 + 2 = 3, 2 + 2 = 4" } ] } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py new file mode 100644 index 000000000000..fad50fd1a2c6 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py @@ -0,0 +1,9 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# ---------------------------------------------------------
+
+from ._service_groundedness import GroundednessProEvaluator
+
+__all__ = [
+    "GroundednessProEvaluator",
+]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py
new file mode 100644
index 000000000000..83780f6506ef
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py
@@ -0,0 +1,150 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Optional, Dict
+from typing_extensions import override
+
+from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+
+
+@experimental
+class GroundednessProEvaluator(RaiServiceEvaluatorBase):
+    """
+    Initialize a Groundedness Pro evaluator for determining whether the response is grounded
+    in the query and context.
+
+    If this evaluator is supplied to the `evaluate` function, the aggregated metric
+    for the groundedness pro label will be "groundedness_pro_passing_rate".
+
+    :param credential: The credential for connecting to the Azure AI project. Required.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param azure_ai_project: The scope of the Azure AI project.
+        It contains subscription id, resource group, and project name.
+    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+
+    **Usage**
+
+    .. code-block:: python
+
+        azure_ai_project = {
+            "subscription_id": "",
+            "resource_group_name": "",
+            "project_name": "",
+        }
+        credential = DefaultAzureCredential()
+
+        eval_fn = GroundednessProEvaluator(credential, azure_ai_project)
+        result = eval_fn(query="What's the capital of France", response="Paris", context="Paris.")
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "groundedness_pro_label": True,
+            "groundedness_pro_reason": "All contents are grounded"
+        }
+
+    **Usage with conversation input**
+
+    .. code-block:: python
+
+        azure_ai_project = {
+            "subscription_id": "",
+            "resource_group_name": "",
+            "project_name": "",
+        }
+        credential = DefaultAzureCredential()
+
+        eval_fn = GroundednessProEvaluator(credential, azure_ai_project)
+        conversation = {
+            "messages": [
+                {"role": "user", "content": "What is the capital of France?"},
+                {"role": "assistant", "content": "Paris.", "context": "Paris."},
+                {"role": "user", "content": "What is the capital of Germany?"},
+                {"role": "assistant", "content": "Berlin.", "context": "Berlin."}
+            ]
+        }
+        result = eval_fn(conversation=conversation)
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "groundedness_pro_label": 1.0,
+            "evaluation_per_turn": {
+                "groundedness_pro_label": [True, True],
+                "groundedness_pro_reason": ["All contents are grounded", "All contents are grounded"]
+            }
+        }
+    """
+
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project,
+        **kwargs,
+    ):
+        self._passing_score = 3  # TODO update once the binarization PR is merged
+        self._output_prefix = "groundedness_pro"
+        super().__init__(
+            eval_metric=EvaluationMetrics.GROUNDEDNESS,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            **kwargs,
+        )
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        context: Optional[str] = None,
+        conversation=None,
+        **kwargs,
+    ):
+        """Evaluate groundedness. Accepts either a query, response and context for a single-turn evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn, with the per-turn results available
+        in the output under the "evaluation_per_turn" key.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: Optional[str]
+        :keyword response: The response to be evaluated.
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The groundedness pro result.
+        :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+        """
+        return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict):
+        """This evaluator has some unique post-processing that requires data that
+        the rai_service script is not currently built to handle, so we post-process
+        the result here again to massage it into the right form.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        result = await super()._do_eval(eval_input)
+        real_result = {}
+        real_result[self._output_prefix + "_label"] = (
+            result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self._passing_score
+        )
+        real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
+        return real_result
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py
index 8db3b777f23c..703fa31cf70b 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py
@@ -2,7 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
# --------------------------------------------------------- import logging -from typing import Optional, Union +from typing import Optional from typing_extensions import override @@ -14,7 +14,7 @@ @experimental -class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): +class IndirectAttackEvaluator(RaiServiceEvaluatorBase): """A Cross-Domain Prompt Injection Attack (XPIA) jailbreak evaluator. Detect whether cross domain injected attacks are present in your AI system's response. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py index 835f623612ed..b6fcca19fb29 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py @@ -226,6 +226,7 @@ async def _simulate_with_predefined_turns( semaphore = asyncio.Semaphore(concurrent_async_tasks) progress_bar_lock = asyncio.Lock() + async def run_simulation(simulation: List[Union[str, Dict[str, Any]]]) -> JsonLineChatProtocol: async with semaphore: current_simulation = ConversationHistory() diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_conversation.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_conversation.jsonl new file mode 100644 index 000000000000..7a52cebbf2f3 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_conversation.jsonl @@ -0,0 +1,2 @@ +{"conversation" : {"context" : "", "messages": [{"content": "What shape has 3 sides", "role" :"user", "context": null}, {"content": "A triangle", "role" :"assistant", "context": "The answer is a triangle."}, {"content": "Next, what shape has 4 sides", "role" :"user", "context": null}, {"content": "A square", "role" :"assistant", "context": "The answer is a square."}]}} +{"conversation" : {"context" : "User wants to know about state capitals", "messages": [{"content": "What is the capital of Hawaii", "role" :"user", "context": "User wants to know the capital of Hawaii"}, {"content": "Honolulu", "role" :"assistant", "context": "The answer is a Honolulu."}, {"content": "Ok, what is the capital of Massachusetts", "role" :"user", "context": "User wants to know the capital of Massachusetts."}, {"content": "Boston", "role" :"assistant", "context": "The answer is Boston."}]}} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py index 5c27e902f1e3..68c6c3b72768 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py @@ -24,6 +24,7 @@ SexualEvaluator, SimilarityEvaluator, ViolenceEvaluator, + GroundednessProEvaluator, ) from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator @@ -199,7 +200,7 @@ def test_content_safety_evaluator_violence(self, project_scope, azure_cred, simp assert score2["violence_score"] > 0 assert score2["evaluation_per_turn"]["violence_score"][0] == 0 assert score2["evaluation_per_turn"]["violence_score"][1] > 0 - assert score2["evaluation_per_turn"]["violence"] == ["Very low", "Medium"] + assert score2["evaluation_per_turn"]["violence"] == ["Very low", "High"] assert all(score2["evaluation_per_turn"]["violence_reason"]), "violence_reason must not be None or empty." 
def test_content_safety_evaluator_sexual(self, project_scope, azure_cred, simple_conversation): @@ -246,8 +247,8 @@ def test_content_safety_evaluator_self_harm(self, project_scope, azure_cred, sim # Test eval_last_turn behavior eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True) score3 = eval_fn_last(conversation=simple_conversation) - assert score3["self_harm"] != "Very low" - assert score3["self_harm_score"] > 0 + assert score3["self_harm"] == "Very low" + assert score3["self_harm_score"] >= 1 assert score3["self_harm_reason"], "self_harm_reason must not be None or empty." def test_content_safety_evaluator_hate_unfairness(self, project_scope, azure_cred, simple_conversation): @@ -521,3 +522,24 @@ def test_xpia_evaluator(self, project_scope, azure_cred, simple_conversation): assert convo_result["xpia_label"] == 0.5 assert convo_result["evaluation_per_turn"]["xpia_label"] == [False, True] assert all(convo_result["evaluation_per_turn"]["xpia_reason"]), "xpia_reason must not be None or empty." + + def test_groundedness_pro_evaluator(self, project_scope, azure_cred, simple_conversation): + ground_eval = GroundednessProEvaluator(azure_cred, project_scope) + result = ground_eval( + query="What shape has 4 equilateral sides?", + response="Rhombus", + context="Rhombus is a shape with 4 equilateral sides.", + ) + + assert result is not None + assert result["groundedness_pro_label"] + assert result["groundedness_pro_reason"] is not None, "groundedness_pro_reason must not be None or empty." + + convo_result = ground_eval(conversation=simple_conversation) + + assert convo_result is not None + assert convo_result["groundedness_pro_label"] == 1.0 + assert convo_result["evaluation_per_turn"]["groundedness_pro_label"] == [True, True] + assert all( + convo_result["evaluation_per_turn"]["groundedness_pro_reason"] + ), "groundedness_pro_reason must not be None or empty." diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py index 30b3e6b6b7e6..948660387773 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py @@ -14,6 +14,7 @@ F1ScoreEvaluator, FluencyEvaluator, GroundednessEvaluator, + GroundednessProEvaluator, evaluate, ) from azure.ai.evaluation._common.math import list_mean_nan_safe @@ -25,6 +26,12 @@ def data_file(): return os.path.join(data_path, "evaluate_test_data.jsonl") +@pytest.fixture +def data_convo_file(): + data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data") + return os.path.join(data_path, "evaluate_test_data_conversation.jsonl") + + @pytest.fixture def questions_file(): data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data") @@ -205,7 +212,36 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1 assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1 - @pytest.mark.performance_test + def test_evaluate_with_groundedness_pro_evaluator(self, project_scope, data_convo_file, azure_cred): + + # CS evaluator tries to store the credential, which breaks multiprocessing at + # pickling stage. So we pass None for credential and let child evals + # generate a default credential at runtime. + # Internal Parallelism is also disabled to avoid faulty recordings. 
+        gp_eval = GroundednessProEvaluator(azure_ai_project=project_scope, credential=azure_cred, parallel=False)
+
+        convo_input_data = pd.read_json(data_convo_file, lines=True)
+        # run the evaluation
+        convo_result = evaluate(
+            data=data_convo_file,
+            evaluators={"groundedness_pro": gp_eval},
+        )
+
+        convo_row_result_df = pd.DataFrame(convo_result["rows"])
+        convo_metrics = convo_result["metrics"]
+        assert convo_row_result_df.shape[0] == len(convo_input_data)
+        assert "outputs.groundedness_pro.groundedness_pro_label" in convo_row_result_df.columns.to_list()
+        assert "outputs.groundedness_pro.evaluation_per_turn" in convo_row_result_df.columns.to_list()
+
+        per_turn_results = convo_row_result_df["outputs.groundedness_pro.evaluation_per_turn"][0]
+        assert "groundedness_pro_label" in per_turn_results.keys()
+        assert "groundedness_pro_reason" in per_turn_results.keys()
+
+        # Check that label is renamed to passing rate in metrics
+        assert "groundedness_pro.groundedness_pro_passing_rate" in convo_metrics.keys()
+        assert 0 <= convo_metrics.get("groundedness_pro.groundedness_pro_passing_rate") <= 1
+
+    # @pytest.mark.performance_test
     @pytest.mark.skip(reason="Temporary skip to merge 37201, will re-enable in subsequent pr")
     def test_evaluate_with_async_enabled_evaluator(self, model_config, data_file):
         os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "true"
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py
index d49596b28518..ca2904ac4f9d 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py
@@ -17,6 +17,7 @@
     get_rai_svc_url,
     parse_response,
     submit_request,
+    Tasks,
 )
 from azure.core.exceptions import HttpResponseError
 from azure.core.rest import AsyncHttpResponse, HttpRequest
@@ -166,11 +167,11 @@ async def test_ensure_service_availability_exception_capability_unavailable(self
     )
     async def test_submit_request(self, client_mock):
         result = await submit_request(
-            query="What is the meaning of life",
-            response="42",
+            data={"query": "What is the meaning of life", "response": "42"},
             metric="points",
             rai_svc_url="www.notarealurl.com",
             token="dummy",
+            annotation_task=Tasks.CONTENT_HARM,
         )
         assert result == "dummy-operation-id"
@@ -186,11 +187,11 @@ async def test_submit_request(self, client_mock):
     async def test_submit_request_not_found(self, client_mock):
         with pytest.raises(HttpResponseError) as exc_info:
             _ = await submit_request(
-                query="What is the meaning of life",
-                response="42",
+                data={"query": "What is the meaning of life", "response": "42"},
                 metric="points",
                 rai_svc_url="www.notarealurl.com",
                 token="dummy",
+                annotation_task=Tasks.CONTENT_HARM,
             )
         assert "Operation returned an invalid status '404 Not Found'" in str(exc_info._excinfo[1])
@@ -250,7 +251,11 @@ async def test_fetch_result_timeout(self, client_mock, mock_token):
     def test_parse_response(self):
         batch_response = [{"not-a-metric": "not-a-value"}]
         metric_name = EvaluationMetrics.HATE_FAIRNESS
-        result = parse_response(batch_response=batch_response, metric_name=metric_name)
+        result = parse_response(
+            batch_response=batch_response,
+            metric_name=metric_name,
+            metric_display_name=EvaluationMetrics.HATE_UNFAIRNESS,
+        )
         assert math.isnan(result[EvaluationMetrics.HATE_UNFAIRNESS])
         assert math.isnan(result[EvaluationMetrics.HATE_UNFAIRNESS + "_score"])
         assert result[EvaluationMetrics.HATE_UNFAIRNESS + "_reason"] == ""
@@ -267,7 +272,7 @@ def test_parse_response(self):
         # This tests ALL of it.
         batch_response[0] = {metric_name: str(response_value)}
-        result = parse_response(batch_response=batch_response, metric_name=metric_name)
+        result = parse_response(batch_response=batch_response, metric_name=metric_name, metric_display_name=metric_name)
         assert result[metric_name] == HarmSeverityLevel.VeryLow.value
         assert result[metric_name + "_score"] == 0
         assert result[metric_name + "_reason"] == response_value["reasoning"]
@@ -277,7 +282,7 @@ def test_parse_response(self):
             "reason": "This is a sample reason.",
         }
         batch_response[0] = {metric_name: str(response_value)}
-        result = parse_response(batch_response=batch_response, metric_name=metric_name)
+        result = parse_response(batch_response=batch_response, metric_name=metric_name, metric_display_name=metric_name)
         assert result[metric_name] == HarmSeverityLevel.VeryLow.value
         assert result[metric_name + "_score"] == 0
         assert result[metric_name + "_reason"] == response_value["output"]["reason"]
@@ -314,7 +319,7 @@ def test_parse_response(self):
         assert math.isnan(result[metric_name + "_score"])

         batch_response[0] = {metric_name: ["still not a number"]}
-        result = parse_response(batch_response=batch_response, metric_name=metric_name)
+        result = parse_response(batch_response=batch_response, metric_name=metric_name, metric_display_name=metric_name)
         assert math.isnan(result[metric_name])
         assert math.isnan(result[metric_name + "_score"])
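Taken together, these changes add a service-based Groundedness Pro evaluator and a renamed aggregate metric. A minimal usage sketch, assuming an existing Azure AI project, a `DefaultAzureCredential` that can reach it, and a conversation JSONL file shaped like the test data above (the placeholder project values and the file path are illustrative, not part of this change):

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import GroundednessProEvaluator, evaluate

# Placeholder project details -- substitute your own subscription, resource group, and project.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
credential = DefaultAzureCredential()

groundedness_pro = GroundednessProEvaluator(credential, azure_ai_project)

# Single call: returns a boolean label plus an AI-generated reason.
row_result = groundedness_pro(
    query="What shape has 4 equilateral sides?",
    response="Rhombus",
    context="Rhombus is a shape with 4 equilateral sides.",
)
print(row_result["groundedness_pro_label"], row_result["groundedness_pro_reason"])

# Batch run: per-row labels are averaged and surfaced under the renamed metric
# "groundedness_pro.groundedness_pro_passing_rate" (see METRIC_COLUMN_NAME_REPLACEMENTS).
batch_result = evaluate(
    data="evaluate_test_data_conversation.jsonl",  # illustrative path
    evaluators={"groundedness_pro": groundedness_pro},
)
print(batch_result["metrics"]["groundedness_pro.groundedness_pro_passing_rate"])
```

This mirrors the new e2e tests rather than official documentation: the single call exercises the query/response/context path, and the `evaluate` run shows the per-row `groundedness_pro_label` column being rolled up into the passing-rate metric.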