diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 6dac94564c8e..db792cd06f95 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -4,6 +4,7 @@ ## 1.0.0b5 (Unreleased) ### Features Added +- Added `GroundednessProEvaluator`, which is a service-based evaluator for determining response groundedness. - Groundedness detection in Non Adversarial Simulator via query/context pairs ```python import importlib.resources as pkg_resources diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json index 8e5b702580a1..637c5f184670 100644 --- a/sdk/evaluation/azure-ai-evaluation/assets.json +++ b/sdk/evaluation/azure-ai-evaluation/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/evaluation/azure-ai-evaluation", - "Tag": "python/evaluation/azure-ai-evaluation_1390701e9d" + "Tag": "python/evaluation/azure-ai-evaluation_5551827d25" } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index 87c7ac51de51..f1d59bf13b24 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -16,6 +16,7 @@ from ._evaluators._fluency import FluencyEvaluator from ._evaluators._gleu import GleuScoreEvaluator from ._evaluators._groundedness import GroundednessEvaluator +from ._evaluators._service_groundedness import GroundednessProEvaluator from ._evaluators._meteor import MeteorScoreEvaluator from ._evaluators._protected_material import ProtectedMaterialEvaluator from ._evaluators._qa import QAEvaluator @@ -40,6 +41,7 @@ "F1ScoreEvaluator", "FluencyEvaluator", "GroundednessEvaluator", + "GroundednessProEvaluator", "RelevanceEvaluator", "SimilarityEvaluator", "QAEvaluator", diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py index c2fb55ed4c2c..8b3746f7466c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py @@ -35,6 +35,7 @@ class Tasks: CONTENT_HARM = "content harm" PROTECTED_MATERIAL = "protected material" XPIA = "xpia" + GROUNDEDNESS = "groundedness" class _InternalAnnotationTasks: @@ -56,6 +57,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta): SEXUAL = "sexual" PROTECTED_MATERIAL = "protected_material" XPIA = "xpia" + GROUNDEDNESS = "generic_groundedness" class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py index 20e7580bde63..cfbed826a302 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py @@ -7,8 +7,9 @@ import re import time from ast import literal_eval -from typing import Any, Dict, List, Optional, Union, cast +from typing import Dict, List, Optional, Union, cast from urllib.parse import urlparse +from string import Template import jwt @@ -23,7 +24,6 @@ EvaluationMetrics, RAIService, Tasks, - 
_InternalAnnotationTasks, _InternalEvaluationMetrics, ) from .utils import get_harm_severity_level @@ -34,6 +34,11 @@ version = "unknown" USER_AGENT = "{}/{}".format("azure-ai-evaluation", version) +USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = { + "DEFAULT": Template("{$query}{$response}"), + Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'), +} + def get_common_headers(token: str) -> Dict: """Get common headers for the HTTP request @@ -99,7 +104,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability: ) -def generate_payload(normalized_user_text: str, metric: str) -> Dict: +def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict: """Generate the payload for the annotation request :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload. @@ -107,19 +112,18 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict: :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed in the payload. :type metric: str + :param annotation_task: The annotation task to be passed to service + :type annotation_task: str :return: The payload for the annotation request. :rtype: Dict """ include_metric = True - task = Tasks.CONTENT_HARM + task = annotation_task if metric == EvaluationMetrics.PROTECTED_MATERIAL: - task = Tasks.PROTECTED_MATERIAL include_metric = False elif metric == _InternalEvaluationMetrics.ECI: - task = _InternalAnnotationTasks.ECI include_metric = False elif metric == EvaluationMetrics.XPIA: - task = Tasks.XPIA include_metric = False return ( { @@ -135,25 +139,25 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict: ) -async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str: +async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str: """Submit request to Responsible AI service for evaluation and return operation ID - :param query: The query to evaluate. - :type query: str - :param response: The response to evaluate. - :type response: str + :param data: The data to evaluate. + :type data: dict :param metric: The evaluation metric to use. :type metric: str :param rai_svc_url: The Responsible AI service URL. :type rai_svc_url: str :param token: The Azure authentication token. :type token: str + :param annotation_task: The annotation task to use. + :type annotation_task: str :return: The operation ID. 
:rtype: str """ - user_text = f"{query}{response}" + user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**data) normalized_user_text = user_text.replace("'", '\\"') - payload = generate_payload(normalized_user_text, metric) + payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task) url = rai_svc_url + "/submitannotation" headers = get_common_headers(token) @@ -164,7 +168,6 @@ async def submit_request(query: str, response: str, metric: str, rai_svc_url: st if http_response.status_code != 202: print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text())) http_response.raise_for_status() - result = http_response.json() operation_id = result["location"].split("/")[-1] return operation_id @@ -208,7 +211,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre def parse_response( # pylint: disable=too-many-branches,too-many-statements - batch_response: List[Dict], metric_name: str + batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None ) -> Dict[str, Union[str, float]]: """Parse the annotation response from Responsible AI service for a content harm evaluation. @@ -216,11 +219,20 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements :type batch_response: List[Dict] :param metric_name: The evaluation metric to use. :type metric_name: str + :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name. + :type metric_display_name: Optional[str] :return: The parsed annotation result. :rtype: Dict[str, Union[str, float]] """ + if metric_display_name is None: + metric_display_name = metric_name + # non-numeric metrics - if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}: + if metric_name in { + EvaluationMetrics.PROTECTED_MATERIAL, + _InternalEvaluationMetrics.ECI, + EvaluationMetrics.XPIA, + }: if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]: return {} response = batch_response[0][metric_name] @@ -230,38 +242,42 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements result = {} # Use label instead of score since these are assumed to be boolean results. # Use math.nan as null value since it's ignored by aggregations rather than treated as 0. 
- result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan - result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else "" + result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan + result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else "" if metric_name == EvaluationMetrics.XPIA: # Add "manipulated_content", "intrusion" and "information_gathering" to the result # if present else set them to math.nan - result[metric_name + "_manipulated_content"] = ( + result[metric_display_name + "_manipulated_content"] = ( parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan ) - result[metric_name + "_intrusion"] = ( + result[metric_display_name + "_intrusion"] = ( parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan ) - result[metric_name + "_information_gathering"] = ( + result[metric_display_name + "_information_gathering"] = ( parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan ) return result - return _parse_content_harm_response(batch_response, metric_name) + return _parse_content_harm_response(batch_response, metric_name, metric_display_name) -def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]: +def _parse_content_harm_response( + batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None +) -> Dict[str, Union[str, float]]: """Parse the annotation response from Responsible AI service for a content harm evaluation. :param batch_response: The annotation response from Responsible AI service. :type batch_response: List[Dict] :param metric_name: The evaluation metric to use. :type metric_name: str + :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name. + :type metric_display_name: Optional[str] :return: The parsed annotation result. :rtype: Dict[str, Union[str, float]] """ # Fix the metric name if it's "hate_fairness" # Eventually we will remove this fix once the RAI service is updated - key = metric_name + key = metric_name if metric_display_name is None else metric_display_name if key == EvaluationMetrics.HATE_FAIRNESS: key = EvaluationMetrics.HATE_UNFAIRNESS @@ -283,7 +299,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) - # get content harm metric_value if "label" in harm_response: - metric_value = harm_response["label"] + metric_value = float(harm_response["label"]) elif "valid" in harm_response: metric_value = 0 if harm_response["valid"] else math.nan else: @@ -412,14 +428,17 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] async def evaluate_with_rai_service( - query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential -) -> Dict[str, Any]: + data: dict, + metric_name: str, + project_scope: AzureAIProject, + credential: TokenCredential, + annotation_task: str = Tasks.CONTENT_HARM, + metric_display_name=None, +) -> Dict[str, Union[str, float]]: """ "Evaluate the content safety of the response using Responsible AI service - :param query: The query to evaluate. - :type query: str - :param response: The response to evaluate. - :type response: str + :param data: The data to evaluate. 
+    :type data: dict
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
@@ -427,6 +446,10 @@ async def evaluate_with_rai_service(
     :param credential: The Azure authentication credential.
     :type credential: ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of the metric to use.
+    :type metric_display_name: str
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
@@ -434,11 +457,11 @@ async def evaluate_with_rai_service(
     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
     rai_svc_url = await get_rai_svc_url(project_scope, token)
-    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    await ensure_service_availability(rai_svc_url, token, annotation_task)

     # Submit annotation request and fetch result
-    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
     annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-    result = parse_response(annotation_response, metric_name)
+    result = parse_response(annotation_response, metric_name, metric_display_name)

     return result
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py
index 6b2b34549de9..61d20277e8f6 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py
@@ -24,7 +24,7 @@
 T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any])


-def get_harm_severity_level(harm_score: int) -> Union[str, float]:
+def get_harm_severity_level(harm_score: Union[float, int]) -> Union[str, float]:
     """Generate harm severity level based on harm score.

     :param harm_score: The harm score to be evaluated.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index b5aa3fcc0a59..689ce162122a 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -36,6 +36,14 @@
 TClient = TypeVar("TClient", ProxyClient, CodeClient)

+# For metrics (aggregates) whose metric names intentionally differ from their
+# originating column name, usually because the aggregation of the original value
+# means something sufficiently different.
+# Note that content safety metrics are handled separately.
+METRIC_COLUMN_NAME_REPLACEMENTS = {
+    "groundedness_pro_label": "groundedness_pro_passing_rate",
+}
+

 class __EvaluatorInfo(TypedDict):
     result: pd.DataFrame
@@ -43,6 +51,32 @@ class __EvaluatorInfo(TypedDict):
     run_summary: Dict[str, Any]

+
+def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Identify and average various metrics that need to have the metric name be replaced,
+    instead of having the metric match the originating column name.
+    :param df: The dataframe of evaluation results.
+ :type df: ~pandas.DataFrame + :return: A tuple; the first element is a list of dataframe columns that were aggregated, + and the second element is a dictionary of resultant new metric column names and their values. + :rtype: Tuple[List[str], Dict[str, float]] + """ + renamed_cols = [] + metric_columns = {} + for col in df.columns: + metric_prefix = col.split(".")[0] + metric_name = col.split(".")[1] + if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS: + renamed_cols.append(col) + new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name] + col_with_numeric_values = pd.to_numeric(df[col], errors="coerce") + metric_columns[new_col_name] = round( + list_sum(col_with_numeric_values) / col_with_numeric_values.count(), + 2, + ) + + return renamed_cols, metric_columns + + # pylint: disable=line-too-long def _aggregate_content_safety_metrics( df: pd.DataFrame, evaluators: Dict[str, Callable] @@ -146,8 +180,11 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic # Rename certain columns as defect rates if we know that's what their aggregates represent # Content safety metrics content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators) + other_renamed_cols, renamed_cols = _aggregate_other_metrics(df) handled_columns.extend(content_safety_cols) + handled_columns.extend(other_renamed_cols) defect_rates.update(cs_defect_rates) + defect_rates.update(renamed_cols) # Label-based (true/false) metrics where 'true' means 'something is wrong' label_cols, label_defect_rates = _aggregate_label_defect_metrics(df) handled_columns.extend(label_cols) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py index 23c753523f8c..28d00f7977b6 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py @@ -1,11 +1,16 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Dict, Optional, TypeVar, Union +from typing import Dict, Optional, Union from typing_extensions import override -from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics +from azure.ai.evaluation._common.constants import ( + EvaluationMetrics, + _InternalEvaluationMetrics, + Tasks, + _InternalAnnotationTasks, +) from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service from azure.ai.evaluation._common.utils import validate_azure_ai_project from azure.ai.evaluation._exceptions import EvaluationException @@ -13,7 +18,7 @@ from . import EvaluatorBase -T = TypeVar("T") +T = Union[str, float] class RaiServiceEvaluatorBase(EvaluatorBase[T]): @@ -89,10 +94,43 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]: + " This should have failed earlier." ), ) + input_data = {"query": query, "response": response} + + if "context" in self._singleton_inputs: + context = eval_input.get("context", None) + if context is None: + raise EvaluationException( + message="Not implemented", + internal_message=( + "Attempted context-based evaluation without supplying context." + + " This should have failed earlier." 
+                    ),
+                )
+            input_data["context"] = context
+
         return await evaluate_with_rai_service(
             metric_name=self._eval_metric,
-            query=query,
-            response=response,
+            data=input_data,
             project_scope=self._azure_ai_project,
             credential=self._credential,
+            annotation_task=self._get_task(),
         )
+
+    def _get_task(self):
+        """Get the annotation task for the current evaluation metric.
+        The annotation task is used by the RAI service script to determine the message format
+        of the API call, and how the output is processed, among other things.
+
+        :return: The annotation task for the evaluator's self._eval_metric value.
+        :rtype: ~azure.ai.evaluation._common.constants.Tasks
+
+        """
+        if self._eval_metric == EvaluationMetrics.GROUNDEDNESS:
+            return Tasks.GROUNDEDNESS
+        if self._eval_metric == EvaluationMetrics.XPIA:
+            return Tasks.XPIA
+        if self._eval_metric == _InternalEvaluationMetrics.ECI:
+            return _InternalAnnotationTasks.ECI
+        if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
+            return Tasks.PROTECTED_MATERIAL
+        return Tasks.CONTENT_HARM
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
index 3f56a7488684..8384024fc813 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
@@ -68,9 +68,10 @@ class ContentSafetyEvaluator(EvaluatorBase):
         }
     """

+    # TODO address 3579092 to re-enable parallel evals.
     def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs):
         super().__init__(eval_last_turn=eval_last_turn)
-        self._parallel = kwargs.pop("parallel", True)
+        self._parallel = kwargs.pop("parallel", False)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceEvaluator(credential, azure_ai_project),
             SexualEvaluator(credential, azure_ai_project),
@@ -118,6 +119,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
         response = eval_input.get("response", None)
         conversation = eval_input.get("conversation", None)
         results: Dict[str, Union[str, float]] = {}
+        # TODO fix this to not explode on empty optional inputs (PF SDK error)
         if self._parallel:
             with ThreadPoolExecutor() as executor:
                 # pylint: disable=no-value-for-parameter
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
index ae078563dbbd..551624101661 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional, Union
+from typing import Optional

 from typing_extensions import override

@@ -11,7 +11,7 @@

 @experimental
-class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
+class HateUnfairnessEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a hate-unfairness evaluator for hate unfairness score.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index ea1f057e2fcf..39e1d311063c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Optional from typing_extensions import override @@ -11,7 +11,7 @@ @experimental -class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): +class SelfHarmEvaluator(RaiServiceEvaluatorBase): """ Initialize a self harm evaluator for self harm score. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index 151874535a5e..ad7e875b3910 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Optional from typing_extensions import override @@ -11,7 +11,7 @@ @experimental -class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): +class SexualEvaluator(RaiServiceEvaluatorBase): """ Initialize a sexual evaluator for sexual score. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 44a9b3abf0bf..ab03d551d214 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Optional from typing_extensions import override @@ -11,7 +11,7 @@ @experimental -class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): +class ViolenceEvaluator(RaiServiceEvaluatorBase): """ Initialize a violence evaluator for violence score. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py index cd2e6853ac29..3e6e420e9305 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py @@ -9,7 +9,7 @@ @experimental -class ECIEvaluator(RaiServiceEvaluatorBase[str]): +class ECIEvaluator(RaiServiceEvaluatorBase): """ Initialize an ECI evaluator to evaluate ECI based on the following guidelines: Detects whether ECI is present without a disclaimer in the AI system’s response. 
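To make the `annotation_task` plumbing above concrete: `rai_service.py` now selects a `string.Template` per task from `USER_TEXT_TEMPLATE_DICT` and substitutes the evaluator inputs into it before submitting the annotation request. The following standalone sketch mirrors that behavior; the literal task string `"groundedness"` matches the `Tasks.GROUNDEDNESS` value added in this diff, while the `build_user_text` helper is purely illustrative and stands in for the real `submit_request` path.

```python
from string import Template

# Mirrors USER_TEXT_TEMPLATE_DICT from rai_service.py (templates copied from the diff above).
USER_TEXT_TEMPLATE_DICT = {
    "DEFAULT": Template("{$query}{$response}"),
    "groundedness": Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
}


def build_user_text(data: dict, annotation_task: str) -> str:
    """Pick the template for the annotation task and substitute the evaluation inputs into it."""
    template = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"])
    return template.substitute(**data)


# The groundedness task sends query/response/context as a JSON-style blob:
print(build_user_text(
    {"query": "What shape has 3 sides?", "response": "A triangle", "context": "A triangle has 3 sides."},
    "groundedness",
))

# Any other task falls back to the default query/response form:
print(build_user_text({"query": "What is 2 + 2?", "response": "4"}, "content harm"))
```

Only the groundedness task needs the JSON-style blob with a `context` field; every other metric falls through to the `DEFAULT` template.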
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 36cec30a080e..60e23a9f9e44 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -59,7 +59,7 @@ def __call__( conversation=None, **kwargs, ): - """Evaluate groundedless. Accepts either a response and context a single evaluation, + """Evaluate groundedness. Accepts either a response and context a single evaluation, or a conversation for a multi-turn evaluation. If the conversation has more than one turn, the evaluator will aggregate the results of each turn. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 0ead00125c3d..92ae2a3e98c0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import Optional, Union +from typing import Optional from typing_extensions import override @@ -12,7 +12,7 @@ @experimental -class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): +class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase): """ Initialize a protected material evaluator to detect whether protected material is present in your AI system's response. Outputs True or False with AI-generated reasoning. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index e895dd9bd6dd..e34f4f8f0211 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -110,15 +110,13 @@ class RetrievalEvaluator: .. code-block:: python - chat_eval = RetrievalScoreEvaluator(model_config) + chat_eval = RetrievalEvaluator(model_config) conversation = { "messages": [ {"role": "user", "content": "What is the value of 2 + 2?"}, - {"role": "assistant", "content": "2 + 2 = 4", "context": { - "citations": [ - {"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"} - ] - } + { + "role": "assistant", "content": "2 + 2 = 4", + "context": "From 'math_doc.md': Information about additions: 1 + 2 = 3, 2 + 2 = 4" } ] } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py new file mode 100644 index 000000000000..fad50fd1a2c6 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py @@ -0,0 +1,9 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# ---------------------------------------------------------
+
+from ._service_groundedness import GroundednessProEvaluator
+
+__all__ = [
+    "GroundednessProEvaluator",
+]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py
new file mode 100644
index 000000000000..83780f6506ef
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py
@@ -0,0 +1,150 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Optional, Dict
+from typing_extensions import override
+
+from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+
+
+@experimental
+class GroundednessProEvaluator(RaiServiceEvaluatorBase):
+    """
+    Initialize a Groundedness Pro evaluator for determining whether the response is grounded
+    in the query and context.
+
+    If this evaluator is supplied to the `evaluate` function, the aggregated metric
+    for the groundedness pro label will be "groundedness_pro_passing_rate".
+
+    :param credential: The credential for connecting to the Azure AI project. Required.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param azure_ai_project: The scope of the Azure AI project.
+        It contains subscription id, resource group, and project name.
+    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+
+    **Usage**
+
+    .. code-block:: python
+
+        azure_ai_project = {
+            "subscription_id": "",
+            "resource_group_name": "",
+            "project_name": "",
+        }
+        credential = DefaultAzureCredential()
+
+        eval_fn = GroundednessProEvaluator(credential, azure_ai_project)
+        result = eval_fn(query="What's the capital of France", response="Paris", context="Paris.")
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "groundedness_pro_label": True,
+            "groundedness_pro_reason": "All contents are grounded"
+        }
+
+    **Usage with conversation input**
+
+    .. code-block:: python
+
+        azure_ai_project = {
+            "subscription_id": "",
+            "resource_group_name": "",
+            "project_name": "",
+        }
+        credential = DefaultAzureCredential()
+
+        eval_fn = GroundednessProEvaluator(credential, azure_ai_project)
+        conversation = {
+            "messages": [
+                {"role": "user", "content": "What is the capital of France?"},
+                {"role": "assistant", "content": "Paris.", "context": "Paris."},
+                {"role": "user", "content": "What is the capital of Germany?"},
+                {"role": "assistant", "content": "Berlin.", "context": "Berlin."}
+            ]
+        }
+        result = eval_fn(conversation=conversation)
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "groundedness_pro_label": 1.0,
+            "evaluation_per_turn": {
+                "groundedness_pro_label": [True, True],
+                "groundedness_pro_reason": ["All contents are grounded", "All contents are grounded"]
+            }
+        }
+    """
+
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project,
+        **kwargs,
+    ):
+        self._passing_score = 3  # TODO update once the binarization PR is merged
+        self._output_prefix = "groundedness_pro"
+        super().__init__(
+            eval_metric=EvaluationMetrics.GROUNDEDNESS,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            **kwargs,
+        )
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        context: Optional[str] = None,
+        conversation=None,
+        **kwargs,
+    ):
+        """Evaluate groundedness. Accepts either a query, response and context for a single-turn evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn, with the per-turn results available
+        in the output under the "evaluation_per_turn" key.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: Optional[str]
+        :keyword response: The response to be evaluated.
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The groundedness pro result.
+        :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+        """
+        return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict):
+        """This evaluator has some unique post-processing that requires data that
+        the rai_service script is not currently built to handle, so we post-process
+        the result here again to massage it into the right form.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        result = await super()._do_eval(eval_input)
+        real_result = {}
+        real_result[self._output_prefix + "_label"] = (
+            result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self._passing_score
+        )
+        real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
+        return real_result
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py
index 8db3b777f23c..703fa31cf70b 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py
@@ -2,7 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
# --------------------------------------------------------- import logging -from typing import Optional, Union +from typing import Optional from typing_extensions import override @@ -14,7 +14,7 @@ @experimental -class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): +class IndirectAttackEvaluator(RaiServiceEvaluatorBase): """A Cross-Domain Prompt Injection Attack (XPIA) jailbreak evaluator. Detect whether cross domain injected attacks are present in your AI system's response. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py index 835f623612ed..b6fcca19fb29 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py @@ -226,6 +226,7 @@ async def _simulate_with_predefined_turns( semaphore = asyncio.Semaphore(concurrent_async_tasks) progress_bar_lock = asyncio.Lock() + async def run_simulation(simulation: List[Union[str, Dict[str, Any]]]) -> JsonLineChatProtocol: async with semaphore: current_simulation = ConversationHistory() diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_conversation.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_conversation.jsonl new file mode 100644 index 000000000000..7a52cebbf2f3 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_conversation.jsonl @@ -0,0 +1,2 @@ +{"conversation" : {"context" : "", "messages": [{"content": "What shape has 3 sides", "role" :"user", "context": null}, {"content": "A triangle", "role" :"assistant", "context": "The answer is a triangle."}, {"content": "Next, what shape has 4 sides", "role" :"user", "context": null}, {"content": "A square", "role" :"assistant", "context": "The answer is a square."}]}} +{"conversation" : {"context" : "User wants to know about state capitals", "messages": [{"content": "What is the capital of Hawaii", "role" :"user", "context": "User wants to know the capital of Hawaii"}, {"content": "Honolulu", "role" :"assistant", "context": "The answer is a Honolulu."}, {"content": "Ok, what is the capital of Massachusetts", "role" :"user", "context": "User wants to know the capital of Massachusetts."}, {"content": "Boston", "role" :"assistant", "context": "The answer is Boston."}]}} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py index 5c27e902f1e3..68c6c3b72768 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py @@ -24,6 +24,7 @@ SexualEvaluator, SimilarityEvaluator, ViolenceEvaluator, + GroundednessProEvaluator, ) from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator @@ -199,7 +200,7 @@ def test_content_safety_evaluator_violence(self, project_scope, azure_cred, simp assert score2["violence_score"] > 0 assert score2["evaluation_per_turn"]["violence_score"][0] == 0 assert score2["evaluation_per_turn"]["violence_score"][1] > 0 - assert score2["evaluation_per_turn"]["violence"] == ["Very low", "Medium"] + assert score2["evaluation_per_turn"]["violence"] == ["Very low", "High"] assert all(score2["evaluation_per_turn"]["violence_reason"]), "violence_reason must not be None or empty." 
def test_content_safety_evaluator_sexual(self, project_scope, azure_cred, simple_conversation): @@ -246,8 +247,8 @@ def test_content_safety_evaluator_self_harm(self, project_scope, azure_cred, sim # Test eval_last_turn behavior eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True) score3 = eval_fn_last(conversation=simple_conversation) - assert score3["self_harm"] != "Very low" - assert score3["self_harm_score"] > 0 + assert score3["self_harm"] == "Very low" + assert score3["self_harm_score"] >= 1 assert score3["self_harm_reason"], "self_harm_reason must not be None or empty." def test_content_safety_evaluator_hate_unfairness(self, project_scope, azure_cred, simple_conversation): @@ -521,3 +522,24 @@ def test_xpia_evaluator(self, project_scope, azure_cred, simple_conversation): assert convo_result["xpia_label"] == 0.5 assert convo_result["evaluation_per_turn"]["xpia_label"] == [False, True] assert all(convo_result["evaluation_per_turn"]["xpia_reason"]), "xpia_reason must not be None or empty." + + def test_groundedness_pro_evaluator(self, project_scope, azure_cred, simple_conversation): + ground_eval = GroundednessProEvaluator(azure_cred, project_scope) + result = ground_eval( + query="What shape has 4 equilateral sides?", + response="Rhombus", + context="Rhombus is a shape with 4 equilateral sides.", + ) + + assert result is not None + assert result["groundedness_pro_label"] + assert result["groundedness_pro_reason"] is not None, "groundedness_pro_reason must not be None or empty." + + convo_result = ground_eval(conversation=simple_conversation) + + assert convo_result is not None + assert convo_result["groundedness_pro_label"] == 1.0 + assert convo_result["evaluation_per_turn"]["groundedness_pro_label"] == [True, True] + assert all( + convo_result["evaluation_per_turn"]["groundedness_pro_reason"] + ), "groundedness_pro_reason must not be None or empty." diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py index 30b3e6b6b7e6..948660387773 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py @@ -14,6 +14,7 @@ F1ScoreEvaluator, FluencyEvaluator, GroundednessEvaluator, + GroundednessProEvaluator, evaluate, ) from azure.ai.evaluation._common.math import list_mean_nan_safe @@ -25,6 +26,12 @@ def data_file(): return os.path.join(data_path, "evaluate_test_data.jsonl") +@pytest.fixture +def data_convo_file(): + data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data") + return os.path.join(data_path, "evaluate_test_data_conversation.jsonl") + + @pytest.fixture def questions_file(): data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data") @@ -205,7 +212,36 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1 assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1 - @pytest.mark.performance_test + def test_evaluate_with_groundedness_pro_evaluator(self, project_scope, data_convo_file, azure_cred): + + # CS evaluator tries to store the credential, which breaks multiprocessing at + # pickling stage. So we pass None for credential and let child evals + # generate a default credential at runtime. + # Internal Parallelism is also disabled to avoid faulty recordings. 
+        gp_eval = GroundednessProEvaluator(azure_ai_project=project_scope, credential=azure_cred, parallel=False)
+
+        convo_input_data = pd.read_json(data_convo_file, lines=True)
+        # run the evaluation
+        convo_result = evaluate(
+            data=data_convo_file,
+            evaluators={"groundedness_pro": gp_eval},
+        )
+
+        convo_row_result_df = pd.DataFrame(convo_result["rows"])
+        convo_metrics = convo_result["metrics"]
+        assert convo_row_result_df.shape[0] == len(convo_input_data)
+        assert "outputs.groundedness_pro.groundedness_pro_label" in convo_row_result_df.columns.to_list()
+        assert "outputs.groundedness_pro.evaluation_per_turn" in convo_row_result_df.columns.to_list()
+
+        per_turn_results = convo_row_result_df["outputs.groundedness_pro.evaluation_per_turn"][0]
+        assert "groundedness_pro_label" in per_turn_results.keys()
+        assert "groundedness_pro_reason" in per_turn_results.keys()
+
+        # Check that label is renamed to passing rate in metrics
+        assert "groundedness_pro.groundedness_pro_passing_rate" in convo_metrics.keys()
+        assert 0 <= convo_metrics.get("groundedness_pro.groundedness_pro_passing_rate") <= 1
+
+    # @pytest.mark.performance_test
     @pytest.mark.skip(reason="Temporary skip to merge 37201, will re-enable in subsequent pr")
     def test_evaluate_with_async_enabled_evaluator(self, model_config, data_file):
         os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "true"
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py
index d49596b28518..ca2904ac4f9d 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py
@@ -17,6 +17,7 @@
     get_rai_svc_url,
     parse_response,
     submit_request,
+    Tasks,
 )
 from azure.core.exceptions import HttpResponseError
 from azure.core.rest import AsyncHttpResponse, HttpRequest
@@ -166,11 +167,11 @@ async def test_ensure_service_availability_exception_capability_unavailable(self
     )
     async def test_submit_request(self, client_mock):
         result = await submit_request(
-            query="What is the meaning of life",
-            response="42",
+            data={"query": "What is the meaning of life", "response": "42"},
             metric="points",
             rai_svc_url="www.notarealurl.com",
             token="dummy",
+            annotation_task=Tasks.CONTENT_HARM,
         )
         assert result == "dummy-operation-id"
@@ -186,11 +187,11 @@ async def test_submit_request(self, client_mock):
     async def test_submit_request_not_found(self, client_mock):
         with pytest.raises(HttpResponseError) as exc_info:
             _ = await submit_request(
-                query="What is the meaning of life",
-                response="42",
+                data={"query": "What is the meaning of life", "response": "42"},
                 metric="points",
                 rai_svc_url="www.notarealurl.com",
                 token="dummy",
+                annotation_task=Tasks.CONTENT_HARM,
             )
         assert "Operation returned an invalid status '404 Not Found'" in str(exc_info._excinfo[1])
@@ -250,7 +251,11 @@ async def test_fetch_result_timeout(self, client_mock, mock_token):
     def test_parse_response(self):
         batch_response = [{"not-a-metric": "not-a-value"}]
         metric_name = EvaluationMetrics.HATE_FAIRNESS
-        result = parse_response(batch_response=batch_response, metric_name=metric_name)
+        result = parse_response(
+            batch_response=batch_response,
+            metric_name=metric_name,
+            metric_display_name=EvaluationMetrics.HATE_UNFAIRNESS,
+        )
         assert math.isnan(result[EvaluationMetrics.HATE_UNFAIRNESS])
         assert math.isnan(result[EvaluationMetrics.HATE_UNFAIRNESS + "_score"])
         assert result[EvaluationMetrics.HATE_UNFAIRNESS + "_reason"] == ""
@@ -267,7 +272,7 @@ def test_parse_response(self):
         # This tests ALL of it.
         batch_response[0] = {metric_name: str(response_value)}
-        result = parse_response(batch_response=batch_response, metric_name=metric_name)
+        result = parse_response(batch_response=batch_response, metric_name=metric_name, metric_display_name=metric_name)
         assert result[metric_name] == HarmSeverityLevel.VeryLow.value
         assert result[metric_name + "_score"] == 0
         assert result[metric_name + "_reason"] == response_value["reasoning"]
@@ -277,7 +282,7 @@ def test_parse_response(self):
             "reason": "This is a sample reason.",
         }
         batch_response[0] = {metric_name: str(response_value)}
-        result = parse_response(batch_response=batch_response, metric_name=metric_name)
+        result = parse_response(batch_response=batch_response, metric_name=metric_name, metric_display_name=metric_name)
         assert result[metric_name] == HarmSeverityLevel.VeryLow.value
         assert result[metric_name + "_score"] == 0
         assert result[metric_name + "_reason"] == response_value["output"]["reason"]
@@ -314,7 +319,7 @@ def test_parse_response(self):
         assert math.isnan(result[metric_name + "_score"])

         batch_response[0] = {metric_name: ["still not a number"]}
-        result = parse_response(batch_response=batch_response, metric_name=metric_name)
+        result = parse_response(batch_response=batch_response, metric_name=metric_name, metric_display_name=metric_name)
         assert math.isnan(result[metric_name])
         assert math.isnan(result[metric_name + "_score"])
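Taken together, these changes add a service-based Groundedness Pro evaluator and a renamed aggregate metric. A minimal usage sketch, assuming an existing Azure AI project, a `DefaultAzureCredential` that can reach it, and a conversation JSONL file shaped like the test data above (the placeholder project values and the file path are illustrative, not part of this change):

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import GroundednessProEvaluator, evaluate

# Placeholder project details -- substitute your own subscription, resource group, and project.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
credential = DefaultAzureCredential()

groundedness_pro = GroundednessProEvaluator(credential, azure_ai_project)

# Single call: returns a boolean label plus an AI-generated reason.
row_result = groundedness_pro(
    query="What shape has 4 equilateral sides?",
    response="Rhombus",
    context="Rhombus is a shape with 4 equilateral sides.",
)
print(row_result["groundedness_pro_label"], row_result["groundedness_pro_reason"])

# Batch run: per-row labels are averaged and surfaced under the renamed metric
# "groundedness_pro.groundedness_pro_passing_rate" (see METRIC_COLUMN_NAME_REPLACEMENTS).
batch_result = evaluate(
    data="evaluate_test_data_conversation.jsonl",  # illustrative path
    evaluators={"groundedness_pro": groundedness_pro},
)
print(batch_result["metrics"]["groundedness_pro.groundedness_pro_passing_rate"])
```

This mirrors the new e2e tests rather than official documentation: the single call exercises the query/response/context path, and the `evaluate` run shows the per-row `groundedness_pro_label` column being rolled up into the passing-rate metric.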