From 67fc6809a9600934e93a66f30e15e2655f4e65d6 Mon Sep 17 00:00:00 2001
From: huiwengoh <45724323+huiwengoh@users.noreply.github.com>
Date: Thu, 12 Sep 2024 23:25:24 -0400
Subject: [PATCH 1/2] add args to specify allowing null responses

---
 tests/tlm/test_prompt.py     | 31 ++++++++++++++---
 tests/tlm/test_properties.py | 64 +++++++++++++++++++++++++++++-------
 2 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/tests/tlm/test_prompt.py b/tests/tlm/test_prompt.py
index 40956b99..feec92a4 100644
--- a/tests/tlm/test_prompt.py
+++ b/tests/tlm/test_prompt.py
@@ -6,15 +6,35 @@
 from cleanlab_studio.studio.trustworthy_language_model import TLM
 
 
-def is_tlm_response(response: Any) -> bool:
-    """Returns True if the response is a TLMResponse."""
+def is_tlm_response(
+    response: Any,
+    allow_none_response: bool = False,
+    allow_null_trustworthiness_score: bool = False,
+) -> bool:
+    """Returns True if the response is a TLMResponse.
+
+    Args:
+        allow_none_response: If True, allows the response to be None (only allowed for try_prompt)
+        allow_null_trustworthiness_score: If True, allows the trustworthiness_score to be None
+            (only allowed for base preset for models with no perplexity score)
+    """
+    # check if response is allowed to be none
+    if response is None:
+        return allow_none_response
+
     if (
         isinstance(response, dict)
         and "response" in response
         and "trustworthiness_score" in response
-        and isinstance(response["trustworthiness_score"], float)
     ):
-        return 0.0 <= response["trustworthiness_score"] <= 1.0
+        trustworthiness_score = response["trustworthiness_score"]
+
+        # check if trustworthiness score is allowed to be none
+        if trustworthiness_score is None:
+            return allow_null_trustworthiness_score
+
+        return isinstance(trustworthiness_score, float) and 0.0 <= trustworthiness_score <= 1.0
+
     return False
 
 
@@ -26,6 +46,7 @@ def test_single_prompt(tlm: TLM) -> None:
     - Response should be non-None
     - No exceptions are raised
     """
+
     # act -- run a single prompt
     response = tlm.prompt("What is the capital of France?")
 
@@ -93,7 +114,7 @@ def test_batch_try_prompt(tlm: TLM) -> None:
     # - no exceptions are raised (implicit)
     assert response is not None
     assert isinstance(response, list)
-    assert all(r is None or is_tlm_response(r) for r in response)
+    assert all(is_tlm_response(r, allow_none_response=True) for r in response)
 
 
 def test_batch_try_prompt_force_timeouts(tlm: TLM) -> None:
diff --git a/tests/tlm/test_properties.py b/tests/tlm/test_properties.py
index 93cfb40e..20e5b98a 100644
--- a/tests/tlm/test_properties.py
+++ b/tests/tlm/test_properties.py
@@ -13,6 +13,7 @@
 
 excluded_tlm_models = ["claude-3-sonnet", "claude-3.5-sonnet"]
 valid_tlm_models = [model for model in _VALID_TLM_MODELS if model not in excluded_tlm_models]
+models_with_no_perplexity_score = ["claude-3-haiku", "claude-3-sonnet", "claude-3.5-sonnet"]
 
 
 def _test_log(response: Dict[str, Any], options: Dict[str, Any]) -> None:
@@ -35,17 +36,39 @@ def _test_log_batch(responses: Dict[str, Any], options: Dict[str, Any]) -> None:
         _test_log(response, options)
 
 
-def _test_prompt_response(response, options):
+def _test_prompt_response(
+    response,
+    options,
+    allow_none_response=False,
+    allow_null_trustworthiness_score=False,
+):
     """Property tests the responses of a prompt based on the options dictionary and returned responses."""
     assert response is not None
-    assert is_tlm_response(response)
+    assert is_tlm_response(
+        response,
+        allow_none_response=allow_none_response,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
 
     _test_log(response, options)
 
 
-def _test_batch_prompt_response(responses, options):
+def _test_batch_prompt_response(
+    responses,
+    options,
+    allow_none_response=False,
+    allow_null_trustworthiness_score=False,
+):
     """Property tests the responses of a batch prompt based on the options dictionary and returned responses."""
     assert responses is not None
     assert isinstance(responses, list)
+    assert all(
+        is_tlm_response(
+            response,
+            allow_none_response=allow_none_response,
+            allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+        )
+        for response in responses
+    )
 
     _test_log_batch(responses, options)
 
 
@@ -88,15 +111,23 @@ def test_prompt(tlm_dict: Dict[str, Any], model: str, quality_preset: str) -> No
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
+    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
 
     # test prompt with single prompt
     response = tlm.prompt("What is the capital of France?")
-    _test_prompt_response(response, options)
+    _test_prompt_response(
+        response,
+        options,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
 
     # test prompt with batch prompt
     responses = tlm.prompt(["What is the capital of France?", "What is the capital of Ukraine?"])
-    assert all(is_tlm_response(response) for response in responses)
-    _test_batch_prompt_response(responses, options)
+    _test_batch_prompt_response(
+        responses,
+        options,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
 
 
 @pytest.mark.parametrize("model", valid_tlm_models)
@@ -106,10 +137,13 @@ def test_prompt_async(tlm_dict: Dict[str, Any], model: str, quality_preset: str)
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
+    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
 
     # test prompt with single prompt
     response = asyncio.run(_run_prompt_async(tlm, "What is the capital of France?"))
-    _test_prompt_response(response, options)
+    _test_prompt_response(
+        response, options, allow_null_trustworthiness_score=allow_null_trustworthiness_score
+    )
 
     # test prompt with batch prompt
     responses = asyncio.run(
@@ -117,8 +151,11 @@ def test_prompt_async(tlm_dict: Dict[str, Any], model: str, quality_preset: str)
             tlm, ["What is the capital of France?", "What is the capital of Ukraine?"]
         )
     )
-    assert all(is_tlm_response(r) for r in responses)
-    _test_batch_prompt_response(responses, options)
+    _test_batch_prompt_response(
+        responses,
+        options,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
 
 
 @pytest.mark.parametrize("model", valid_tlm_models)
@@ -128,13 +165,18 @@ def test_try_prompt(tlm_dict: Dict[str, Any], model: str, quality_preset: str) -
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
+    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
 
     # test prompt with batch prompt
     responses = tlm.try_prompt(
         ["What is the capital of France?", "What is the capital of Ukraine?"]
     )
-    assert all(response is None or is_tlm_response(response) for response in responses)
-    _test_batch_prompt_response(responses, options)
+    _test_batch_prompt_response(
+        responses,
+        options,
+        allow_none_response=True,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
 
 
 @pytest.mark.parametrize("model", valid_tlm_models)
From 9b9ab2f56e92625829a4d22fa5d37e018bc76d64 Mon Sep 17 00:00:00 2001
From: huiwengoh <45724323+huiwengoh@users.noreply.github.com>
Date: Thu, 12 Sep 2024 23:28:24 -0400
Subject: [PATCH 2/2] only on base preset

---
 tests/tlm/test_properties.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/tlm/test_properties.py b/tests/tlm/test_properties.py
index 20e5b98a..4a660f63 100644
--- a/tests/tlm/test_properties.py
+++ b/tests/tlm/test_properties.py
@@ -111,7 +111,9 @@ def test_prompt(tlm_dict: Dict[str, Any], model: str, quality_preset: str) -> No
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
-    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
+    allow_null_trustworthiness_score = (
+        quality_preset == "base" and model in models_with_no_perplexity_score
+    )
 
     # test prompt with single prompt
     response = tlm.prompt("What is the capital of France?")
@@ -137,7 +139,9 @@ def test_prompt_async(tlm_dict: Dict[str, Any], model: str, quality_preset: str)
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
-    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
+    allow_null_trustworthiness_score = (
+        quality_preset == "base" and model in models_with_no_perplexity_score
+    )
 
     # test prompt with single prompt
     response = asyncio.run(_run_prompt_async(tlm, "What is the capital of France?"))
@@ -165,7 +169,9 @@ def test_try_prompt(tlm_dict: Dict[str, Any], model: str, quality_preset: str) -
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
-    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
+    allow_null_trustworthiness_score = (
+        quality_preset == "base" and model in models_with_no_perplexity_score
+    )
 
     # test prompt with batch prompt
     responses = tlm.try_prompt(
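
Reviewer note: taken together, the two patches make null handling strictly opt-in. Below is a minimal standalone sketch of the resulting validator behavior, assuming both patches are applied; the helper body mirrors patch 1, while the sample payloads are hypothetical illustrations rather than fixtures from the test suite.

from typing import Any

def is_tlm_response(
    response: Any,
    allow_none_response: bool = False,
    allow_null_trustworthiness_score: bool = False,
) -> bool:
    # None responses pass only when explicitly allowed (the try_prompt case)
    if response is None:
        return allow_none_response
    if (
        isinstance(response, dict)
        and "response" in response
        and "trustworthiness_score" in response
    ):
        trustworthiness_score = response["trustworthiness_score"]
        # null scores pass only when allowed (base preset, models with no perplexity score)
        if trustworthiness_score is None:
            return allow_null_trustworthiness_score
        return isinstance(trustworthiness_score, float) and 0.0 <= trustworthiness_score <= 1.0
    return False

# hypothetical payloads exercising each branch
assert is_tlm_response({"response": "Paris", "trustworthiness_score": 0.93})
assert not is_tlm_response({"response": "Paris", "trustworthiness_score": None})
assert is_tlm_response(
    {"response": "Paris", "trustworthiness_score": None},
    allow_null_trustworthiness_score=True,
)
assert not is_tlm_response(None)
assert is_tlm_response(None, allow_none_response=True)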