From 67fc6809a9600934e93a66f30e15e2655f4e65d6 Mon Sep 17 00:00:00 2001
From: huiwengoh <45724323+huiwengoh@users.noreply.github.com>
Date: Thu, 12 Sep 2024 23:25:24 -0400
Subject: [PATCH 1/2] add args to specify allowing null responses

---
 tests/tlm/test_prompt.py     | 31 ++++++++++++++---
 tests/tlm/test_properties.py | 64 +++++++++++++++++++++++++++++-------
 2 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/tests/tlm/test_prompt.py b/tests/tlm/test_prompt.py
index 40956b99..feec92a4 100644
--- a/tests/tlm/test_prompt.py
+++ b/tests/tlm/test_prompt.py
@@ -6,15 +6,35 @@
 from cleanlab_studio.studio.trustworthy_language_model import TLM
 
 
-def is_tlm_response(response: Any) -> bool:
-    """Returns True if the response is a TLMResponse."""
+def is_tlm_response(
+    response: Any,
+    allow_none_response: bool = False,
+    allow_null_trustworthiness_score: bool = False,
+) -> bool:
+    """Returns True if the response is a TLMResponse.
+
+    Args:
+        allow_none_response: If True, allows the response to be None (only allowed for try_prompt)
+        allow_null_trustworthiness_score: If True, allows the trustworthiness_score to be None
+            (only allowed for base preset for models with no perplexity score)
+    """
+    # check if response is allowed to be none
+    if response is None:
+        return allow_none_response
+
     if (
         isinstance(response, dict)
         and "response" in response
         and "trustworthiness_score" in response
-        and isinstance(response["trustworthiness_score"], float)
     ):
-        return 0.0 <= response["trustworthiness_score"] <= 1.0
+        trustworthiness_score = response["trustworthiness_score"]
+
+        # check if trustworthiness score is allowed to be none
+        if trustworthiness_score is None:
+            return allow_null_trustworthiness_score
+
+        return isinstance(trustworthiness_score, float) and 0.0 <= trustworthiness_score <= 1.0
+
     return False
 
 
@@ -26,6 +46,7 @@ def test_single_prompt(tlm: TLM) -> None:
     - Response should be non-None
     - No exceptions are raised
     """
+
     # act -- run a single prompt
     response = tlm.prompt("What is the capital of France?")
 
@@ -93,7 +114,7 @@ def test_batch_try_prompt(tlm: TLM) -> None:
     # - no exceptions are raised (implicit)
     assert response is not None
     assert isinstance(response, list)
-    assert all(r is None or is_tlm_response(r) for r in response)
+    assert all(is_tlm_response(r, allow_none_response=True) for r in response)
 
 
 def test_batch_try_prompt_force_timeouts(tlm: TLM) -> None:
diff --git a/tests/tlm/test_properties.py b/tests/tlm/test_properties.py
index 93cfb40e..20e5b98a 100644
--- a/tests/tlm/test_properties.py
+++ b/tests/tlm/test_properties.py
@@ -13,6 +13,7 @@
 
 excluded_tlm_models = ["claude-3-sonnet", "claude-3.5-sonnet"]
 valid_tlm_models = [model for model in _VALID_TLM_MODELS if model not in excluded_tlm_models]
+models_with_no_perplexity_score = ["claude-3-haiku", "claude-3-sonnet", "claude-3.5-sonnet"]
 
 
 def _test_log(response: Dict[str, Any], options: Dict[str, Any]) -> None:
@@ -35,17 +36,39 @@ def _test_log_batch(responses: Dict[str, Any], options: Dict[str, Any]) -> None:
         _test_log(response, options)
 
 
-def _test_prompt_response(response, options):
+def _test_prompt_response(
+    response,
+    options,
+    allow_none_response=False,
+    allow_null_trustworthiness_score=False,
+):
     """Property tests the responses of a prompt based on the options dictionary and returned responses."""
     assert response is not None
-    assert is_tlm_response(response)
+    assert is_tlm_response(
+        response,
+        allow_none_response=allow_none_response,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
 
     _test_log(response, options)
 
 
-def _test_batch_prompt_response(responses, options):
+def _test_batch_prompt_response(
+    responses,
+    options,
+    allow_none_response=False,
+    allow_null_trustworthiness_score=False,
+):
     """Property tests the responses of a batch prompt based on the options dictionary and returned responses."""
     assert responses is not None
     assert isinstance(responses, list)
+    assert all(
+        is_tlm_response(
+            response,
+            allow_none_response=allow_none_response,
+            allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+        )
+        for response in responses
+    )
 
     _test_log_batch(responses, options)
 
 
@@ -88,15 +111,23 @@ def test_prompt(tlm_dict: Dict[str, Any], model: str, quality_preset: str) -> No
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
+    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
 
     # test prompt with single prompt
     response = tlm.prompt("What is the capital of France?")
-    _test_prompt_response(response, options)
+    _test_prompt_response(
+        response,
+        options,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
 
     # test prompt with batch prompt
     responses = tlm.prompt(["What is the capital of France?", "What is the capital of Ukraine?"])
-    assert all(is_tlm_response(response) for response in responses)
-    _test_batch_prompt_response(responses, options)
+    _test_batch_prompt_response(
+        responses,
+        options,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
 
 
 @pytest.mark.parametrize("model", valid_tlm_models)
@@ -106,10 +137,13 @@ def test_prompt_async(tlm_dict: Dict[str, Any], model: str, quality_preset: str)
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
+    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
 
     # test prompt with single prompt
     response = asyncio.run(_run_prompt_async(tlm, "What is the capital of France?"))
-    _test_prompt_response(response, options)
+    _test_prompt_response(
+        response, options, allow_null_trustworthiness_score=allow_null_trustworthiness_score
+    )
 
     # test prompt with batch prompt
     responses = asyncio.run(
@@ -117,8 +151,11 @@ def test_prompt_async(tlm_dict: Dict[str, Any], model: str, quality_preset: str)
             tlm, ["What is the capital of France?", "What is the capital of Ukraine?"]
         )
     )
-    assert all(is_tlm_response(r) for r in responses)
-    _test_batch_prompt_response(responses, options)
+    _test_batch_prompt_response(
+        responses,
+        options,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
 
 
 @pytest.mark.parametrize("model", valid_tlm_models)
@@ -128,13 +165,18 @@ def test_try_prompt(tlm_dict: Dict[str, Any], model: str, quality_preset: str) -
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
+    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
 
     # test prompt with batch prompt
     responses = tlm.try_prompt(
         ["What is the capital of France?", "What is the capital of Ukraine?"]
     )
-    assert all(response is None or is_tlm_response(response) for response in responses)
-    _test_batch_prompt_response(responses, options)
+    _test_batch_prompt_response(
+        responses,
+        options,
+        allow_none_response=True,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
 
 
 @pytest.mark.parametrize("model", valid_tlm_models)
From 9b9ab2f56e92625829a4d22fa5d37e018bc76d64 Mon Sep 17 00:00:00 2001
From: huiwengoh <45724323+huiwengoh@users.noreply.github.com>
Date: Thu, 12 Sep 2024 23:28:24 -0400
Subject: [PATCH 2/2] only on base preset

---
 tests/tlm/test_properties.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/tlm/test_properties.py b/tests/tlm/test_properties.py
index 20e5b98a..4a660f63 100644
--- a/tests/tlm/test_properties.py
+++ b/tests/tlm/test_properties.py
@@ -111,7 +111,9 @@ def test_prompt(tlm_dict: Dict[str, Any], model: str, quality_preset: str) -> No
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
-    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
+    allow_null_trustworthiness_score = (
+        quality_preset == "base" and model in models_with_no_perplexity_score
+    )
 
     # test prompt with single prompt
     response = tlm.prompt("What is the capital of France?")
@@ -137,7 +139,9 @@ def test_prompt_async(tlm_dict: Dict[str, Any], model: str, quality_preset: str)
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
-    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
+    allow_null_trustworthiness_score = (
+        quality_preset == "base" and model in models_with_no_perplexity_score
+    )
 
     # test prompt with single prompt
     response = asyncio.run(_run_prompt_async(tlm, "What is the capital of France?"))
@@ -165,7 +169,9 @@ def test_try_prompt(tlm_dict: Dict[str, Any], model: str, quality_preset: str) -
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
-    allow_null_trustworthiness_score = model in models_with_no_perplexity_score
+    allow_null_trustworthiness_score = (
+        quality_preset == "base" and model in models_with_no_perplexity_score
+    )
 
     # test prompt with batch prompt
     responses = tlm.try_prompt(
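
Reviewer note: taken together, the two patches make null handling strictly opt-in. Below is a minimal standalone sketch of the resulting validator behavior, assuming both patches are applied; the helper body mirrors patch 1, while the sample payloads are hypothetical illustrations rather than fixtures from the test suite.

from typing import Any

def is_tlm_response(
    response: Any,
    allow_none_response: bool = False,
    allow_null_trustworthiness_score: bool = False,
) -> bool:
    # None responses pass only when explicitly allowed (the try_prompt case)
    if response is None:
        return allow_none_response
    if (
        isinstance(response, dict)
        and "response" in response
        and "trustworthiness_score" in response
    ):
        trustworthiness_score = response["trustworthiness_score"]
        # null scores pass only when allowed (base preset, models with no perplexity score)
        if trustworthiness_score is None:
            return allow_null_trustworthiness_score
        return isinstance(trustworthiness_score, float) and 0.0 <= trustworthiness_score <= 1.0
    return False

# hypothetical payloads exercising each branch
assert is_tlm_response({"response": "Paris", "trustworthiness_score": 0.93})
assert not is_tlm_response({"response": "Paris", "trustworthiness_score": None})
assert is_tlm_response(
    {"response": "Paris", "trustworthiness_score": None},
    allow_null_trustworthiness_score=True,
)
assert not is_tlm_response(None)
assert is_tlm_response(None, allow_none_response=True)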