[TLM CI] Add more configurable args to unittests #319

Merged · 2 commits · Sep 13, 2024
31 changes: 26 additions & 5 deletions tests/tlm/test_prompt.py
@@ -6,15 +6,35 @@
 from cleanlab_studio.studio.trustworthy_language_model import TLM
 
 
-def is_tlm_response(response: Any) -> bool:
-    """Returns True if the response is a TLMResponse."""
+def is_tlm_response(
+    response: Any,
+    allow_none_response: bool = False,
+    allow_null_trustworthiness_score: bool = False,
+) -> bool:
+    """Returns True if the response is a TLMResponse.
+
+    Args:
+        allow_none_response: If True, allows the response to be None (only allowed for try_prompt)
+        allow_null_trustworthiness_score: If True, allows the trustworthiness_score to be None
+            (only allowed for base preset for models with no perplexity score)
+    """
+    # check if response is allowed to be none
+    if response is None:
+        return allow_none_response
+
     if (
         isinstance(response, dict)
         and "response" in response
         and "trustworthiness_score" in response
-        and isinstance(response["trustworthiness_score"], float)
     ):
-        return 0.0 <= response["trustworthiness_score"] <= 1.0
+        trustworthiness_score = response["trustworthiness_score"]
+
+        # check if trustworthiness score is allowed to be none
+        if trustworthiness_score is None:
+            return allow_null_trustworthiness_score
+
+        return isinstance(trustworthiness_score, float) and 0.0 <= trustworthiness_score <= 1.0
 
     return False


@@ -26,6 +46,7 @@ def test_single_prompt(tlm: TLM) -> None:
     - Response should be non-None
     - No exceptions are raised
     """
+
     # act -- run a single prompt
     response = tlm.prompt("What is the capital of France?")

@@ -93,7 +114,7 @@ def test_batch_try_prompt(tlm: TLM) -> None:
     # - no exceptions are raised (implicit)
     assert response is not None
     assert isinstance(response, list)
-    assert all(r is None or is_tlm_response(r) for r in response)
+    assert all(is_tlm_response(r, allow_none_response=True) for r in response)


def test_batch_try_prompt_force_timeouts(tlm: TLM) -> None:
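For context (not part of the diff): a minimal usage sketch of how the two new flags change what the validator accepts, assuming is_tlm_response is importable from the test module above.

from tests.tlm.test_prompt import is_tlm_response  # assumed import path

ok = {"response": "Paris", "trustworthiness_score": 0.93}
null_score = {"response": "Paris", "trustworthiness_score": None}

assert is_tlm_response(ok)                              # well-formed response passes
assert not is_tlm_response(None)                        # None is rejected by default
assert is_tlm_response(None, allow_none_response=True)  # ...but tolerated for try_prompt results
assert not is_tlm_response(null_score)                  # null score is rejected by default
assert is_tlm_response(null_score, allow_null_trustworthiness_score=True)  # ...but tolerated for base preset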
70 changes: 59 additions & 11 deletions tests/tlm/test_properties.py
@@ -13,6 +13,7 @@
 
 excluded_tlm_models = ["claude-3-sonnet", "claude-3.5-sonnet"]
 valid_tlm_models = [model for model in _VALID_TLM_MODELS if model not in excluded_tlm_models]
+models_with_no_perplexity_score = ["claude-3-haiku", "claude-3-sonnet", "claude-3.5-sonnet"]
 
 
 def _test_log(response: Dict[str, Any], options: Dict[str, Any]) -> None:
@@ -35,17 +36,39 @@ def _test_log_batch(responses: Dict[str, Any], options: Dict[str, Any]) -> None:
         _test_log(response, options)
 
 
-def _test_prompt_response(response, options):
+def _test_prompt_response(
+    response,
+    options,
+    allow_none_response=False,
+    allow_null_trustworthiness_score=False,
+):
     """Property tests the responses of a prompt based on the options dictionary and returned responses."""
     assert response is not None
-    assert is_tlm_response(response)
+    assert is_tlm_response(
+        response,
+        allow_none_response=allow_none_response,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
     _test_log(response, options)
 
 
-def _test_batch_prompt_response(responses, options):
+def _test_batch_prompt_response(
+    responses,
+    options,
+    allow_none_response=False,
+    allow_null_trustworthiness_score=False,
+):
     """Property tests the responses of a batch prompt based on the options dictionary and returned responses."""
     assert responses is not None
     assert isinstance(responses, list)
+    assert all(
+        is_tlm_response(
+            response,
+            allow_none_response=allow_none_response,
+            allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+        )
+        for response in responses
+    )
     _test_log_batch(responses, options)


@@ -88,15 +111,25 @@ def test_prompt(tlm_dict: Dict[str, Any], model: str, quality_preset: str) -> None:
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
+    allow_null_trustworthiness_score = (
+        quality_preset == "base" and model in models_with_no_perplexity_score
+    )
 
     # test prompt with single prompt
     response = tlm.prompt("What is the capital of France?")
-    _test_prompt_response(response, options)
+    _test_prompt_response(
+        response,
+        options,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )
 
     # test prompt with batch prompt
     responses = tlm.prompt(["What is the capital of France?", "What is the capital of Ukraine?"])
-    assert all(is_tlm_response(response) for response in responses)
-    _test_batch_prompt_response(responses, options)
+    _test_batch_prompt_response(
+        responses,
+        options,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )


@pytest.mark.parametrize("model", valid_tlm_models)
@@ -106,19 +139,27 @@ def test_prompt_async(tlm_dict: Dict[str, Any], model: str, quality_preset: str) -> None:
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
     options = tlm_dict[quality_preset][model]["options"]
+    allow_null_trustworthiness_score = (
+        quality_preset == "base" and model in models_with_no_perplexity_score
+    )
 
     # test prompt with single prompt
     response = asyncio.run(_run_prompt_async(tlm, "What is the capital of France?"))
-    _test_prompt_response(response, options)
+    _test_prompt_response(
+        response, options, allow_null_trustworthiness_score=allow_null_trustworthiness_score
+    )
 
     # test prompt with batch prompt
     responses = asyncio.run(
         _run_prompt_async(
             tlm, ["What is the capital of France?", "What is the capital of Ukraine?"]
         )
     )
-    assert all(is_tlm_response(r) for r in responses)
-    _test_batch_prompt_response(responses, options)
+    _test_batch_prompt_response(
+        responses,
+        options,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )


@pytest.mark.parametrize("model", valid_tlm_models)
@@ -128,13 +169,20 @@ def test_try_prompt(tlm_dict: Dict[str, Any], model: str, quality_preset: str) -> None:
     # get TLM and options dictionary based on parameters
     tlm = tlm_dict[quality_preset][model]["tlm"]
    options = tlm_dict[quality_preset][model]["options"]
+    allow_null_trustworthiness_score = (
+        quality_preset == "base" and model in models_with_no_perplexity_score
+    )
 
     # test prompt with batch prompt
     responses = tlm.try_prompt(
         ["What is the capital of France?", "What is the capital of Ukraine?"]
     )
-    assert all(response is None or is_tlm_response(response) for response in responses)
-    _test_batch_prompt_response(responses, options)
+    _test_batch_prompt_response(
+        responses,
+        options,
+        allow_none_response=True,
+        allow_null_trustworthiness_score=allow_null_trustworthiness_score,
+    )


@pytest.mark.parametrize("model", valid_tlm_models)
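Taken together, this file threads one rule through every test: a null trustworthiness score is tolerated only for the base preset on models that return no perplexity score, and None responses only for try_prompt. A minimal sketch of that rule in isolation (the preset and model names not in the diff are hypothetical):

# mirrors the per-test computation added in test_properties.py
models_with_no_perplexity_score = ["claude-3-haiku", "claude-3-sonnet", "claude-3.5-sonnet"]

def allows_null_trustworthiness_score(quality_preset: str, model: str) -> bool:
    # null scores are acceptable only for the base preset on no-perplexity models
    return quality_preset == "base" and model in models_with_no_perplexity_score

assert allows_null_trustworthiness_score("base", "claude-3-haiku")
assert not allows_null_trustworthiness_score("base", "gpt-4o")           # hypothetical model name
assert not allows_null_trustworthiness_score("best", "claude-3-haiku")   # hypothetical preset name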