diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md index b8fc0d8ee0..88f8b49c3a 100644 --- a/docs/user_guide/metrics.md +++ b/docs/user_guide/metrics.md @@ -183,6 +183,15 @@ There are some places where a request would not be considered pending: generally brief, it will not be considered pending from Triton's perspective until Triton core has received the request from the frontend. +#### Load Time Per-Model +The *Model Load Duration* reflects the time to load a model from storage into GPU/CPU in seconds. +``` +# HELP nv_model_load_duration_secs Model load time in seconds +# TYPE nv_model_load_duration_secs gauge +nv_model_load_duration_secs{model="input_all_optional",version="2"} 1.532738387 +nv_model_load_duration_secs{model="input_all_optional",version="1"} 11.68753265 +``` + ### Latencies Starting in 23.04, Triton exposes the ability to choose the types of metrics diff --git a/qa/L0_metrics/general_metrics_test.py b/qa/L0_metrics/general_metrics_test.py new file mode 100644 index 0000000000..f0002def65 --- /dev/null +++ b/qa/L0_metrics/general_metrics_test.py @@ -0,0 +1,178 @@ +# /usr/bin/python +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import re +import time +import unittest + +import requests + +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") +MODEL_LOAD_TIME = "nv_model_load_duration_secs{model=" + + +def get_model_load_times(): + r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics") + r.raise_for_status() + # Initialize an empty dictionary to store the data + model_data = {} + lines = r.text.strip().split("\n") + for line in lines: + # Use regex to extract model name, version, and load time + match = re.match( + r"nv_model_load_duration_secs\{model=\"(.*?)\",version=\"(.*?)\"\} (.*)", + line, + ) + if match: + model_name = match.group(1) + model_version = match.group(2) + load_time = float(match.group(3)) + # Store in dictionary + if model_name not in model_data: + model_data[model_name] = {} + model_data[model_name][model_version] = load_time + return model_data + + +def load_model_explicit(model_name, server_url="http://localhost:8000"): + endpoint = f"{server_url}/v2/repository/models/{model_name}/load" + response = requests.post(endpoint) + + if response.status_code == 200: + print(f"Model '{model_name}' loaded successfully.") + else: + print( + f"Failed to load model 
'{model_name}'. Status code: {response.status_code}"
+        )
+        print("Response:", response.text)
+
+
+def unload_model_explicit(model_name, server_url="http://localhost:8000"):
+    endpoint = f"{server_url}/v2/repository/models/{model_name}/unload"
+    response = requests.post(endpoint)
+
+    if response.status_code == 200:
+        print(f"Model '{model_name}' unloaded successfully.")
+    else:
+        print(
+            f"Failed to unload model '{model_name}'. Status code: {response.status_code}"
+        )
+        print("Response:", response.text)
+
+
+class TestGeneralMetrics(unittest.TestCase):
+    def setUp(self):
+        self.model_name = "libtorch_float32_float32_float32"
+        self.model_name_multiple_versions = "input_all_optional"
+
+    def test_metrics_load_time(self):
+        model_load_times = get_model_load_times()
+        load_time = model_load_times.get(self.model_name, {}).get("1")
+
+        self.assertIsNotNone(load_time, "Model Load time not found")
+
+        dict_size = len(model_load_times)
+        self.assertEqual(dict_size, 1, "Too many model_load_time entries found")
+
+    def test_metrics_load_time_explicit_load(self):
+        model_load_times = get_model_load_times()
+        load_time = model_load_times.get(self.model_name, {}).get("1")
+
+        self.assertIsNotNone(load_time, "Model Load time not found")
+
+        dict_size = len(model_load_times)
+        self.assertEqual(dict_size, 1, "Too many model_load_time entries found")
+
+    def test_metrics_load_time_explicit_unload(self):
+        model_load_times = get_model_load_times()
+        load_time = model_load_times.get(self.model_name, {}).get("1")
+        self.assertIsNone(load_time, "Model Load time found even after unload")
+
+    def test_metrics_load_time_multiple_version_reload(self):
+        # Part 0 check start condition, metric should not be present
+        model_load_times = get_model_load_times()
+        load_time = model_load_times.get(self.model_name, {}).get("1")
+        self.assertIsNone(load_time, "Model Load time found even before model load")
+
+        # Part 1 load multiple versions of the same model and check if slow and fast models 
reflect the metric correctly
+        load_model_explicit(self.model_name_multiple_versions)
+        model_load_times = get_model_load_times()
+        load_time_slow = model_load_times.get(
+            self.model_name_multiple_versions, {}
+        ).get("1")
+        load_time_fast = model_load_times.get(
+            self.model_name_multiple_versions, {}
+        ).get("2")
+        # Fail the test if load_time_slow is less than load_time_fast
+        self.assertGreaterEqual(
+            load_time_slow,
+            load_time_fast,
+            "Slow load time should be greater than or equal to fast load time",
+        )
+        # Fail the test if load_time_slow is less than 10 seconds as manual delay is 10 seconds
+        self.assertGreaterEqual(
+            load_time_slow,
+            10,
+            "Slow load time should be at least 10 seconds, matching the manual load delay",
+        )
+        # Fail the test if load_time_fast is greater than generous 2 seconds
+        self.assertLess(
+            load_time_fast,
+            2,
+            "Model taking too much time to load",
+        )
+
+        # Part 2 load multiple versions AGAIN and compare with prev values expect to be the same
+        # as triton does not actually load the model again. 
+ load_model_explicit(self.model_name_multiple_versions) + model_load_times_new = get_model_load_times() + load_time_slow_new = model_load_times_new.get( + self.model_name_multiple_versions, {} + ).get("1") + load_time_fast_new = model_load_times_new.get( + self.model_name_multiple_versions, {} + ).get("2") + self.assertEqual(load_time_fast_new, load_time_fast) + self.assertEqual(load_time_slow_new, load_time_slow) + + # Part 3 unload the model and expect the metrics to go away as model is not loaded now + unload_model_explicit(self.model_name_multiple_versions) + time.sleep(1) + model_load_times_new = get_model_load_times() + load_time_slow_new = model_load_times_new.get( + self.model_name_multiple_versions, {} + ).get("1") + load_time_fast_new = model_load_times_new.get( + self.model_name_multiple_versions, {} + ).get("2") + self.assertIsNone(load_time_slow_new, "Model Load time found even after unload") + self.assertIsNone(load_time_fast_new, "Model Load time found even after unload") + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh index 76e99e7c48..4ea1971f5c 100755 --- a/qa/L0_metrics/test.sh +++ b/qa/L0_metrics/test.sh @@ -45,7 +45,6 @@ SERVER=${TRITON_DIR}/bin/tritonserver BASE_SERVER_ARGS="--model-repository=${MODELDIR}" SERVER_ARGS="${BASE_SERVER_ARGS}" SERVER_LOG="./inference_server.log" -PYTHON_TEST="metrics_config_test.py" source ../common/util.sh CLIENT_LOG="client.log" @@ -132,12 +131,54 @@ fi kill_server set -e +### General metrics tests + +set +e +CLIENT_PY="./general_metrics_test.py" +CLIENT_LOG="general_metrics_test_client.log" +SERVER_LOG="general_metrics_test_server.log" +SERVER_ARGS="$BASE_SERVER_ARGS --log-verbose=1" +PYTHON_TEST="general_metrics_test.py" +run_and_check_server +# Test 1 for default model control mode (all models loaded at startup) +python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time.report.xml" 
$CLIENT_PY::TestGeneralMetrics::test_metrics_load_time >> $CLIENT_LOG 2>&1 +kill_server + +set +e +CLIENT_PY="./general_metrics_test.py" +CLIENT_LOG="general_metrics_test_client.log" +SERVER_LOG="general_metrics_test_server.log" +SERVER_ARGS="$BASE_SERVER_ARGS --model-control-mode=explicit --log-verbose=1" +run_and_check_server +MODEL_NAME='libtorch_float32_float32_float32' +code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/load` +# Test 2 for explicit mode LOAD +python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_load.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_load >> $CLIENT_LOG 2>&1 + +code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/unload` +# Test 3 for explicit mode UNLOAD +python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_unload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_unload >> $CLIENT_LOG 2>&1 +kill_server + +# Test 4 for explicit mode LOAD and UNLOAD with multiple versions +set +e +CLIENT_PY="./general_metrics_test.py" +CLIENT_LOG="general_metrics_test_client.log" +SERVER_LOG="general_metrics_test_server.log" +VERSION_DIR="${PWD}/version_models" +SERVER_ARGS="$BASE_SERVER_ARGS --model-repository=${VERSION_DIR} --model-control-mode=explicit --log-verbose=1" +run_and_check_server +python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_multiple_version_reload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_multiple_version_reload >> $CLIENT_LOG 2>&1 + +kill_server + ### Pinned memory metrics tests set +e CLIENT_PY="./pinned_memory_metrics_test.py" CLIENT_LOG="pinned_memory_metrics_test_client.log" SERVER_LOG="pinned_memory_metrics_test_server.log" SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1 --model-control-mode=explicit --log-verbose=1" +PYTHON_TEST="metrics_config_test.py" 
run_and_check_server python3 ${PYTHON_TEST} MetricsConfigTest.test_pinned_memory_metrics_exist -v 2>&1 | tee ${CLIENT_LOG} check_unit_test diff --git a/qa/L0_metrics/version_models/input_all_optional/1/model.py b/qa/L0_metrics/version_models/input_all_optional/1/model.py new file mode 100644 index 0000000000..86cd368fe0 --- /dev/null +++ b/qa/L0_metrics/version_models/input_all_optional/1/model.py @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import time + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + time.sleep(10) + self.model_config = json.loads(args["model_config"]) + + def execute(self, requests): + """This function is called on inference request.""" + + responses = [] + for _ in requests: + # Include one of each specially parsed JSON value: nan, inf, and -inf + out_0 = np.array([1], dtype=np.float32) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0) + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + + return responses diff --git a/qa/L0_metrics/version_models/input_all_optional/2/model.py b/qa/L0_metrics/version_models/input_all_optional/2/model.py new file mode 100644 index 0000000000..40f8b25579 --- /dev/null +++ b/qa/L0_metrics/version_models/input_all_optional/2/model.py @@ -0,0 +1,47 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = json.loads(args["model_config"]) + + def execute(self, requests): + """This function is called on inference request.""" + + responses = [] + for _ in requests: + # Include one of each specially parsed JSON value: nan, inf, and -inf + out_0 = np.array([1], dtype=np.float32) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0) + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + + return responses diff --git a/qa/L0_metrics/version_models/input_all_optional/config.pbtxt b/qa/L0_metrics/version_models/input_all_optional/config.pbtxt new file mode 100644 index 0000000000..e3653342b4 --- /dev/null +++ b/qa/L0_metrics/version_models/input_all_optional/config.pbtxt @@ -0,0 +1,59 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "input_all_optional" +backend: "python" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] +version_policy: { all { }}