From e1816629dd8bc1b05748450739420639543bcb11 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Wed, 31 Jul 2024 12:52:56 -0700 Subject: [PATCH 01/44] ci: Return custom exit code to indicate known shm leak failure in L0_backend_python bls test (#7485) --- .../argument_validation/test.sh | 2 +- qa/L0_backend_python/bls/test.sh | 41 +++++++++++-------- qa/L0_backend_python/custom_metrics/test.sh | 2 +- .../request_rescheduling/test.sh | 2 +- .../setup_python_enviroment.sh | 2 +- qa/L0_backend_python/test.sh | 29 +++++++++++-- ...hon_unittest.py => test_infer_shm_leak.py} | 31 ++++++++------ qa/L0_dlpack_multi_gpu/test.sh | 6 +-- qa/L0_warmup/test.sh | 6 +-- qa/common/shm_util.py | 5 ++- 10 files changed, 81 insertions(+), 45 deletions(-) rename qa/L0_backend_python/{python_unittest.py => test_infer_shm_leak.py} (75%) diff --git a/qa/L0_backend_python/argument_validation/test.sh b/qa/L0_backend_python/argument_validation/test.sh index b14ba4abb3..90cbef89b5 100755 --- a/qa/L0_backend_python/argument_validation/test.sh +++ b/qa/L0_backend_python/argument_validation/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./arg_validation_client.log" TEST_RESULT_FILE='test_results.txt' SERVER_ARGS="--model-repository=${MODELDIR}/argument_validation/models --backend-directory=${BACKEND_DIR} --log-verbose=1" diff --git a/qa/L0_backend_python/bls/test.sh b/qa/L0_backend_python/bls/test.sh index 204af7e2ba..46d1f40818 100755 --- a/qa/L0_backend_python/bls/test.sh +++ b/qa/L0_backend_python/bls/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./bls_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh @@ -33,7 +33,7 @@ source ../../common/util.sh TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:=http://github.com/triton-inference-server} RET=0 -rm -fr *.log ./models *.txt +rm -fr *.log ./models *.txt *.xml # FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU if [[ ${TEST_WINDOWS} == 0 ]]; then @@ -119,30 +119,35 @@ if [[ ${TEST_WINDOWS} == 0 ]]; then for MODEL_NAME in bls bls_memory bls_memory_async bls_async; do export MODEL_NAME=${MODEL_NAME} - - python3 -m pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then + # Run with pytest to capture the return code correctly + pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 + EXIT_CODE=$? + if [ $EXIT_CODE -ne 0 ]; then echo -e "\n***\n*** ${MODEL_NAME} ${BLS_KIND} test FAILED. \n***" + RET=$EXIT_CODE cat $SERVER_LOG cat $CLIENT_LOG - RET=1 fi done - set -e - kill_server - # Check for bls 'test_timeout' to ensure timeout value is being correctly passed - if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then - echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***" - cat $SERVER_LOG - RET=1 + set -e + + # Only check the timeout value if there is no error since the test + # may fail before the test_timeout case gets run. 
+ if [ $RET -eq 0 ]; then + # Check for bls 'test_timeout' to ensure timeout value is being correctly passed + if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***" + cat $SERVER_LOG + RET=1 + fi fi - if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 128 ]]; then + if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 256 ]]; then if [ `grep -c "Failed to allocate memory from CUDA memory pool" $SERVER_LOG` != "0" ]; then - echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMOY_POOL_SIZE_MB is 128 MB for 'bls' $BLS_KIND test\n***" + echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMORY_POOL_SIZE_MB is 256 MB for 'bls' $BLS_KIND test\n***" cat $SERVER_LOG RET=1 fi @@ -342,10 +347,10 @@ set -e kill_server -if [ $RET -eq 1 ]; then - echo -e "\n***\n*** BLS test FAILED. \n***" -else +if [ $RET -eq 0 ]; then echo -e "\n***\n*** BLS test PASSED. \n***" +else + echo -e "\n***\n*** BLS test FAILED. \n***" fi exit $RET diff --git a/qa/L0_backend_python/custom_metrics/test.sh b/qa/L0_backend_python/custom_metrics/test.sh index 4491d9e030..9020c7ebfd 100755 --- a/qa/L0_backend_python/custom_metrics/test.sh +++ b/qa/L0_backend_python/custom_metrics/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./custom_metrics_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh diff --git a/qa/L0_backend_python/request_rescheduling/test.sh b/qa/L0_backend_python/request_rescheduling/test.sh index 6fd6fe09e5..31ba6692d9 100755 --- a/qa/L0_backend_python/request_rescheduling/test.sh +++ b/qa/L0_backend_python/request_rescheduling/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY="../python_unittest.py" +CLIENT_PY="../test_infer_shm_leak.py" CLIENT_LOG="./request_rescheduling_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh diff --git a/qa/L0_backend_python/setup_python_enviroment.sh b/qa/L0_backend_python/setup_python_enviroment.sh index 88baccc4f6..a2171e02da 100755 --- a/qa/L0_backend_python/setup_python_enviroment.sh +++ b/qa/L0_backend_python/setup_python_enviroment.sh @@ -151,7 +151,7 @@ apt-get update && apt-get -y install \ libboost-dev rm -f /usr/bin/python3 && \ ln -s "/usr/bin/python3.${PYTHON_ENV_VERSION}" /usr/bin/python3 -pip3 install --upgrade install requests numpy virtualenv protobuf +pip3 install --upgrade requests numpy virtualenv protobuf find /opt/tritonserver/qa/pkgs/ -maxdepth 1 -type f -name \ "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \ xargs pip3 install --upgrade diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh index 65767419f2..f6d4b7b445 100755 --- a/qa/L0_backend_python/test.sh +++ b/qa/L0_backend_python/test.sh @@ -425,11 +425,20 @@ if [ "$TEST_JETSON" == "0" ]; then # between dependencies. setup_virtualenv + set +e (cd ${TEST} && bash -ex test.sh) - if [ $? -ne 0 ]; then + EXIT_CODE=$? + if [ $EXIT_CODE -ne 0 ]; then echo "Subtest ${TEST} FAILED" - RET=1 + RET=$EXIT_CODE + + # In bls test, it is allowed to fail with a strict memory leak of 480 bytes with exit code '123'. 
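+            # The custom exit code '123' is ALLOWED_FAILURE_EXIT_CODE, defined in
+            # test_infer_shm_leak.py and returned via pytest.exit() when the shm probe
+            # reports the known 480-byte leak.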
+ # Propagate the exit code to make sure it's not overwritten by other tests. + if [[ ${TEST} == "bls" ]] && [[ $EXIT_CODE -ne 1 ]] ; then + BLS_RET=$RET + fi fi + set -e deactivate_virtualenv done @@ -438,11 +447,13 @@ if [ "$TEST_JETSON" == "0" ]; then if [[ ${PYTHON_ENV_VERSION} = "10" ]] && [[ ${TEST_WINDOWS} == 0 ]]; then # In 'env' test we use miniconda for dependency management. No need to run # the test in a virtual environment. + set +e (cd env && bash -ex test.sh) if [ $? -ne 0 ]; then echo "Subtest env FAILED" RET=1 fi + set -e fi fi @@ -459,12 +470,14 @@ for TEST in ${SUBTESTS}; do # between dependencies. setup_virtualenv + set +e (cd ${TEST} && bash -ex test.sh) if [ $? -ne 0 ]; then echo "Subtest ${TEST} FAILED" RET=1 fi + set -e deactivate_virtualenv done @@ -475,4 +488,14 @@ else echo -e "\n***\n*** Test FAILED\n***" fi -exit $RET +# Exit with RET if it is 1, meaning that the test failed. +# Otherwise, exit with BLS_RET if it is set, meaning that the known memory leak is captured. +if [ $RET -eq 1 ]; then + exit $RET +else + if [ -z "$BLS_RET" ]; then + exit $RET + else + exit $BLS_RET + fi +fi diff --git a/qa/L0_backend_python/python_unittest.py b/qa/L0_backend_python/test_infer_shm_leak.py similarity index 75% rename from qa/L0_backend_python/python_unittest.py rename to qa/L0_backend_python/test_infer_shm_leak.py index 4b94996976..966243e86e 100755 --- a/qa/L0_backend_python/python_unittest.py +++ b/qa/L0_backend_python/test_infer_shm_leak.py @@ -33,6 +33,7 @@ import os import unittest +import pytest import shm_util import tritonclient.grpc as grpcclient from tritonclient.utils import * @@ -41,11 +42,13 @@ # we overwrite the IP address with the TRITONSERVER_IPADDR envvar _tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") +# The exit code 123 is used to indicate that the shm leak probe detected a 480 +# bytes leak in the bls sub-test. Any leak other than 480 bytes will cause the +# test to fail with the default exit code 1. +ALLOWED_FAILURE_EXIT_CODE = 123 -class PythonUnittest(unittest.TestCase): - def setUp(self): - self._shm_leak_detector = shm_util.ShmLeakDetector() +class TestInferShmLeak: def _run_unittest(self, model_name): with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client: # No input is required @@ -54,15 +57,17 @@ def _run_unittest(self, model_name): # The model returns 1 if the tests were successfully passed. # Otherwise, it will return 0. 
- self.assertEqual( - output0, [1], f"python_unittest failed for model {model_name}" - ) - - def test_python_unittest(self): - model_name = os.environ["MODEL_NAME"] - with self._shm_leak_detector.Probe() as shm_probe: - self._run_unittest(model_name) + assert output0 == [1], f"python_unittest failed for model {model_name}" + def test_shm_leak(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + model_name = os.environ.get("MODEL_NAME", "default_model") -if __name__ == "__main__": - unittest.main() + try: + with self._shm_leak_detector.Probe() as shm_probe: + self._run_unittest(model_name) + except AssertionError as e: + if "Known shared memory leak of 480 bytes detected" in str(e): + pytest.exit(str(e), returncode=ALLOWED_FAILURE_EXIT_CODE) + else: + raise e diff --git a/qa/L0_dlpack_multi_gpu/test.sh b/qa/L0_dlpack_multi_gpu/test.sh index 996f062f42..ae72daa7d0 100755 --- a/qa/L0_dlpack_multi_gpu/test.sh +++ b/qa/L0_dlpack_multi_gpu/test.sh @@ -27,7 +27,7 @@ SERVER=/opt/tritonserver/bin/tritonserver SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" -CLIENT_PY=./python_unittest.py +CLIENT_PY=./test_infer_shm_leak.py CLIENT_LOG="./client.log" EXPECTED_NUM_TESTS="1" TEST_RESULT_FILE='test_results.txt' @@ -52,8 +52,8 @@ rm -fr *.log ./models mkdir -p models/dlpack_test/1/ cp ../python_models/dlpack_test/model.py models/dlpack_test/1/ cp ../python_models/dlpack_test/config.pbtxt models/dlpack_test -cp ../L0_backend_python/python_unittest.py . -sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py +cp ../L0_backend_python/test_infer_shm_leak.py . +sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py run_server if [ "$SERVER_PID" == "0" ]; then diff --git a/qa/L0_warmup/test.sh b/qa/L0_warmup/test.sh index aeed873b25..a535aed25b 100755 --- a/qa/L0_warmup/test.sh +++ b/qa/L0_warmup/test.sh @@ -42,7 +42,7 @@ export CUDA_VISIBLE_DEVICES=0 CLIENT=../clients/image_client CLIENT_LOG="./client.log" -CLIENT_PY=./python_unittest.py +CLIENT_PY=./test_infer_shm_leak.py EXPECTED_NUM_TESTS="1" TEST_RESULT_FILE='test_results.txt' @@ -449,8 +449,8 @@ mkdir -p models/bls_onnx_warmup/1/ cp ../python_models/bls_onnx_warmup/model.py models/bls_onnx_warmup/1/ cp ../python_models/bls_onnx_warmup/config.pbtxt models/bls_onnx_warmup/. -cp ../L0_backend_python/python_unittest.py . -sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py +cp ../L0_backend_python/test_infer_shm_leak.py . +sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py run_server if [ "$SERVER_PID" == "0" ]; then diff --git a/qa/common/shm_util.py b/qa/common/shm_util.py index 16e5ce4e45..0e533bcdbb 100755 --- a/qa/common/shm_util.py +++ b/qa/common/shm_util.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -441,6 +441,9 @@ def __exit__(self, type, value, traceback): print( f"Shared memory leak detected [{shm_region}]: {curr_shm_free_size} (curr free) < {prev_shm_free_size} (prev free)." ) + # FIXME DLIS-7122: Known shared memory leak of 480 bytes in BLS test. 
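+                # 1007056 (prev free) - 1006576 (curr free) == 480 bytes, i.e. exactly
+                # the size of the known leak tracked by DLIS-7122.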
+ if curr_shm_free_size == 1006576 and prev_shm_free_size == 1007056: + assert False, f"Known shared memory leak of 480 bytes detected." assert not shm_leak_detected, f"Shared memory leak detected." def _get_shm_free_sizes(self, delay_sec=0): From dc90a5260ddad1cef54b6a2523533209bbe373e1 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Fri, 2 Aug 2024 08:19:21 -0700 Subject: [PATCH 02/44] Including 'tritonserver.lib' into final package (#7491) --- build.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/build.py b/build.py index 6ab8a58515..2c95cbded3 100755 --- a/build.py +++ b/build.py @@ -1647,6 +1647,10 @@ def core_build( os.path.join(repo_install_dir, "bin", "tritonserver.dll"), os.path.join(install_dir, "bin"), ) + cmake_script.cp( + os.path.join(repo_install_dir, "lib", "tritonserver.lib"), + os.path.join(install_dir, "bin"), + ) else: cmake_script.mkdir(os.path.join(install_dir, "bin")) cmake_script.cp( From cca12f9ddee928b7ad4b089598e4b7a98132c9f8 Mon Sep 17 00:00:00 2001 From: Alex Zhang Date: Mon, 5 Aug 2024 09:56:38 +0800 Subject: [PATCH 03/44] build: Add default value for argument 'TRITON_REPO_ORGANIZATION' from sdk Dockerfile (#7437) --- Dockerfile.sdk | 1 + docs/customization_guide/build.md | 12 ++++++------ docs/customization_guide/test.md | 13 ++++++++----- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 1524b5ead3..0748277d52 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -33,6 +33,7 @@ ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo +ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server ARG TRITON_COMMON_REPO_TAG=main ARG TRITON_CORE_REPO_TAG=main ARG TRITON_CLIENT_REPO_TAG=main diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index db16b65c6b..0c1cc08a41 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -331,13 +331,13 @@ invocation builds all features and backends available on windows. python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild --no-container-pull --image=base,win10-py3-min --enable-logging --enable-stats --enable-tracing --enable-gpu --endpoint=grpc --endpoint=http --repo-tag=common: --repo-tag=core: --repo-tag=backend: --repo-tag=thirdparty: --backend=ensemble --backend=tensorrt: --backend=onnxruntime: --backend=openvino: ``` -If you are building on *main* branch then '' will +If you are building on *main* branch then `` will default to "main". If you are building on a release branch then -'' will default to the branch name. For example, if you -are building on the r24.07 branch, '' will default to -r24.07. Therefore, you typically do not need to provide '' at all (nor the preceding colon). You can use a different -'' for a component to instead use the corresponding +`` will default to the branch name. For example, if you +are building on the r24.07 branch, `` will default to +r24.07. Therefore, you typically do not need to provide `` at all (nor the preceding colon). You can use a different +`` for a component to instead use the corresponding branch/tag in the build. 
For example, if you have a branch called "mybranch" in the [onnxruntime_backend](https://github.com/triton-inference-server/onnxruntime_backend) diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md index d664a139d3..e066d31493 100644 --- a/docs/customization_guide/test.md +++ b/docs/customization_guide/test.md @@ -48,7 +48,7 @@ $ ./gen_qa_model_repository $ ./gen_qa_custom_ops ``` -This will create multiple model repositories in /tmp//qa_* +This will create multiple model repositories in /tmp/\/qa_* (for example /tmp/24.07/qa_model_repository). The TensorRT models will be created for the GPU on the system that CUDA considers device 0 (zero). If you have multiple GPUs on your system see the documentation @@ -57,14 +57,17 @@ in the scripts for how to target a specific GPU. ## Build SDK Image Build the *tritonserver_sdk* image that contains the client -libraries, model analyzer, and examples using the following -commands. You must first checkout the branch of the -*client* repo into the clientrepo/ subdirectory. Typically you want to -set to be the same as your current server branch. +libraries, model analyzer, perf analyzer and examples using the following +commands. You must first checkout the `` branch of the +*client* repo into the clientrepo/ subdirectory and the `` +branch of the *perf_analyzer* repo into the perfanalyzerrepo/ subdirectory +respectively. Typically you want to set both `` and `` +to be the same as your current server branch. ``` $ cd $ git clone --single-branch --depth=1 -b https://github.com/triton-inference-server/client.git clientrepo +$ git clone --single-branch --depth=1 -b https://github.com/triton-inference-server/perf_analyzer.git perfanalyzerrepo $ docker build -t tritonserver_sdk -f Dockerfile.sdk . ``` From 9ad856c4f67e62226eff40d757e6181ddf97c9a2 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Tue, 6 Aug 2024 16:32:29 -0700 Subject: [PATCH 04/44] chore:Purge PA from Client Repo (#7488) * PA Migration: Update server docs and tests --- README.md | 2 +- deploy/gke-marketplace-app/README.md | 4 ++-- deploy/k8s-onprem/README.md | 4 ++-- docs/README.md | 6 ++--- docs/contents.md | 31 ++++++++++++++++---------- docs/examples/jetson/README.md | 6 ++--- docs/generate_docs.py | 4 ++++ docs/user_guide/debugging_guide.md | 4 ++-- docs/user_guide/faq.md | 4 ++-- docs/user_guide/jetson.md | 2 +- docs/user_guide/model_analyzer.md | 4 ++-- docs/user_guide/model_configuration.md | 2 +- docs/user_guide/optimization.md | 4 ++-- docs/user_guide/perf_analyzer.md | 4 ++-- docs/user_guide/performance_tuning.md | 4 ++-- qa/L0_perf_analyzer_doc_links/test.sh | 10 ++++----- 16 files changed, 53 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 17628b4f03..2200886a20 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ configuration](docs/user_guide/model_configuration.md) for the model. [Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md) to learn which backends are supported on your target platform. 
- Learn how to [optimize performance](docs/user_guide/optimization.md) using the - [Performance Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) and [Model Analyzer](https://github.com/triton-inference-server/model_analyzer) - Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in diff --git a/deploy/gke-marketplace-app/README.md b/deploy/gke-marketplace-app/README.md index e99b9efbae..595d4634ab 100644 --- a/deploy/gke-marketplace-app/README.md +++ b/deploy/gke-marketplace-app/README.md @@ -1,5 +1,5 @@ Perf Analyzer documentation has been relocated to -[here](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md). +[here](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md). diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index 49cad9e637..446534da99 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -73,7 +73,7 @@ For additional material, see the verify that we can run inference requests and get a baseline performance benchmark of your model. Triton's - [Perf Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) tool specifically fits this purpose. Here is a simplified output for demonstration purposes: @@ -103,7 +103,7 @@ For additional material, see the There are many variables that can be tweaked just within your model configuration (`config.pbtxt`) to obtain different results. - As your model, config, or use case evolves, - [Perf Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) is a great tool to quickly verify model functionality and performance. 3. How can I improve my model performance? diff --git a/qa/L0_perf_analyzer_doc_links/test.sh b/qa/L0_perf_analyzer_doc_links/test.sh index db80e84974..d0757bca9e 100755 --- a/qa/L0_perf_analyzer_doc_links/test.sh +++ b/qa/L0_perf_analyzer_doc_links/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -35,10 +35,10 @@ python3 -m pip install mkdocs-htmlproofer-plugin==0.10.3 #Download perf_analyzer docs TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} -TRITON_CLIENT_REPO_TAG="${TRITON_CLIENT_REPO_TAG:=main}" -git clone -b ${TRITON_CLIENT_REPO_TAG} ${TRITON_REPO_ORGANIZATION}/client.git -cp `pwd`/client/src/c++/perf_analyzer/README.md . -cp -rf `pwd`/client/src/c++/perf_analyzer/docs . +TRITON_PERF_ANALYZER_REPO_TAG="${TRITON_PERF_ANALYZER_REPO_TAG:=main}" +git clone -b ${TRITON_PERF_ANALYZER_REPO_TAG} ${TRITON_REPO_ORGANIZATION}/perf_analyzer.git +cp `pwd`/perf_analyzer/README.md . +cp -rf `pwd`/perf_analyzer/docs . # Need to remove all links that start with -- or -. Mkdocs converts all -- to - for anchor links. # This breaks all links to cli commands throughout the docs. 
This will iterate over all From a4285ff0d68643bb4c959e5cb7287de427d006d5 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Wed, 7 Aug 2024 10:01:11 -0700 Subject: [PATCH 05/44] PA Migration: Update L0_client_build_variants (#7505) * PA Migration: Update L0_client_build_variants --- Dockerfile.sdk | 1 + qa/L0_client_build_variants/test.sh | 84 ++++++----------------------- 2 files changed, 16 insertions(+), 69 deletions(-) diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 0748277d52..7897c2a215 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -218,6 +218,7 @@ WORKDIR /workspace COPY TRITON_VERSION . COPY NVIDIA_Deep_Learning_Container_License.pdf . COPY --from=sdk_build /workspace/client/ client/ +COPY --from=sdk_build /workspace/perf_analyzer/ perf_analyzer/ COPY --from=sdk_build /workspace/install/ install/ RUN cd install && \ export VERSION=`cat /workspace/TRITON_VERSION` && \ diff --git a/qa/L0_client_build_variants/test.sh b/qa/L0_client_build_variants/test.sh index c31c55e310..9dc1c4c85d 100755 --- a/qa/L0_client_build_variants/test.sh +++ b/qa/L0_client_build_variants/test.sh @@ -58,10 +58,6 @@ TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-i -DTRITON_ENABLE_PYTHON_HTTP=ON \ -DTRITON_ENABLE_PYTHON_GRPC=ON \ -DTRITON_ENABLE_JAVA_HTTP=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=OFF \ @@ -90,10 +86,6 @@ fi -DTRITON_ENABLE_CC_GRPC=ON \ -DTRITON_ENABLE_PYTHON_HTTP=OFF \ -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ @@ -121,10 +113,6 @@ fi -DTRITON_ENABLE_CC_GRPC=OFF \ -DTRITON_ENABLE_PYTHON_HTTP=ON \ -DTRITON_ENABLE_PYTHON_GRPC=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ @@ -141,59 +129,27 @@ else exit 1 fi -# -# Build without Perf Analyzer -# -(cd /workspace/build && \ - rm -fr cc-clients python-clients && \ - cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ - -DTRITON_ENABLE_CC_HTTP=ON \ - -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ - -DTRITON_ENABLE_GPU=ON \ - -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ - -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ - -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) -if [ $? 
-eq 0 ]; then - echo -e "\n***\n*** No-Perf-Analyzer Passed\n***" -else - echo -e "\n***\n*** No-Perf-Analyzer FAILED\n***" - exit 1 -fi - +# TODO: TPRD-342 These tests should be PA CI test +# cases not Triton test cases +rm -fr /workspace/build +mkdir -p /workspace/build # # Build without C API in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=OFF \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? -eq 0 ]; then echo -e "\n***\n*** No-CAPI Passed\n***" else @@ -205,25 +161,20 @@ fi # Build without TensorFlow Serving in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ + rm -fr cc_clients perf_analyzer && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? -eq 0 ]; then echo -e "\n***\n*** No-TF-Serving Passed\n***" else @@ -235,25 +186,20 @@ fi # Build without TorchServe in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ + rm -fr cc_clients perf_analyzer && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? 
-eq 0 ]; then echo -e "\n***\n*** No-TorchServe Passed\n***" else From 6636fc9fc007e913566d562cefe0b882cd568dd3 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:58:18 -0700 Subject: [PATCH 06/44] test: Add test for sending response after sending complete final flag (#7504) --- .../response_sender_complete_final_test.py | 77 +++++++++++++++++++ qa/L0_backend_python/response_sender/test.sh | 31 ++++++++ .../config.pbtxt | 47 +++++++++++ .../response_sender_complete_final/model.py | 63 +++++++++++++++ 4 files changed, 218 insertions(+) create mode 100644 qa/L0_backend_python/response_sender/response_sender_complete_final_test.py create mode 100644 qa/python_models/response_sender_complete_final/config.pbtxt create mode 100644 qa/python_models/response_sender_complete_final/model.py diff --git a/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py b/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py new file mode 100644 index 0000000000..386a54e3d3 --- /dev/null +++ b/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py @@ -0,0 +1,77 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import time +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient + + +class ResponseSenderTest(unittest.TestCase): + def _generate_streaming_callback_and_responses_pair(self): + responses = [] # [{"result": result, "error": error}, ...] 
+ + def callback(result, error): + responses.append({"result": result, "error": error}) + + return callback, responses + + def test_respond_after_complete_final(self): + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Test Passed", server_log) + + model_name = "response_sender_complete_final" + shape = [1, 1] + inputs = [grpcclient.InferInput("INPUT0", shape, "FP32")] + input0_np = np.array([[123.45]], np.float32) + inputs[0].set_data_from_numpy(input0_np) + + callback, responses = self._generate_streaming_callback_and_responses_pair() + with grpcclient.InferenceServerClient("localhost:8001") as client: + client.start_stream(callback) + client.async_stream_infer(model_name, inputs) + client.stop_stream() + + self.assertEqual(len(responses), 1) + for response in responses: + output0_np = response["result"].as_numpy(name="OUTPUT0") + self.assertTrue(np.allclose(input0_np, output0_np)) + self.assertIsNone(response["error"]) + + time.sleep(1) # make sure the logs are written before checking + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Unexpected request length", server_log) + self.assertNotIn("Expected exception not raised", server_log) + self.assertNotIn("Test FAILED", server_log) + self.assertIn("Test Passed", server_log) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/response_sender/test.sh b/qa/L0_backend_python/response_sender/test.sh index 33db46edbb..cca7e7acfa 100755 --- a/qa/L0_backend_python/response_sender/test.sh +++ b/qa/L0_backend_python/response_sender/test.sh @@ -97,6 +97,37 @@ set -e kill $SERVER_PID wait $SERVER_PID +# +# Test response sender to raise exception on response after complete final flag +# +rm -rf models && mkdir models +mkdir -p models/response_sender_complete_final/1 && \ + cp ../../python_models/response_sender_complete_final/model.py models/response_sender_complete_final/1 && \ + cp ../../python_models/response_sender_complete_final/config.pbtxt models/response_sender_complete_final + +TEST_LOG="response_sender_complete_final_test.log" +SERVER_LOG="response_sender_complete_final_test.server.log" +SERVER_ARGS="--model-repository=${MODELDIR}/response_sender/models --backend-directory=${BACKEND_DIR} --log-verbose=1" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=concurrency_test.report.xml response_sender_complete_final_test.py > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** response sender complete final test FAILED\n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + # # Test async response sender under decoupled / non-decoupled # diff --git a/qa/python_models/response_sender_complete_final/config.pbtxt b/qa/python_models/response_sender_complete_final/config.pbtxt new file mode 100644 index 0000000000..f08ed6da5b --- /dev/null +++ b/qa/python_models/response_sender_complete_final/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +max_batch_size: 8 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [{ kind: KIND_CPU }] +model_transaction_policy { decoupled: True } diff --git a/qa/python_models/response_sender_complete_final/model.py b/qa/python_models/response_sender_complete_final/model.py new file mode 100644 index 0000000000..e17f0b04f6 --- /dev/null +++ b/qa/python_models/response_sender_complete_final/model.py @@ -0,0 +1,63 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + # Expect exactly one request per execute() call. 
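+        # The accompanying response_sender_complete_final_test.py issues a single
+        # streaming request, so more than one request here indicates a test setup issue.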
+ if len(requests) != 1: + pb_utils.Logger.log_error(f"Unexpected request length: {len(requests)}") + raise Exception("Test FAILED") + + # Send a response with complete final flag, and then send another response and + # and assert an exception is raised, for all requests. + for request in requests: + in_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", in_tensor.as_numpy()) + response = pb_utils.InferenceResponse([out_tensor]) + response_sender = request.get_response_sender() + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + test_passed = False + try: + response_sender.send(response) + except Exception as e: + pb_utils.Logger.log_info(f"Raised exception: {e}") + if ( + str(e) + == "Unable to send response. Response sender has been closed." + ): + test_passed = True + finally: + if not test_passed: + pb_utils.Logger.log_error("Expected exception not raised") + raise Exception("Test FAILED") + pb_utils.Logger.log_info("Test Passed") + return None From f2841016079b0b647f10e85db6a4b9e7b2dda330 Mon Sep 17 00:00:00 2001 From: Harry Kim Date: Thu, 8 Aug 2024 07:15:30 -0700 Subject: [PATCH 07/44] Add vLLM x Triton user meetup announcement (#7509) --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 2200886a20..f9b1a483f3 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,17 @@ # Triton Inference Server +📣 **vLLM x Triton Meetup at Fort Mason on Sept 9th 4:00 - 9:00 pm** + +We are excited to announce that we will be hosting our Triton user meetup with the vLLM team at +[Fort Mason](https://maps.app.goo.gl/9Lr3fxRssrpQCGK58) on Sept 9th 4:00 - 9:00 pm. Join us for this +exclusive event where you will learn about the newest vLLM and Triton features, get a +glimpse into the roadmaps, and connect with fellow users, the NVIDIA Triton and vLLM teams. Seating is limited and registration confirmation +is required to attend - please register [here](https://lu.ma/87q3nvnh) to join +the meetup. 
+ +___ + [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) [!WARNING] From 53200091b84f08a5e4921f5073137784570283e9 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Sat, 10 Aug 2024 21:21:35 +0530 Subject: [PATCH 08/44] Fix benchmarking tests (#7461) --- qa/L0_perf_tensorrt_llm/test.sh | 65 +++++++++------------------------ qa/L0_perf_vllm/test.sh | 2 +- 2 files changed, 19 insertions(+), 48 deletions(-) diff --git a/qa/L0_perf_tensorrt_llm/test.sh b/qa/L0_perf_tensorrt_llm/test.sh index 35d360498d..e74b01e568 100755 --- a/qa/L0_perf_tensorrt_llm/test.sh +++ b/qa/L0_perf_tensorrt_llm/test.sh @@ -34,7 +34,7 @@ TRT_ROOT="/usr/local/tensorrt" MODEL_NAME="gpt2_tensorrt_llm" NAME="tensorrt_llm_benchmarking_test" MODEL_REPOSITORY="$(pwd)/triton_model_repo" -TENSORRTLLM_BACKEND_DIR="/opt/tritonserver/tensorrtllm_backend" +TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend" GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt" TOKENIZER_DIR="$GPT_DIR/gpt2" ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu" @@ -47,40 +47,27 @@ SERVER_TIMEOUT=${SERVER_TIMEOUT:=120} function clone_tensorrt_llm_backend_repo { rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR apt-get update && apt-get install git-lfs -y --no-install-recommends - git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} https://github.com/triton-inference-server/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR + git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive } # Update Open MPI to a version compatible with SLURM. function upgrade_openmpi { - cd /tmp/ local CURRENT_VERSION=$(mpirun --version 2>&1 | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}') if [ -n "$CURRENT_VERSION" ] && dpkg --compare-versions "$CURRENT_VERSION" lt "5.0.1"; then # Uninstall the current version of Open MPI - wget "https://download.open-mpi.org/release/open-mpi/v$(echo "${CURRENT_VERSION}" | awk -F. '{print $1"."$2}')/openmpi-${CURRENT_VERSION}.tar.gz" || { - echo "Failed to download Open MPI ${CURRENT_VERSION}" - exit 1 - } - rm -rf "openmpi-${CURRENT_VERSION}" && tar -xzf "openmpi-${CURRENT_VERSION}.tar.gz" && cd "openmpi-${CURRENT_VERSION}" || { - echo "Failed to extract Open MPI ${CURRENT_VERSION}" - exit 1 - } - unset PMIX_VERSION && ./configure --prefix=/opt/hpcx/ompi/ && make uninstall || { - echo "Failed to uninstall Open MPI ${CURRENT_VERSION}" - exit 1 - } - rm -rf /opt/hpcx/ompi/ /usr/local/mpi/ || { - echo "Failed to remove Open MPI ${CURRENT_VERSION} installation directories" + rm -r /opt/hpcx/ompi/ /usr/local/mpi && rm -rf /usr/lib/$(gcc -print-multiarch)/openmpi || { + echo "Failed to uninstall the existing Open MPI version $CURRENT_VERSION." exit 1 } - cd ../ && rm -r openmpi-${CURRENT_VERSION} else - echo "Installed Open MPI version is not less than 5.0.1. Skipping the upgrade." + echo "The installed Open MPI version ($CURRENT_VERSION) is 5.0.1 or higher. Skipping the upgrade." 
return fi # Install SLURM supported Open MPI version + cd /tmp/ wget "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.1.tar.gz" || { echo "Failed to download Open MPI 5.0.1" exit 1 @@ -108,18 +95,6 @@ function upgrade_openmpi { mpirun --version } -function install_tensorrt_llm { - # Install CMake - bash ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/docker/common/install_cmake.sh - export PATH="/usr/local/cmake/bin:${PATH}" - - TORCH_INSTALL_TYPE="pypi" && - (cd ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm && - bash docker/common/install_pytorch.sh $TORCH_INSTALL_TYPE && - python3 ./scripts/build_wheel.py --trt_root=/usr/local/tensorrt && - pip3 install ./build/tensorrt_llm*.whl) -} - function build_gpt2_base_model { # Download weights from HuggingFace Transformers cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2 @@ -131,24 +106,21 @@ function build_gpt2_base_model { cd ${GPT_DIR} # Convert weights from HF Tranformers to FT format - python3 hf_gpt_convert.py -p 1 -i gpt2 -o ./c-model/gpt2 --tensor-parallelism ${NUM_GPUS} --storage-type float16 + python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" cd ${BASE_DIR} } function build_gpt2_tensorrt_engine { # Build TensorRT engines cd ${GPT_DIR} - python3 build.py --model_dir="./c-model/gpt2/${NUM_GPUS}-gpu/" \ - --world_size="${NUM_GPUS}" \ - --dtype float16 \ - --use_inflight_batching \ - --use_gpt_attention_plugin float16 \ - --paged_kv_cache \ - --use_gemm_plugin float16 \ - --remove_input_padding \ - --hidden_act gelu \ - --parallel_build \ - --output_dir="${ENGINES_DIR}" + trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \ + --gpt_attention_plugin float16 \ + --remove_input_padding enable \ + --paged_kv_cache enable \ + --gemm_plugin float16 \ + --workers "${NUM_GPUS}" \ + --output_dir "${ENGINES_DIR}" + cd ${BASE_DIR} } @@ -172,18 +144,18 @@ function prepare_model_repository { replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" - replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${max_queue_size}' "0" 
"${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" } # Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on @@ -244,13 +216,12 @@ function kill_server { upgrade_openmpi clone_tensorrt_llm_backend_repo -install_tensorrt_llm build_gpt2_base_model build_gpt2_tensorrt_engine prepare_model_repository # Install perf_analyzer -pip3 install tritonclient nvidia-ml-py3 +pip3 install tritonclient ARCH="amd64" STATIC_BATCH=1 diff --git a/qa/L0_perf_vllm/test.sh b/qa/L0_perf_vllm/test.sh index 498f6f8e14..e1ce8cf2ed 100755 --- a/qa/L0_perf_vllm/test.sh +++ b/qa/L0_perf_vllm/test.sh @@ -41,7 +41,7 @@ SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR} export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:=0} EXPORT_FILE=profile-export-vllm-model.json -pip3 install tritonclient nvidia-ml-py3 +pip3 install tritonclient rm -rf $MODEL_REPO $EXPORT_FILE *.tjson *.json *.csv mkdir -p $MODEL_REPO/$MODEL_NAME/1 From e7c8e7b2e52cf927da29b9d857746592222be0ff Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Thu, 15 Aug 2024 20:35:51 -0700 Subject: [PATCH 09/44] feat: Add vLLM counter metrics access through Triton (#7493) Report vLLM counter metrics through Triton server --- build.py | 4 ++++ docs/user_guide/metrics.md | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/build.py b/build.py index 2c95cbded3..0487636b09 100755 --- a/build.py +++ b/build.py @@ -1806,6 +1806,10 @@ def backend_clone( os.path.join(build_dir, be, "src", "model.py"), backend_dir, ) + clone_script.cpdir( + os.path.join(build_dir, be, "src", "utils"), + backend_dir, + ) clone_script.comment() clone_script.comment(f"end '{be}' backend") diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md index 0a7f3cf1a3..b8fc0d8ee0 100644 --- a/docs/user_guide/metrics.md +++ b/docs/user_guide/metrics.md @@ -378,3 +378,9 @@ Further documentation can be found in the `TRITONSERVER_MetricFamily*` and The TRT-LLM backend uses the custom metrics API to track and expose specific metrics about LLMs, KV Cache, and Inflight Batching to Triton: https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#triton-metrics + +### vLLM Backend Metrics + +The vLLM backend uses the custom metrics API to track and expose specific metrics about +LLMs to Triton: +https://github.com/triton-inference-server/vllm_backend?tab=readme-ov-file#triton-metrics From 7a158a4075bbd3d757ce0d2a2afc419d509ed599 Mon Sep 17 00:00:00 2001 From: Kyle McGill <101670481+nv-kmcgill53@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:41:19 -0700 Subject: [PATCH 10/44] build: RHEL 8 Compatibility (#7519) Co-authored-by: Francesco Petrini --- CMakeLists.txt | 16 +-- build.py | 216 +++++++++++++++++++++++++++++--- qa/L0_infer/install_and_test.sh | 22 +++- src/CMakeLists.txt | 11 +- 4 files changed, 228 insertions(+), 37 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff578c9724..56cb346dc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,17 +125,13 @@ FetchContent_Declare( # Some libs are installed to ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib64 instead # of ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib on Centos -set (LIB_DIR "lib") -# /etc/os-release does not exist on Windows -if(EXISTS "/etc/os-release") - file(STRINGS /etc/os-release DISTRO REGEX "^NAME=") - string(REGEX REPLACE "NAME=\"(.*)\"" "\\1" DISTRO "${DISTRO}") - message(STATUS "Distro Name: ${DISTRO}") - if(DISTRO MATCHES "CentOS.*") +set(LIB_DIR "lib") +if(LINUX) + file(STRINGS 
"/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") set (LIB_DIR "lib64") - endif() -endif() - + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) set(TRITON_CORE_HEADERS_ONLY OFF) FetchContent_MakeAvailable(repo-third-party repo-core) diff --git a/build.py b/build.py index 0487636b09..2a9b2469fc 100755 --- a/build.py +++ b/build.py @@ -37,6 +37,7 @@ import sys from inspect import getsourcefile +import distro import requests # @@ -117,7 +118,17 @@ def fail_if(p, msg): def target_platform(): if FLAGS.target_platform is not None: return FLAGS.target_platform - return platform.system().lower() + platform_string = platform.system().lower() + if platform_string == "linux": + # Need to inspect the /etc/os-release file to get + # the distribution of linux + id_like_list = distro.like().split() + if "debian" in id_like_list: + return "linux" + else: + return "rhel" + else: + return platform_string def target_machine(): @@ -649,7 +660,8 @@ def onnxruntime_cmake_args(images, library_paths): ] # TRITON_ENABLE_GPU is already set for all backends in backend_cmake_args() - if FLAGS.enable_gpu: + # TODO: TPRD-334 TensorRT extension is not currently supported by our manylinux build + if FLAGS.enable_gpu and target_platform() != "rhel": cargs.append( cmake_backend_enable( "onnxruntime", "TRITON_ENABLE_ONNXRUNTIME_TENSORRT", True @@ -680,8 +692,11 @@ def onnxruntime_cmake_args(images, library_paths): ) ) - if (target_machine() != "aarch64") and ( - TRITON_VERSION_MAP[FLAGS.version][3] is not None + # TODO: TPRD-333 OpenVino extension is not currently supported by our manylinux build + if ( + (target_machine() != "aarch64") + and (target_platform() != "rhel") + and (TRITON_VERSION_MAP[FLAGS.version][3] is not None) ): cargs.append( cmake_backend_enable( @@ -697,7 +712,7 @@ def onnxruntime_cmake_args(images, library_paths): ) ) - if target_platform() == "igpu": + if (target_platform() == "igpu") or (target_platform() == "rhel"): cargs.append( cmake_backend_arg( "onnxruntime", @@ -833,8 +848,31 @@ def install_dcgm_libraries(dcgm_version, target_machine): ) return "" else: - if target_machine == "aarch64": - return """ + # RHEL has the same install instructions for both aarch64 and x86 + if target_platform() == "rhel": + if target_machine == "aarch64": + return """ +ENV DCGM_VERSION {} +# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\ + && dnf clean expire-cache \\ + && dnf install -y datacenter-gpu-manager-{} +""".format( + dcgm_version, dcgm_version + ) + else: + return """ +ENV DCGM_VERSION {} +# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\ + && dnf clean expire-cache \\ + && dnf install -y datacenter-gpu-manager-{} +""".format( + dcgm_version, dcgm_version + ) + else: + if target_machine == "aarch64": + return """ ENV DCGM_VERSION {} # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads RUN curl -o /tmp/cuda-keyring.deb \\ @@ -844,10 +882,10 @@ def install_dcgm_libraries(dcgm_version, target_machine): && apt-get update \\ && apt-get install -y datacenter-gpu-manager=1:{} """.format( - dcgm_version, dcgm_version - ) - else: - return """ + dcgm_version, dcgm_version + ) + else: + return """ ENV DCGM_VERSION {} # Install DCGM. 
Steps from https://developer.nvidia.com/dcgm#Downloads RUN curl -o /tmp/cuda-keyring.deb \\ @@ -857,8 +895,106 @@ def install_dcgm_libraries(dcgm_version, target_machine): && apt-get update \\ && apt-get install -y datacenter-gpu-manager=1:{} """.format( - dcgm_version, dcgm_version - ) + dcgm_version, dcgm_version + ) + + +def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap): + df = """ +ARG TRITON_VERSION={} +ARG TRITON_CONTAINER_VERSION={} +ARG BASE_IMAGE={} +""".format( + argmap["TRITON_VERSION"], + argmap["TRITON_CONTAINER_VERSION"], + argmap["BASE_IMAGE"], + ) + + df += """ +FROM ${BASE_IMAGE} + +ARG TRITON_VERSION +ARG TRITON_CONTAINER_VERSION +""" + df += """ +# Install docker docker buildx +RUN yum install -y ca-certificates curl gnupg yum-utils \\ + && yum-config-manager --add-repo https://download.docker.com/linux/rhel/docker-ce.repo \\ + && yum install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +# && yum install -y docker.io docker-buildx-plugin + +# libcurl4-openSSL-dev is needed for GCS +# python3-dev is needed by Torchvision +# python3-pip and libarchive-dev is needed by python backend +# libxml2-dev is needed for Azure Storage +# scons is needed for armnn_tflite backend build dep +RUN yum install -y \\ + ca-certificates \\ + autoconf \\ + automake \\ + git \\ + gperf \\ + re2-devel \\ + openssl-devel \\ + libtool \\ + libcurl-devel \\ + libb64-devel \\ + gperftools-devel \\ + patchelf \\ + python3.11-devel \\ + python3-pip \\ + python3-setuptools \\ + rapidjson-devel \\ + python3-scons \\ + pkg-config \\ + unzip \\ + wget \\ + zlib-devel \\ + libarchive-devel \\ + libxml2-devel \\ + numactl-devel \\ + wget + +RUN pip3 install --upgrade pip \\ + && pip3 install --upgrade \\ + wheel \\ + setuptools \\ + docker \\ + virtualenv + +# Install boost version >= 1.78 for boost::span +# Current libboost-dev apt packages are < 1.78, so install from tar.gz +RUN wget -O /tmp/boost.tar.gz \\ + https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz \\ + && (cd /tmp && tar xzf boost.tar.gz) \\ + && mv /tmp/boost_1_80_0/boost /usr/include/boost + +# Server build requires recent version of CMake (FetchContent required) +# Might not need this if the installed version of cmake is high enough for our build. +# RUN apt update -q=2 \\ +# && apt install -y gpg wget \\ +# && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \\ +# && . /etc/os-release \\ +# && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \\ +# && apt-get update -q=2 \\ +# && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* +""" + if FLAGS.enable_gpu: + df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine()) + df += """ +ENV TRITON_SERVER_VERSION ${TRITON_VERSION} +ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION} +""" + + df += """ +WORKDIR /workspace +RUN rm -fr * +COPY . . 
+ENTRYPOINT [] +""" + + with open(os.path.join(ddir, dockerfile_name), "w") as dfile: + dfile.write(df) def create_dockerfile_buildbase(ddir, dockerfile_name, argmap): @@ -1161,7 +1297,29 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach fi \\ && [ `id -u $TRITON_SERVER_USER` -eq 1000 ] \\ && [ `id -g $TRITON_SERVER_USER` -eq 1000 ] +""".format( + gpu_enabled=gpu_enabled + ) + # This + if target_platform() == "rhel": + df += """ +# Common dpeendencies. +RUN yum install -y \\ + git \\ + gperf \\ + re2-devel \\ + openssl-devel \\ + libtool \\ + libcurl-devel \\ + libb64-devel \\ + gperftools-devel \\ + patchelf \\ + wget \\ + numactl-devel +""" + else: + df += """ # Ensure apt-get won't prompt for selecting options ENV DEBIAN_FRONTEND=noninteractive @@ -1184,12 +1342,14 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach wget \\ {backend_dependencies} \\ && rm -rf /var/lib/apt/lists/* +""".format( + backend_dependencies=backend_dependencies + ) + df += """ # Set TCMALLOC_RELEASE_RATE for users setting LD_PRELOAD with tcmalloc ENV TCMALLOC_RELEASE_RATE 200 -""".format( - gpu_enabled=gpu_enabled, backend_dependencies=backend_dependencies - ) +""" if "fastertransformer" in backends: be = "fastertransformer" @@ -1433,9 +1593,14 @@ def create_build_dockerfiles( ) dockerfileargmap["GPU_BASE_IMAGE"] = gpu_base_image - create_dockerfile_buildbase( - FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap - ) + if target_platform() == "rhel": + create_dockerfile_buildbase_rhel( + FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap + ) + else: + create_dockerfile_buildbase( + FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap + ) if target_platform() == "windows": create_dockerfile_windows( @@ -1651,6 +1816,17 @@ def core_build( os.path.join(repo_install_dir, "lib", "tritonserver.lib"), os.path.join(install_dir, "bin"), ) + elif target_platform() == "rhel": + cmake_script.mkdir(os.path.join(install_dir, "bin")) + cmake_script.cp( + os.path.join(repo_install_dir, "bin", "tritonserver"), + os.path.join(install_dir, "bin"), + ) + cmake_script.mkdir(os.path.join(install_dir, "lib64")) + cmake_script.cp( + os.path.join(repo_install_dir, "lib64", "libtritonserver.so"), + os.path.join(install_dir, "lib64"), + ) else: cmake_script.mkdir(os.path.join(install_dir, "bin")) cmake_script.cp( @@ -2128,7 +2304,7 @@ def enable_all(): "--target-platform", required=False, default=None, - help='Target platform for build, can be "linux", "windows" or "igpu". If not specified, build targets the current platform.', + help='Target platform for build, can be "linux", "rhel", "windows" or "igpu". If not specified, build targets the current platform.', ) parser.add_argument( "--target-machine", diff --git a/qa/L0_infer/install_and_test.sh b/qa/L0_infer/install_and_test.sh index 28e5dad52e..4c136cf1dd 100755 --- a/qa/L0_infer/install_and_test.sh +++ b/qa/L0_infer/install_and_test.sh @@ -25,14 +25,24 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Determine the operating system to call the correct package manager. 
+ID_LIKE=$(grep -Po '(?<=ID_LIKE=).*' /etc/os-release | awk -F= '{print $1}' | tr -d '"' | awk '{print $1}') + # Note: This script is to be used with customized triton containers that need # dependencies to run L0_infer tests -apt-get update && \ - apt-get install -y --no-install-recommends \ - curl \ - jq \ - python3 \ - python3-pip +if [[ "$ID_LIKE" =~ "debian" ]]; then + apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + jq \ + python3 \ + python3-pip +else + yum install -y \ + jq \ + curl +fi + pip3 install --upgrade pip # install client libraries pip3 install tritonclient[all] diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 783275d8d7..cf43765dba 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -138,6 +138,15 @@ else() ) endif() +set(LIB_DIR "lib") +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set (LIB_DIR "lib64") + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) +set(TRITON_CORE_HEADERS_ONLY OFF) + set_target_properties( main PROPERTIES @@ -145,7 +154,7 @@ set_target_properties( SKIP_BUILD_RPATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE - INSTALL_RPATH "$\{ORIGIN\}/../lib" + INSTALL_RPATH "$\{ORIGIN\}/../${LIB_DIR}" ) target_link_libraries( From 66ccb9d8baf5d6d0d6a3fa901c98d781255e28eb Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 16:15:34 -0700 Subject: [PATCH 11/44] feat: Add GRPC error codes to GRPC streaming if enabled by user. (#7499) --- Dockerfile.QA | 6 + .../inference_protocols.md | 10 ++ .../lifecycle/lifecycle_test.py | 130 ++++++++++++++++++ qa/L0_backend_python/lifecycle/test.sh | 8 ++ qa/L0_decoupled/decoupled_test.py | 16 ++- qa/L0_decoupled/test.sh | 2 +- qa/L0_grpc_state_cleanup/cleanup_test.py | 42 ++++-- .../execute_grpc_error/config.pbtxt | 51 +++++++ qa/python_models/execute_grpc_error/model.py | 52 +++++++ src/grpc/grpc_utils.h | 41 +++++- src/grpc/infer_handler.h | 43 +++++- src/grpc/stream_infer_handler.cc | 64 ++++++++- 12 files changed, 444 insertions(+), 21 deletions(-) create mode 100644 qa/python_models/execute_grpc_error/config.pbtxt create mode 100644 qa/python_models/execute_grpc_error/model.py diff --git a/Dockerfile.QA b/Dockerfile.QA index 2c43f735a5..b381abfaaf 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -267,6 +267,12 @@ RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \ cp /workspace/tritonbuild/python/examples/decoupled/square_config.pbtxt \ qa/L0_decoupled/python_models/square_int32/. +RUN mkdir -p qa/L0_decoupled_grpc_error && \ + cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error + +RUN mkdir -p qa/L0_grpc_error_state_cleanup && \ + cp -r qa/L0_grpc_state_cleanup/. qa/L0_grpc_error_state_cleanup + RUN mkdir -p qa/L0_repoagent_checksum/models/identity_int32/1 && \ cp tritonbuild/identity/install/backends/identity/libtriton_identity.so \ qa/L0_repoagent_checksum/models/identity_int32/1/. diff --git a/docs/customization_guide/inference_protocols.md b/docs/customization_guide/inference_protocols.md index 592f26e7d1..a241f097da 100644 --- a/docs/customization_guide/inference_protocols.md +++ b/docs/customization_guide/inference_protocols.md @@ -115,6 +115,16 @@ These options can be used to configure the KeepAlive settings: For client-side documentation, see [Client-Side GRPC KeepAlive](https://github.com/triton-inference-server/client/blob/main/README.md#grpc-keepalive). 
+#### GRPC Status Codes + +Triton implements GRPC error handling for streaming requests when a specific flag is enabled through headers. Upon encountering an error, Triton returns the appropriate GRPC error code and subsequently closes the stream. + +* `triton_grpc_error` : The header value needs to be set to true while starting the stream. + +GRPC status codes can be used for better visibility and monitoring. For more details, see [gRPC Status Codes](https://grpc.io/docs/guides/status-codes/) + +For client-side documentation, see [Client-Side GRPC Status Codes](https://github.com/triton-inference-server/client/tree/main#GRPC-Status-Codes) + ### Limit Endpoint Access (BETA) Triton users may want to restrict access to protocols or APIs that are diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 883f6d20b6..d6eb2a8f53 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -35,6 +35,7 @@ sys.path.append("../../common") import queue +import threading import time import unittest from functools import partial @@ -241,6 +242,135 @@ def test_infer_pymodel_error(self): initial_metrics_value, ) + # Test grpc stream behavior when triton_grpc_error is set to true. + # Expected to close stream and return GRPC error when model returns error. + def test_triton_grpc_error_error_on(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 2 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + stream_end = False + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + stream_end = True + else: + # Stream is not killed + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + + # Test grpc stream behavior when triton_grpc_error is set to true in multiple open streams. + # Expected to close stream and return GRPC error when model returns error. + def test_triton_grpc_error_multithreaded(self): + thread1 = threading.Thread(target=self.test_triton_grpc_error_error_on) + thread2 = threading.Thread(target=self.test_triton_grpc_error_error_on) + # Start the threads + thread1.start() + thread2.start() + # Wait for both threads to finish + thread1.join() + thread2.join() + + # Test grpc stream behavior when triton_grpc_error is set to true and subsequent stream is cancelled. + # Expected cancellation is successful. 
+ def test_triton_grpc_error_cancel(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 1 + user_data = UserData() + triton_server_url = "localhost:8001" # Replace with your Triton server address + stream_end = False + triton_client = grpcclient.InferenceServerClient(triton_server_url) + + metadata = {"triton_grpc_error": "true"} + + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + stream_end = True + if i == 0: + triton_client.stop_stream(cancel_requests=True) + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + self.assertTrue( + True, + "This should always pass as cancellation should succeed without any exception", + ) + + # Test grpc stream behavior when triton_grpc_error is set to false + # and subsequent stream is NOT closed when error is reported from CORE + def test_triton_grpc_error_error_off(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 4 + response_counter = 0 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + triton_client.start_stream(callback=partial(callback, user_data)) + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + _ = user_data._completed_requests.get() + response_counter += 1 + # we expect response_counter == number_of_requests, + # which indicates that after the first reported grpc error stream did NOT close and mode != triton_grpc_error + self.assertEqual(response_counter, number_of_requests) + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/lifecycle/test.sh b/qa/L0_backend_python/lifecycle/test.sh index dba4581ddd..59b846f56b 100755 --- a/qa/L0_backend_python/lifecycle/test.sh +++ b/qa/L0_backend_python/lifecycle/test.sh @@ -52,6 +52,14 @@ cp ../../python_models/execute_error/config.pbtxt ./models/execute_error/ sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 12000000 }" >> config.pbtxt) +mkdir -p models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/model.py ./models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/config.pbtxt ./models/execute_grpc_error/ +(cd models/execute_grpc_error && \ + sed -i "s/^name:.*/name: \"execute_grpc_error\"/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 1200000 }" >> config.pbtxt) + mkdir -p models/execute_return_error/1/ cp ../../python_models/execute_return_error/model.py 
./models/execute_return_error/1/ cp ../../python_models/execute_return_error/config.pbtxt ./models/execute_return_error/ diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index 1f76f4845b..d7bc59f5c7 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -116,7 +116,13 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -175,7 +181,13 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh index 98ad134d8b..22c37dff49 100755 --- a/qa/L0_decoupled/test.sh +++ b/qa/L0_decoupled/test.sh @@ -176,4 +176,4 @@ else echo -e "\n***\n*** Test Failed\n***" fi -exit $RET +exit $RET \ No newline at end of file diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py index 431eeb1720..f7507747e9 100755 --- a/qa/L0_grpc_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_state_cleanup/cleanup_test.py @@ -161,9 +161,17 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -229,9 +237,17 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -608,9 +624,17 @@ def test_non_decoupled_streaming_multi_response(self): url="localhost:8001", verbose=True ) as client: # Establish stream - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16 - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in 
os.environ: + metadata = {"triton_grpc_error": "true"} + client.start_stream( + callback=partial(callback, user_data), + stream_timeout=16, + headers=metadata, + ) + else: + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16 + ) # Send a request client.async_stream_infer( model_name=self.repeat_non_decoupled_model_name, diff --git a/qa/python_models/execute_grpc_error/config.pbtxt b/qa/python_models/execute_grpc_error/config.pbtxt new file mode 100644 index 0000000000..70e247148a --- /dev/null +++ b/qa/python_models/execute_grpc_error/config.pbtxt @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/execute_grpc_error/model.py b/qa/python_models/execute_grpc_error/model.py new file mode 100644 index 0000000000..d5087a49ec --- /dev/null +++ b/qa/python_models/execute_grpc_error/model.py @@ -0,0 +1,52 @@ +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def __init__(self): + # Maintain total inference count, so as to return error on 2nd request, all of this to simulate model failure + self.inf_count = 1 + + def execute(self, requests): + """This function is called on inference request.""" + responses = [] + + # Generate the error for the second request + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") + out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy()) + if self.inf_count % 2: + # Every odd request is success + responses.append(pb_utils.InferenceResponse([out_tensor])) + else: + # Every even request is failure + error = pb_utils.TritonError("An error occurred during execution") + responses.append(pb_utils.InferenceResponse([out_tensor], error)) + self.inf_count += 1 + + return responses diff --git a/src/grpc/grpc_utils.h b/src/grpc/grpc_utils.h index 898e4acb4f..032dec3ad9 100644 --- a/src/grpc/grpc_utils.h +++ b/src/grpc/grpc_utils.h @@ -76,6 +76,46 @@ typedef enum { PARTIAL_COMPLETION } Steps; +typedef enum { + // No error from CORE seen yet + NONE, + // Error from CORE encountered, waiting to be picked up by completion queue to + // initiate cancellation + ERROR_ENCOUNTERED, + // Error from CORE encountered, stream closed + // This state is added to avoid double cancellation + ERROR_HANDLING_COMPLETE +} TritonGRPCErrorSteps; + +class gRPCErrorTracker { + public: + // True if set by user via header + // Can be accessed without a lock, as set only once in startstream + std::atomic triton_grpc_error_; + + // Indicates the state of triton_grpc_error, only relevant if special + // triton_grpc_error feature set to true by client + TritonGRPCErrorSteps grpc_stream_error_state_; + + // Constructor + gRPCErrorTracker() + : triton_grpc_error_(false), + grpc_stream_error_state_(TritonGRPCErrorSteps::NONE) + { + } + // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, + // indicating we have closed the stream and initiated the cancel flow + void MarkGRPCErrorHandlingComplete(); + + // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. 
+ bool CheckAndUpdateGRPCError(); + + // Marks error after it has been responded to + void MarkGRPCErrorEncountered(); + + // Checks if error already responded to in triton_grpc_error mode + bool GRPCErrorEncountered(); +}; // Debugging helper std::ostream& operator<<(std::ostream& out, const Steps& step); @@ -183,5 +223,4 @@ TRITONSERVER_Error* ParseClassificationParams( void ReadFile(const std::string& filename, std::string& data); - }}} // namespace triton::server::grpc diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 0e1091feb8..6382c96c3c 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -646,6 +646,7 @@ class InferHandlerState { { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); + gRPCErrorTracker_ = std::make_unique(); } void SetCompressionLevel(grpc_compression_level compression_level) @@ -666,9 +667,11 @@ class InferHandlerState { bool IsCancelled() { - return received_notification_ ? ctx_->IsCancelled() : false; + return received_notification_ + ? (ctx_->IsCancelled() || + gRPCErrorTracker_->CheckAndUpdateGRPCError()) + : false; } - // Increments the ongoing request counter void IncrementRequestCounter() { ongoing_requests_++; } @@ -710,6 +713,37 @@ class InferHandlerState { return false; } + // Extracts headers from GRPC request and updates state + void ExtractStateFromHeaders(InferHandlerStateType* state) + { + const auto& metadata = state->context_->ctx_->client_metadata(); + std::string triton_grpc_error_key = "triton_grpc_error"; + + auto it = metadata.find( + {triton_grpc_error_key.data(), triton_grpc_error_key.size()}); + + if (it != metadata.end()) { + if (it->second == "true") { + LOG_VERBOSE(2) + << "GRPC: triton_grpc_error mode detected in new grpc stream"; + state->context_->gRPCErrorTracker_->triton_grpc_error_ = true; + } + } + } + + void WriteGRPCErrorResponse(InferHandlerStateType* state) + { + std::lock_guard lock(state->context_->mu_); + // Check if Error not responded previously + // Avoid closing connection twice on multiple errors from core + if (!state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { + state->step_ = Steps::COMPLETE; + state->context_->responder_->Finish(state->status_, state); + // Mark error for this stream + state->context_->gRPCErrorTracker_->MarkGRPCErrorEncountered(); + } + } + const std::string DebugString(InferHandlerStateType* state) { std::string debug_string(""); @@ -793,6 +827,7 @@ class InferHandlerState { bool HandleCancellation( InferHandlerStateType* state, bool rpc_ok, const std::string& name) { + // Check to avoid early exit in case of triton_grpc_error if (!IsCancelled()) { LOG_ERROR << "[INTERNAL] HandleCancellation called even when the context was " @@ -816,7 +851,6 @@ class InferHandlerState { IssueRequestCancellation(); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; - // The state returns true because the CancelExecution // call above would have raised alarm objects on all // pending inflight states objects. This state will @@ -999,6 +1033,8 @@ class InferHandlerState { // Tracks whether the async notification has been delivered by // completion queue. 
bool received_notification_; + + std::unique_ptr gRPCErrorTracker_; }; // This constructor is used to build a wrapper state object @@ -1090,7 +1126,6 @@ class InferHandlerState { void MarkAsAsyncNotifyState() { async_notify_state_ = true; } bool IsAsyncNotifyState() { return async_notify_state_; } - // Needed in the response handle for classification outputs. TRITONSERVER_Server* tritonserver_; diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 585f88d536..6651eca813 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -189,7 +189,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) state->context_->responder_->Finish(status, state); return !finished; } - + state->context_->ExtractStateFromHeaders(state); } else if (state->step_ == Steps::READ) { TRITONSERVER_Error* err = nullptr; const inference::ModelInferRequest& request = state->request_; @@ -355,7 +355,6 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) GrpcStatusUtil::Create(&status, err); TRITONSERVER_ErrorDelete(err); response->set_error_message(status.error_message()); - response->mutable_infer_response()->Clear(); // repopulate the id so that client knows which request failed. response->mutable_infer_response()->set_id(request.id()); @@ -596,7 +595,13 @@ ModelStreamInferHandler::StreamInferResponseComplete( void* userp) { State* state = reinterpret_cast(userp); - + // Ignore Response from CORE in case GRPC Strict as we dont care about + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { + std::lock_guard lock(state->context_->mu_); + if (state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { + return; + } + } // Increment the callback index uint32_t response_index = state->cb_count_++; @@ -671,14 +676,27 @@ ModelStreamInferHandler::StreamInferResponseComplete( } else { LOG_ERROR << "expected the response allocator to have added the response"; } - if (err != nullptr) { failed = true; ::grpc::Status status; + // Converts CORE errors to GRPC error codes GrpcStatusUtil::Create(&status, err); response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { + state->status_ = status; + // Finish only once, if backend ignores cancellation + LOG_VERBOSE(1) << "GRPC streaming error detected with status: " + << status.error_code() << "Closing stream connection." + << std::endl; + state->context_->WriteGRPCErrorResponse(state); + TRITONSERVER_ErrorDelete(err); + LOG_TRITONSERVER_ERROR( + TRITONSERVER_InferenceResponseDelete(iresponse), + "deleting GRPC inference response"); + return; + } } TRITONSERVER_ErrorDelete(err); @@ -802,4 +820,42 @@ ModelStreamInferHandler::StreamInferResponseComplete( } } +// Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, +// indicating we have closed the stream and initiated the cancel flow +void +gRPCErrorTracker::MarkGRPCErrorHandlingComplete() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_HANDLING_COMPLETE; +} + +// Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. 
+bool +gRPCErrorTracker::CheckAndUpdateGRPCError() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::ERROR_ENCOUNTERED) { + // Change the state to ERROR_HANDLING_COMPLETE as we have called + // HandleCancellation + MarkGRPCErrorHandlingComplete(); + return true; + } + return false; +} + +// Marks error after it has been responded to +void +gRPCErrorTracker::MarkGRPCErrorEncountered() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_ENCOUNTERED; +} + +// Checks if error already responded to in triton_grpc_error mode +bool +gRPCErrorTracker::GRPCErrorEncountered() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::NONE) { + return false; + } + return true; +} + }}} // namespace triton::server::grpc From 62184db709e7bc6e43d3098c837a464b9f2249cb Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Fri, 16 Aug 2024 17:12:43 -0700 Subject: [PATCH 12/44] test: Add python backend tests for the new histogram metric (#7540) --- qa/python_models/custom_metrics/model.py | 144 ++++++++++++++++++++++- 1 file changed, 141 insertions(+), 3 deletions(-) diff --git a/qa/python_models/custom_metrics/model.py b/qa/python_models/custom_metrics/model.py index 31f105a1dd..7c78b46894 100644 --- a/qa/python_models/custom_metrics/model.py +++ b/qa/python_models/custom_metrics/model.py @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -74,6 +74,96 @@ def _metric_api_helper(self, metric, kind): self.assertEqual(metric.value(), value) logger.log_info("Set metric to : {}".format(metric.value())) + # Test observe value + observe = 0.05 + # Counter and gauge do not support observe + with self.assertRaises(pb_utils.TritonModelException): + metric.observe(observe) + + def _histogram_api_helper(self, metric, name, labels): + def histogram_str_builder(name, type, labels, value, le=None): + if type == "count" or type == "sum": + return f"{name}_{type}{{{labels}}} {value}" + elif type == "bucket": + return f'{name}_bucket{{{labels},le="{le}"}} {value}' + else: + raise + + # Adding logger to test if custom metrics and logging work together + # as they use the same message queue. 
+ logger = pb_utils.Logger + + # All values should be 0.0 before the test + metrics = self._get_metrics() + self.assertIn(histogram_str_builder(name, "count", labels, "0"), metrics) + self.assertIn(histogram_str_builder(name, "sum", labels, "0"), metrics) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="0.1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="2.5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="10"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="+Inf"), metrics + ) + + # Histogram does not support value + with self.assertRaises(pb_utils.TritonModelException): + metric.value() + + # Test increment value + increment = 2023.0 + # Histogram does not support increment + with self.assertRaises(pb_utils.TritonModelException): + metric.increment(increment) + + # Test set value + value = 999.9 + # Histogram does not support set + with self.assertRaises(pb_utils.TritonModelException): + metric.set(value) + + # Test observe value + data = [0.05, 1.5, 6.0] + for datum in data: + metric.observe(datum) + logger.log_info("Observe histogram metric with value : {}".format(datum)) + + metrics = self._get_metrics() + self.assertIn( + histogram_str_builder(name, "count", labels, str(len(data))), metrics + ) + self.assertIn( + histogram_str_builder(name, "sum", labels, str(sum(data))), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "1", le="0.1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "1", le="1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "2", le="2.5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "2", le="5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "3", le="10"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "3", le="+Inf"), metrics + ) + def _dup_metric_helper(self, labels={}): # Adding logger to test if custom metrics and logging work together # as they use the same message queue. 
@@ -128,14 +218,62 @@ def test_gauge_e2e(self): description="test metric gauge kind end to end", kind=pb_utils.MetricFamily.GAUGE, ) - labels = {"example1": "counter_label1", "example2": "counter_label2"} + labels = {"example1": "gauge_label1", "example2": "gauge_label2"} metric = metric_family.Metric(labels=labels) self._metric_api_helper(metric, "gauge") - pattern = 'test_gauge_e2e{example1="counter_label1",example2="counter_label2"}' + pattern = 'test_gauge_e2e{example1="gauge_label1",example2="gauge_label2"}' metrics = self._get_metrics() self.assertIn(pattern, metrics) + def test_histogram_e2e(self): + name = "test_histogram_e2e" + metric_family = pb_utils.MetricFamily( + name=name, + description="test metric histogram kind end to end", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + + labels = {"example1": "histogram_label1", "example2": "histogram_label2"} + buckets = [0.1, 1.0, 2.5, 5.0, 10.0] + metric = metric_family.Metric(labels=labels, buckets=buckets) + + labels_str = 'example1="histogram_label1",example2="histogram_label2"' + self._histogram_api_helper(metric, name, labels_str) + + metrics = self._get_metrics() + count_pattern = f"{name}_count{{{labels_str}}}" + sum_pattern = f"{name}_sum{{{labels_str}}}" + bucket_pattern = f"{name}_bucket{{{labels_str}" + self.assertEqual(metrics.count(count_pattern), 1) + self.assertEqual(metrics.count(sum_pattern), 1) + self.assertEqual(metrics.count(bucket_pattern), len(buckets) + 1) + + def test_histogram_args(self): + name = "test_histogram_args" + metric_family = pb_utils.MetricFamily( + name=name, + description="test metric histogram args", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + + # Test "None" value buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}) + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=None) + + # Test non-ascending order buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=[2.5, 0.1, 1.0, 10.0, 5.0]) + + # Test duplicate value buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=[1, 1, 2, 5, 5]) + + # Test empty list bucket + metric_family.Metric(labels={}, buckets=[]) + def test_dup_metric_family_diff_kind(self): # Test that a duplicate metric family can't be added with a conflicting type/kind metric_family1 = pb_utils.MetricFamily( From 5e397715635ca57f762e9df667e5192a0ed7d6f0 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:22:39 -0700 Subject: [PATCH 13/44] test: Load new model version should not reload loaded existing model version(s) (#7527) --- qa/L0_lifecycle/lifecycle_test.py | 127 ++++++++++++++++++++++++++++++ qa/L0_lifecycle/test.sh | 35 ++++++++ 2 files changed, 162 insertions(+) diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py index a2bfc067bc..49fe684ff1 100755 --- a/qa/L0_lifecycle/lifecycle_test.py +++ b/qa/L0_lifecycle/lifecycle_test.py @@ -3493,6 +3493,133 @@ def test_delete_custom_config(self): except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) + def test_load_new_model_version(self): + model_name = "identity_fp32" + client = self._get_client(use_grpc=True) + + # version 1 and 2 are already loaded + # version 3 is in the model directory but not loaded + # version 4 does not exist anywhere + self.assertTrue(client.is_model_ready(model_name, "1")) + 
self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertFalse(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 0) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 0) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 1) + + # update version 2 model file + Path(os.path.join("models", model_name, "2", "model.py")).touch() + # add version 4 model file + src_path = os.path.join("models", model_name, "3") + dst_path = os.path.join("models", model_name, "4") + shutil.copytree(src_path, dst_path) + # update model config to load version 1 to 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [1, 2] } }", + "version_policy: { specific: { versions: [1, 2, 3, 4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # version 1 is unmodified so it should not be reloaded + # version 2 is modified so it should be reloaded + # version 3 model file existed but not loaded so it should be loaded + # version 4 is a new version so it should be loaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertTrue(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 1) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 2) + + # simulate a dependency change to all versions + Path(os.path.join("models", model_name, "dummy_dependency.py")).touch() + # reload the model + client.load_model(model_name) + + # all 4 versions should be reloaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertTrue(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 3) + + # update model config to only load version 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [1, 2, 3, 4] } }", + "version_policy: { specific: { versions: [4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the 
model + client.load_model(model_name) + + # only version 4 should be available and no reloads should happen + self.assertFalse(client.is_model_ready(model_name, "1")) + self.assertFalse(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 4) + + # update model config to load version 1 and 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [4] } }", + "version_policy: { specific: { versions: [1, 4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # version 1 should be loaded and version 4 should not be reloaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertFalse(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 5) + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_lifecycle/test.sh b/qa/L0_lifecycle/test.sh index 9236fdabfb..4efd244c76 100755 --- a/qa/L0_lifecycle/test.sh +++ b/qa/L0_lifecycle/test.sh @@ -2196,6 +2196,41 @@ set -e kill $SERVER_PID wait $SERVER_PID +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_load_new_model_version +rm -rf models +mkdir models +cp -r ../python_models/identity_fp32 models/ && (cd models/identity_fp32 && \ + echo "version_policy: { specific: { versions: [1, 2] } }" >> config.pbtxt && \ + echo " def initialize(self, args):" >> model.py && \ + echo " pb_utils.Logger.log_info(f'[PB model] Loading version {args[\"model_version\"]}')" >> model.py && \ + mkdir 1 && cp model.py 1 && \ + mkdir 2 && cp model.py 2 && \ + mkdir 3 && mv model.py 3) + +export PYTHONDONTWRITEBYTECODE="True" +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --load-model=*" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python $LC_TEST LifeCycleTest.test_load_new_model_version >>$CLIENT_LOG 2>&1 +if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID +unset PYTHONDONTWRITEBYTECODE if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" From 44fa2078859ef2e0b116b26b7d27184b836c8c33 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Tue, 20 Aug 2024 12:49:24 -0700 Subject: [PATCH 14/44] Intermittent `L0_decoupled_grpc_error` crash fixed. (#7552) --- src/grpc/infer_handler.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 6382c96c3c..51307d4ae0 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -667,6 +667,7 @@ class InferHandlerState { bool IsCancelled() { + std::lock_guard lock(mu_); return received_notification_ ? (ctx_->IsCancelled() || gRPCErrorTracker_->CheckAndUpdateGRPCError()) From fb60c0ea1fcb81ae4906531aa1b722111260f4d4 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Thu, 22 Aug 2024 10:13:08 -0700 Subject: [PATCH 15/44] ci: Raise Documentation Generation Errors (#7559) * ci: Raise Documentation Generation Errors --- docs/generate_docs.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/docs/generate_docs.py b/docs/generate_docs.py index 6982294d21..0e4079e40b 100755 --- a/docs/generate_docs.py +++ b/docs/generate_docs.py @@ -123,9 +123,7 @@ def run_command(command): stderr=subprocess.PIPE, ) except subprocess.CalledProcessError as e: - log_message(f"Error executing command: {e.cmd}") - log_message(e.output) - log_message(e.stderr) + raise (e) def clone_from_github(repo, tag, org): @@ -137,7 +135,6 @@ def clone_from_github(repo, tag, org): """ # Construct the full GitHub repository URL repo_url = f"https://github.com/{org}/{repo}.git" - print(repo_url) # Construct the git clone command if tag: clone_command = [ @@ -155,7 +152,7 @@ def clone_from_github(repo, tag, org): subprocess.run(clone_command, check=True) log_message(f"Successfully cloned {repo}") except subprocess.CalledProcessError as e: - log_message(f"Failed to clone {repo}. Error: {e}") + raise (e) def parse_repo_tag(repo_tags): @@ -189,8 +186,8 @@ def get_git_repo_name(file_path): .decode() .strip() ) - except subprocess.CalledProcessError: - return None + except subprocess.CalledProcessError as e: + raise (e) # Extract repository name from the remote URL. 
if remote_url.endswith(".git"): From 187a4a3f8fc8d6b577bebe716f7a8028ea3005ac Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Fri, 23 Aug 2024 10:51:26 -0700 Subject: [PATCH 16/44] docs: Add tensorrtllm_backend into doc generation (#7563) --- docs/generate_docs.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/generate_docs.py b/docs/generate_docs.py index 0e4079e40b..acf0afc981 100755 --- a/docs/generate_docs.py +++ b/docs/generate_docs.py @@ -393,6 +393,12 @@ def main(): if "python_backend" in repo_tags: clone_from_github("python_backend", repo_tags["python_backend"], github_org) + # Usage generate_docs.py --repo-tag=tensorrtllm_backend:main + if "tensorrtllm_backend" in repo_tags: + clone_from_github( + "tensorrtllm_backend", repo_tags["tensorrtllm_backend"], github_org + ) + # Usage generate_docs.py --backend-tag=custom_backend:main # Custom backend can be anything currently empty if "custom_backend" in backend_tags: @@ -409,6 +415,10 @@ def main(): run_command("rm -rf python_backend") if "custom_backend" in backend_tags: run_command("rm -rf custom_backend") + if "tensorrtllm_backend" in repo_tags: + run_command("rm -rf tensorrtllm_backend") + if "perf_analyzer" in repo_tags: + run_command("rm -rf perf_analyzer") # Return to previous working directory server/. os.chdir(server_abspath) From 6a697632aa5c40791ad2e040a9921ffc80c766bb Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Tue, 27 Aug 2024 09:34:47 -0700 Subject: [PATCH 17/44] build: RHEL8 EA2 Backends (#7568) * build: RHEL8 EA2 Backends --- build.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/build.py b/build.py index 2a9b2469fc..6901c2e201 100755 --- a/build.py +++ b/build.py @@ -116,7 +116,8 @@ def fail_if(p, msg): def target_platform(): - if FLAGS.target_platform is not None: + # When called by compose.py, FLAGS will be None + if FLAGS and FLAGS.target_platform is not None: return FLAGS.target_platform platform_string = platform.system().lower() if platform_string == "linux": @@ -132,7 +133,8 @@ def target_platform(): def target_machine(): - if FLAGS.target_machine is not None: + # When called by compose.py, FLAGS will be None + if FLAGS and FLAGS.target_machine is not None: return FLAGS.target_machine return platform.machine().lower() @@ -639,13 +641,16 @@ def pytorch_cmake_args(images): cmake_backend_arg("pytorch", "TRITON_PYTORCH_DOCKER_IMAGE", None, image), ] - if FLAGS.enable_gpu: + # TODO: TPRD-372 TorchTRT extension is not currently supported by our manylinux build + # TODO: TPRD-373 NVTX extension is not currently supported by our manylinux build + if target_platform() != "rhel": + if FLAGS.enable_gpu: + cargs.append( + cmake_backend_enable("pytorch", "TRITON_PYTORCH_ENABLE_TORCHTRT", True) + ) cargs.append( - cmake_backend_enable("pytorch", "TRITON_PYTORCH_ENABLE_TORCHTRT", True) + cmake_backend_enable("pytorch", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx) ) - cargs.append( - cmake_backend_enable("pytorch", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx) - ) return cargs @@ -1301,7 +1306,6 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach gpu_enabled=gpu_enabled ) - # This if target_platform() == "rhel": df += """ # Common dpeendencies. 
From 89641b693e2ea4762577209f468da38a0c88fa5e Mon Sep 17 00:00:00 2001 From: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com> Date: Tue, 27 Aug 2024 12:16:08 -0700 Subject: [PATCH 18/44] Release: Update NGC versions post-24.08 release (#7565) Co-authored-by: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Co-authored-by: Francesco Petrini --- Dockerfile.sdk | 2 +- Dockerfile.win10.min | 12 +++++------ README.md | 8 ++++---- TRITON_VERSION | 2 +- build.py | 12 +++++++---- deploy/aws/values.yaml | 2 +- deploy/fleetcommand/Chart.yaml | 2 +- deploy/fleetcommand/values.yaml | 6 +++--- deploy/gcp/values.yaml | 2 +- .../perf-analyzer-script/triton_client.yaml | 2 +- .../server-deployer/build_and_push.sh | 6 +++--- .../server-deployer/chart/triton/Chart.yaml | 4 ++-- .../server-deployer/chart/triton/values.yaml | 6 +++--- .../server-deployer/data-test/schema.yaml | 2 +- .../server-deployer/schema.yaml | 4 ++-- .../gke-marketplace-app/trt-engine/README.md | 6 +++--- deploy/k8s-onprem/values.yaml | 2 +- deploy/oci/values.yaml | 2 +- docs/customization_guide/build.md | 6 +++--- docs/customization_guide/compose.md | 18 ++++++++--------- docs/customization_guide/test.md | 2 +- docs/generate_docs.py | 4 ++-- docs/user_guide/custom_operations.md | 6 +++--- docs/user_guide/performance_tuning.md | 4 ++-- qa/L0_backend_python/test.sh | 6 +++--- qa/L0_batcher/test.sh | 4 ++-- qa/L0_grpc/test.sh | 2 +- qa/L0_http/test.sh | 2 +- qa/L0_infer/test.sh | 2 +- qa/L0_sequence_batcher/test.sh | 2 +- qa/L0_trt_plugin/test.sh | 4 ++-- qa/common/gen_jetson_trt_models | 2 +- qa/common/gen_qa_custom_ops | 2 +- qa/common/gen_qa_model_repository | 2 +- qa/common/util.sh | 20 +++++++++++++------ 35 files changed, 91 insertions(+), 79 deletions(-) diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 7897c2a215..c7a68fc6af 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -29,7 +29,7 @@ # # Base image on the minimum Triton container -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-py3-min +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.08-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min index 0a554fbcf4..29d2c2a43a 100644 --- a/Dockerfile.win10.min +++ b/Dockerfile.win10.min @@ -37,7 +37,7 @@ RUN choco install unzip -y # # Installing TensorRT # -ARG TENSORRT_VERSION=10.2.0.19 +ARG TENSORRT_VERSION=10.3.0.26 ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.5.zip" ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/zip/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5.zip # COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP} @@ -51,7 +51,7 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" # # Installing cuDNN # -ARG CUDNN_VERSION=9.2.1.18 +ARG CUDNN_VERSION=9.3.0.75 ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.2.1.18_cuda12-archive.zip ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} @@ -101,14 +101,14 @@ LABEL CMAKE_VERSION=${CMAKE_VERSION} # # Installing Visual Studio BuildTools: VS17 2022 # -ARG BUILDTOOLS_VERSION=17.9.34622.214 +ARG BUILDTOOLS_VERSION=17.10.35201.131 # Download collect.exe in case of an install failure. ADD https://aka.ms/vscollect.exe "C:\tmp\collect.exe" # Use the latest release channel. For more control, specify the location of an internal layout. # Download the Build Tools bootstrapper. 
# ARG BUILD_TOOLS_SOURCE=https://aka.ms/vs/17/release/vs_buildtools.exe -ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/5e7b923b-7d89-4e14-95b8-a84ab168e243/96b21d216c7954aaf606c6d7ba59a3de991884a8a86c578c767ba349c23188a9/vs_BuildTools.exe +ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/28626b4b-f88f-4b55-a0cf-f3eaa2c643fb/e6c43d4dfb36338d954cdb3ad9010ab2a479e712088f4f6b016eadcc721bab28/vs_BuildTools.exe ADD ${BUILD_TOOLS_SOURCE} vs_buildtools.exe # Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools workload, including recommended. ARG VS_INSTALL_PATH_WP="C:\BuildTools" @@ -175,7 +175,7 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" -ARG CUDNN_VERSION=9.2.1.18 +ARG CUDNN_VERSION=9.3.0.75 ENV CUDNN_VERSION ${CUDNN_VERSION} COPY --from=dependency_base /cudnn /cudnn RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." @@ -183,7 +183,7 @@ RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\." LABEL CUDNN_VERSION="${CUDNN_VERSION}" -ARG TENSORRT_VERSION=10.2.0.19 +ARG TENSORRT_VERSION=10.3.0.26 ENV TRT_VERSION ${TENSORRT_VERSION} COPY --from=dependency_base /TensorRT /TensorRT RUN setx PATH "c:\TensorRT\lib;%PATH%" diff --git a/README.md b/README.md index f9b1a483f3..da80cc3a2b 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ ___ ##### LATEST RELEASE You are currently on the `main` branch which tracks under-development progress towards the next release. -The current release is version [2.48.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.07 container release on NVIDIA GPU Cloud (NGC). +The current release is version [2.49.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.08 container release on NVIDIA GPU Cloud (NGC). Triton Inference Server is an open source inference serving software that streamlines AI inferencing. 
Triton enables teams to deploy any AI model from @@ -103,16 +103,16 @@ Inference Server with the ```bash # Step 1: Create the example model repository -git clone -b r24.07 https://github.com/triton-inference-server/server.git +git clone -b r24.08 https://github.com/triton-inference-server/server.git cd server/docs/examples ./fetch_models.sh # Step 2: Launch triton from the NGC Triton container -docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.07-py3 tritonserver --model-repository=/models +docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.08-py3 tritonserver --model-repository=/models # Step 3: Sending an Inference Request # In a separate console, launch the image_client example from the NGC Triton SDK container -docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.07-py3-sdk +docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.08-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg # Inference should return the following diff --git a/TRITON_VERSION b/TRITON_VERSION index 37433781ef..5db7ab5ba3 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.49.0dev \ No newline at end of file +2.50.0dev \ No newline at end of file diff --git a/build.py b/build.py index 6901c2e201..3195c50cbb 100755 --- a/build.py +++ b/build.py @@ -70,9 +70,9 @@ # incorrectly load the other version of the openvino libraries. # TRITON_VERSION_MAP = { - "2.49.0dev": ( - "24.08dev", # triton container - "24.07", # upstream container + "2.50.0dev": ( + "24.09dev", # triton container + "24.08", # upstream container "1.18.1", # ORT "2024.0.0", # ORT OpenVINO "2024.0.0", # Standalone OpenVINO @@ -216,6 +216,8 @@ def header(self, desc=None): self.comment("Exit script immediately if any command fails") if target_platform() == "windows": + self._file.write("$UseStructuredOutput = $false\n") + self.blankln() self._file.write("function ExitWithCode($exitcode) {\n") self._file.write(" $host.SetShouldExit($exitcode)\n") self._file.write(" exit $exitcode\n") @@ -660,7 +662,9 @@ def onnxruntime_cmake_args(images, library_paths): "onnxruntime", "TRITON_BUILD_ONNXRUNTIME_VERSION", None, - TRITON_VERSION_MAP[FLAGS.version][2], + os.getenv("TRITON_BUILD_ONNXRUNTIME_VERSION") + if os.getenv("TRITON_BUILD_ONNXRUNTIME_VERSION") + else TRITON_VERSION_MAP[FLAGS.version][2], ) ] diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml index 98151829c7..67ecba6c53 100644 --- a/deploy/aws/values.yaml +++ b/deploy/aws/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.08-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml index 340e19fb50..68aaf8f405 100644 --- a/deploy/fleetcommand/Chart.yaml +++ b/deploy/fleetcommand/Chart.yaml @@ -26,7 +26,7 @@ apiVersion: v1 # appVersion is the Triton version; update when changing release -appVersion: "2.48.0" +appVersion: "2.49.0" description: Triton Inference Server (Fleet Command) name: triton-inference-server # version is the Chart version; update when changing anything in the chart diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml index 7a556ef7df..f3036d5bee 100644 --- a/deploy/fleetcommand/values.yaml +++ b/deploy/fleetcommand/values.yaml @@ -27,7 
+27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.08-py3 pullPolicy: IfNotPresent numGpus: 1 serverCommand: tritonserver @@ -47,13 +47,13 @@ image: # # To set model control mode, uncomment and configure below # TODO: Fix the following url, it is invalid - # See https://github.com/triton-inference-server/server/blob/r24.07/docs/model_management.md + # See https://github.com/triton-inference-server/server/blob/r24.08/docs/model_management.md # for more details #- --model-control-mode=explicit|poll|none # # Additional server args # - # see https://github.com/triton-inference-server/server/blob/r24.07/README.md + # see https://github.com/triton-inference-server/server/blob/r24.08/README.md # for more details service: diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml index 937acc6b80..c25bcf58ce 100644 --- a/deploy/gcp/values.yaml +++ b/deploy/gcp/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.08-py3 pullPolicy: IfNotPresent modelRepositoryPath: gs://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml index 21e5a34077..4b896a1ac7 100644 --- a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml +++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml @@ -33,7 +33,7 @@ metadata: namespace: default spec: containers: - - image: nvcr.io/nvidia/tritonserver:24.07-py3-sdk + - image: nvcr.io/nvidia/tritonserver:24.08-py3-sdk imagePullPolicy: Always name: nv-triton-client securityContext: diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh index e4fe8fe04f..cc5fa998b4 100755 --- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh +++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh @@ -27,9 +27,9 @@ export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/') export APP_NAME=tritonserver -export MAJOR_VERSION=2.48 -export MINOR_VERSION=2.48.0 -export NGC_VERSION=24.07-py3 +export MAJOR_VERSION=2.49 +export MINOR_VERSION=2.49.0 +export NGC_VERSION=24.08-py3 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml index e2b00ad12b..41e2e8cdb9 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
apiVersion: v1 -appVersion: "2.48" +appVersion: "2.49" description: Triton Inference Server name: triton-inference-server -version: 2.48.0 +version: 2.49.0 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml index 3d460f8aa0..7f8a96608f 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml @@ -31,14 +31,14 @@ maxReplicaCount: 3 tritonProtocol: HTTP # HPA GPU utilization autoscaling target HPATargetAverageValue: 85 -modelRepositoryPath: gs://triton_sample_models/24.07 -publishedVersion: '2.48.0' +modelRepositoryPath: gs://triton_sample_models/24.08 +publishedVersion: '2.49.0' gcpMarketplace: true image: registry: gcr.io repository: nvidia-ngc-public/tritonserver - tag: 24.07-py3 + tag: 24.08-py3 pullPolicy: IfNotPresent # modify the model repository here to match your GCP storage bucket numGpus: 1 diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml index 0ecf429a44..356b3cce0f 100644 --- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.48.0' + publishedVersion: '2.49.0' publishedVersionMetadata: releaseNote: >- Initial release. diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml index c82f73e47f..076f62e05b 100644 --- a/deploy/gke-marketplace-app/server-deployer/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.48.0' + publishedVersion: '2.49.0' publishedVersionMetadata: releaseNote: >- Initial release. @@ -89,7 +89,7 @@ properties: modelRepositoryPath: type: string title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. - default: gs://triton_sample_models/24.07 + default: gs://triton_sample_models/24.08 image.ldPreloadPath: type: string title: Leave this empty by default. Triton allows users to create custom layers for backend such as TensorRT plugin or Tensorflow custom ops, the compiled shared library must be provided via LD_PRELOAD environment variable. 
diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md index 22343d966d..aa8fa2a399 100644 --- a/deploy/gke-marketplace-app/trt-engine/README.md +++ b/deploy/gke-marketplace-app/trt-engine/README.md @@ -33,7 +33,7 @@ ``` docker run --gpus all -it --network host \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ - -v ~:/scripts nvcr.io/nvidia/tensorrt:24.07-py3 + -v ~:/scripts nvcr.io/nvidia/tensorrt:24.08-py3 pip install onnx six torch tf2onnx tensorflow @@ -57,7 +57,7 @@ mkdir -p engines python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh -gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.07/bert/1/model.plan +gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.08/bert/1/model.plan ``` -For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.07/` should be updated accordingly with the correct version. +For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.08/` should be updated accordingly with the correct version. diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml index 9366a0710c..f0f28b68e1 100644 --- a/deploy/k8s-onprem/values.yaml +++ b/deploy/k8s-onprem/values.yaml @@ -29,7 +29,7 @@ tags: loadBalancing: true image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.08-py3 pullPolicy: IfNotPresent modelRepositoryServer: < Replace with the IP Address of your file server > modelRepositoryPath: /srv/models diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml index 3a85e7901b..bf83490db4 100644 --- a/deploy/oci/values.yaml +++ b/deploy/oci/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.08-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository numGpus: 1 diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index 0c1cc08a41..43160f43bb 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -173,7 +173,7 @@ $ ./build.py ... --repo-tag=common: --repo-tag=core:` will default to the branch name. For example, if you are building on the -r24.07 branch, `` will default to r24.07. If you are +r24.08 branch, `` will default to r24.08. If you are building on any other branch (including the *main* branch) then `` will default to "main". Therefore, you typically do not need to provide `` at all (nor the preceding @@ -334,8 +334,8 @@ python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild If you are building on *main* branch then `` will default to "main". If you are building on a release branch then `` will default to the branch name. For example, if you -are building on the r24.07 branch, `` will default to -r24.07. Therefore, you typically do not need to provide `` will default to +r24.08. Therefore, you typically do not need to provide `` at all (nor the preceding colon). 
You can use a different `` for a component to instead use the corresponding branch/tag in the build. For example, if you have a branch called diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md index ca3aafdbd0..563061c317 100644 --- a/docs/customization_guide/compose.md +++ b/docs/customization_guide/compose.md @@ -46,8 +46,8 @@ The `compose.py` script can be found in the Simply clone the repository and run `compose.py` to create a custom container. Note: Created container version will depend on the branch that was cloned. For example branch - [r24.07](https://github.com/triton-inference-server/server/tree/r24.07) -should be used to create a image based on the NGC 24.07 Triton release. + [r24.08](https://github.com/triton-inference-server/server/tree/r24.08) +should be used to create a image based on the NGC 24.08 Triton release. `compose.py` provides `--backend`, `--repoagent` options that allow you to specify which backends and repository agents to include in the custom image. @@ -79,20 +79,20 @@ For example, running ``` python3 compose.py --backend pytorch --repoagent checksum ``` -on branch [r24.07](https://github.com/triton-inference-server/server/tree/r24.07) pulls: -- `min` container `nvcr.io/nvidia/tritonserver:24.07-py3-min` -- `full` container `nvcr.io/nvidia/tritonserver:24.07-py3` +on branch [r24.08](https://github.com/triton-inference-server/server/tree/r24.08) pulls: +- `min` container `nvcr.io/nvidia/tritonserver:24.08-py3-min` +- `full` container `nvcr.io/nvidia/tritonserver:24.08-py3` Alternatively, users can specify the version of Triton container to pull from any branch by either: 1. Adding flag `--container-version ` to branch ``` -python3 compose.py --backend pytorch --repoagent checksum --container-version 24.07 +python3 compose.py --backend pytorch --repoagent checksum --container-version 24.08 ``` 2. Specifying `--image min, --image full,`. The user is responsible for specifying compatible `min` and `full` containers. ``` -python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.07-py3-min --image full,nvcr.io/nvidia/tritonserver:24.07-py3 +python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.08-py3-min --image full,nvcr.io/nvidia/tritonserver:24.08-py3 ``` Method 1 and 2 will result in the same composed container. Furthermore, `--image` flag overrides the `--container-version` flag when both are specified. @@ -103,8 +103,8 @@ Note: 2. vLLM and TensorRT-LLM backends are currently not supported backends for `compose.py`. If you want to build additional backends on top of these backends, it would be better to [build it yourself](#build-it-yourself) by using -`nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3` or -`nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` as a `min` container. +`nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3` or +`nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3` as a `min` container. ### CPU-only container composition diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md index e066d31493..898267e34f 100644 --- a/docs/customization_guide/test.md +++ b/docs/customization_guide/test.md @@ -49,7 +49,7 @@ $ ./gen_qa_custom_ops ``` This will create multiple model repositories in /tmp/\/qa_* -(for example /tmp/24.07/qa_model_repository). The TensorRT models +(for example /tmp/24.08/qa_model_repository). 
The TensorRT models will be created for the GPU on the system that CUDA considers device 0 (zero). If you have multiple GPUs on your system see the documentation in the scripts for how to target a specific GPU. diff --git a/docs/generate_docs.py b/docs/generate_docs.py index acf0afc981..3cb9de4bc6 100755 --- a/docs/generate_docs.py +++ b/docs/generate_docs.py @@ -43,11 +43,11 @@ """ TODO: Needs to handle cross-branch linkage. -For example, server/docs/user_guide/architecture.md on branch 24.07 links to +For example, server/docs/user_guide/architecture.md on branch 24.08 links to server/docs/user_guide/model_analyzer.md on main branch. In this case, the hyperlink of model_analyzer.md should be a URL instead of relative path. -Another example can be server/docs/user_guide/model_analyzer.md on branch 24.07 +Another example can be server/docs/user_guide/model_analyzer.md on branch 24.08 links to a file in server repo with relative path. Currently all URLs are hardcoded to main branch. We need to make sure that the URL actually points to the correct branch. We also need to handle cases like deprecated or removed files from diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md index 136edd180f..6fa3cee3dc 100644 --- a/docs/user_guide/custom_operations.md +++ b/docs/user_guide/custom_operations.md @@ -64,7 +64,7 @@ simple way to ensure you are using the correct version of TensorRT is to use the [NGC TensorRT container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the TensorRT +the 24.08 version of Triton, use the 24.08 version of the TensorRT container. ## TensorFlow @@ -123,7 +123,7 @@ simple way to ensure you are using the correct version of TensorFlow is to use the [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the TensorFlow +the 24.08 version of Triton, use the 24.08 version of the TensorFlow container. ## PyTorch @@ -167,7 +167,7 @@ simple way to ensure you are using the correct version of PyTorch is to use the [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the PyTorch +the 24.08 version of Triton, use the 24.08 version of the PyTorch container. ## ONNX diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index 446534da99..70e76cd5ef 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -235,7 +235,7 @@ with a `tritonserver` binary. 
```bash # Start server container -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.07-py3 +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.08-py3 # Start serving your models tritonserver --model-repository=/mnt/models @@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u ```bash # Start the SDK container interactively -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.07-py3-sdk +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.08-py3-sdk # Benchmark model being served from step 3 perf_analyzer -m densenet_onnx --concurrency-range 1:4 diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh index f6d4b7b445..324ee5ba1f 100755 --- a/qa/L0_backend_python/test.sh +++ b/qa/L0_backend_python/test.sh @@ -39,18 +39,18 @@ fi # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. export TEST_WINDOWS=0 -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then export DATADIR=${DATADIR:="/c/data/inferenceserver/${REPO_VERSION}"} export TRITON_DIR=${TRITON_DIR:=c:/tritonserver} # This will run in WSL, but Triton will run in windows, so environment # variables meant for loaded models must be exported using WSLENV. # The /w flag indicates the value should only be included when invoking # Win32 from WSL. - export WSLENV=TRITON_DIR/w + export WSLENV=TRITON_DIR export SERVER=${SERVER:=c:/tritonserver/bin/tritonserver.exe} export BACKEND_DIR=${BACKEND_DIR:=c:/tritonserver/backends} export MODELDIR=${MODELDIR:=c:/} - TEST_WINDOWS=1 + export TEST_WINDOWS=1 else export DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} export TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} diff --git a/qa/L0_batcher/test.sh b/qa/L0_batcher/test.sh index 827751eb40..7043aab2a5 100755 --- a/qa/L0_batcher/test.sh +++ b/qa/L0_batcher/test.sh @@ -79,7 +79,7 @@ TF_VERSION=${TF_VERSION:=2} # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} @@ -601,7 +601,7 @@ done TEST_CASE=test_multi_batch_preserve_ordering # Skip test for Windows. Trace file concats at 8192 chars on Windows. -if [[ "$(< /proc/sys/kernel/osrelease)" != *microsoft* ]]; then +if [[ ! -v WSL_DISTRO_NAME ]] || [[ ! -v MSYSTEM ]]; then rm -fr ./custom_models && mkdir ./custom_models && \ cp -r ../custom_models/custom_zero_1_float32 ./custom_models/. && \ mkdir -p ./custom_models/custom_zero_1_float32/1 diff --git a/qa/L0_grpc/test.sh b/qa/L0_grpc/test.sh index 50cf5a6f91..93d22e75be 100755 --- a/qa/L0_grpc/test.sh +++ b/qa/L0_grpc/test.sh @@ -48,7 +48,7 @@ NGINX_CONF="./nginx.conf" # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. 
-if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then SDKDIR=${SDKDIR:=C:/sdk} MODELDIR=${MODELDIR:=C:/models} CLIENT_PLUGIN_MODELDIR=${MODELDIR:=C:/client_plugin_models} diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh index 321c398995..81ae4c254c 100755 --- a/qa/L0_http/test.sh +++ b/qa/L0_http/test.sh @@ -49,7 +49,7 @@ NGINX_CONF="./nginx.conf" # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then SDKDIR=${SDKDIR:=C:/sdk} MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} diff --git a/qa/L0_infer/test.sh b/qa/L0_infer/test.sh index dba4d7dbcc..36f63053e3 100755 --- a/qa/L0_infer/test.sh +++ b/qa/L0_infer/test.sh @@ -87,7 +87,7 @@ DEFAULT_SHM_SIZE_BYTES=$((1024*1024*$DEFAULT_SHM_SIZE_MB)) # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh index d91b433966..23ee387b55 100755 --- a/qa/L0_sequence_batcher/test.sh +++ b/qa/L0_sequence_batcher/test.sh @@ -93,7 +93,7 @@ TF_VERSION=${TF_VERSION:=2} # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. WINDOWS=0 -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} diff --git a/qa/L0_trt_plugin/test.sh b/qa/L0_trt_plugin/test.sh index 075dd54eab..a9d04331f0 100755 --- a/qa/L0_trt_plugin/test.sh +++ b/qa/L0_trt_plugin/test.sh @@ -47,7 +47,7 @@ PLUGIN_TEST=trt_plugin_test.py # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} MODELDIR=${MODELDIR:=C:/models} CUSTOMPLUGIN=${CUSTOMPLUGIN:=$MODELDIR/HardmaxPlugin.dll} @@ -135,7 +135,7 @@ SERVER_LD_PRELOAD=$CUSTOMPLUGIN SERVER_ARGS=$SERVER_ARGS_BASE SERVER_LOG="./inference_server_$LOG_IDX.log" -if [[ "$(< /proc/sys/kernel/osrelease)" != *microsoft* ]]; then +if [[ ! -v WSL_DISTRO_NAME ]] || [[ ! 
-v MSYSTEM ]]; then run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 99a6175a08..173d8c1efc 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -34,7 +34,7 @@ # Make all generated files accessible outside of container umask 0000 # Set the version of the models -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.08} # Set the CUDA device to use CUDA_DEVICE=${RUNNER_ID:=0} # Set TensorRT image diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops index 4ae0f006b3..c72ea49ca1 100755 --- a/qa/common/gen_qa_custom_ops +++ b/qa/common/gen_qa_custom_ops @@ -37,7 +37,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.08} NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION} TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$NVIDIA_UPSTREAM_VERSION-tf2-py3} PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3} diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index cab497aa86..be9383217b 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -48,7 +48,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.08} # ONNX. Use ONNX_OPSET 0 to use the default for ONNX version ONNX_VERSION=1.13.0 diff --git a/qa/common/util.sh b/qa/common/util.sh index 3297dd2914..3874916573 100755 --- a/qa/common/util.sh +++ b/qa/common/util.sh @@ -257,7 +257,7 @@ function run_server_nowait () { return fi - if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then + if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then # LD_PRELOAD not yet supported on windows if [ -z "$SERVER_LD_PRELOAD" ]; then echo "=== Running $SERVER $SERVER_ARGS" @@ -329,7 +329,7 @@ function kill_server () { # causes the entire WSL shell to just exit. So instead we must use # taskkill.exe which can only forcefully kill tritonserver which # means that it does not gracefully exit. - if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then + if [[ -v WSL_DISTRO_NAME ]]; then # Disable -x as it makes output below hard to read oldstate="$(set +o)"; [[ -o errexit ]] && oldstate="$oldstate; set -e" set +x @@ -353,6 +353,8 @@ function kill_server () { fi set +vx; eval "$oldstate" + elif [[ -v MSYSTEM ]] ; then + taskkill //F //IM tritonserver.exe else # Non-windows... kill $SERVER_PID @@ -512,17 +514,23 @@ remove_array_outliers() { function setup_virtualenv() { # Create and activate virtual environment - virtualenv --system-site-packages venv - source venv/bin/activate - pip install pytest + if [[ -v MSYSTEM ]]; then + pip3 install pytest + else + virtualenv --system-site-packages venv + source venv/bin/activate + pip install pytest + fi if [[ ${TEST_WINDOWS} == 1 ]]; then - pip3 install "numpy<2" tritonclient[all] + pip3 install "numpy<2" tritonclient[all] fi } function deactivate_virtualenv() { # Deactivate virtual environment and clean up + if [[ ! 
-v MSYSTEM ]]; then deactivate rm -fr venv + fi } From 96144e0a0c51a09122229c885ac84b2756ba0248 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Tue, 27 Aug 2024 12:28:47 -0700 Subject: [PATCH 19/44] docs: Add python backend to windows build command (#7572) --- docs/customization_guide/build.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index 43160f43bb..f0f3bd99e2 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -328,7 +328,7 @@ and so you must enable them explicitly. The following build.py invocation builds all features and backends available on windows. ```bash -python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild --no-container-pull --image=base,win10-py3-min --enable-logging --enable-stats --enable-tracing --enable-gpu --endpoint=grpc --endpoint=http --repo-tag=common: --repo-tag=core: --repo-tag=backend: --repo-tag=thirdparty: --backend=ensemble --backend=tensorrt: --backend=onnxruntime: --backend=openvino: +python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild --no-container-pull --image=base,win10-py3-min --enable-logging --enable-stats --enable-tracing --enable-gpu --endpoint=grpc --endpoint=http --repo-tag=common: --repo-tag=core: --repo-tag=backend: --repo-tag=thirdparty: --backend=ensemble --backend=tensorrt: --backend=onnxruntime: --backend=openvino: --backend=python: ``` If you are building on *main* branch then `` will From 4ef45da2e468130ba774000416a285082d5d88f4 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Tue, 27 Aug 2024 12:30:23 -0700 Subject: [PATCH 20/44] docs: Triton TRT-LLM user guide (#7529) --- docs/getting_started/trtllm_user_guide.md | 118 ++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 docs/getting_started/trtllm_user_guide.md diff --git a/docs/getting_started/trtllm_user_guide.md b/docs/getting_started/trtllm_user_guide.md new file mode 100644 index 0000000000..7f128e98c7 --- /dev/null +++ b/docs/getting_started/trtllm_user_guide.md @@ -0,0 +1,118 @@ + + +# TensorRT-LLM User Guide + +## What is TensorRT-LLM + +[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) +(TRT-LLM) is an open-source library designed to accelerate and optimize the +inference performance of large language models (LLMs) on NVIDIA GPUs. TRT-LLM +offers users an easy-to-use Python API to build TensorRT engines for LLMs, +incorporating state-of-the-art optimizations to ensure efficient inference on +NVIDIA GPUs. + +## How to run TRT-LLM models with Triton Server via TensorRT-LLM backend + +The +[TensorRT-LLM Backend](https://github.com/triton-inference-server/tensorrtllm_backend) +lets you serve TensorRT-LLM models with Triton Inference Server. Check out the +[Getting Started](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#getting-started) +section in the TensorRT-LLM Backend repo to learn how to utlize the +[NGC Triton TRT-LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) +to prepare engines for your LLM models and serve them with Triton. + +## How to use your custom TRT-LLM model + +All the supported models can be found in the +[examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) folder in +the TRT-LLM repo. Follow the examples to convert your models to TensorRT +engines. 
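+
+As a rough sketch only (the model name, paths, and flags below are illustrative
+placeholders; the exact conversion script and build options depend on the model
+and the TRT-LLM version, so follow the README in the corresponding example
+folder), the convert-and-build flow generally looks like this:
+
+```bash
+# Convert a Hugging Face checkpoint into the TRT-LLM checkpoint format.
+# Each model's example folder ships its own convert_checkpoint.py.
+python3 examples/llama/convert_checkpoint.py \
+    --model_dir ./llama-hf-model \
+    --output_dir ./trtllm_checkpoint \
+    --dtype float16
+
+# Build the TensorRT engine(s) from the converted checkpoint.
+trtllm-build \
+    --checkpoint_dir ./trtllm_checkpoint \
+    --output_dir ./engines \
+    --gemm_plugin float16
+```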
+
+
+After the engine is built, [prepare the model repository](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#prepare-the-model-repository)
+for Triton, and
+[modify the model configuration](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#modify-the-model-configuration).
+
+Only the *mandatory parameters* need to be set in the model config file. Feel free
+to modify the optional parameters as needed. To learn more about the
+parameters, model inputs, and outputs, see the
+[model config documentation](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/model_config.md).
+
+## Advanced Configuration Options and Deployment Strategies
+
+Explore advanced configuration options and deployment strategies to optimize
+and run Triton with your TRT-LLM models effectively:
+
+- [Model Deployment](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#model-deployment): Techniques for efficiently deploying and managing your models in various environments.
+- [Multi-Instance GPU (MIG) Support](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#mig-support): Run Triton and TRT-LLM models with MIG to optimize GPU resource management.
+- [Scheduling](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#scheduling): Configure scheduling policies to control how requests are managed and executed.
+- [Key-Value Cache](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#key-value-cache): Utilize KV cache and KV cache reuse to optimize memory usage and improve performance.
+- [Decoding](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#decoding): Advanced methods for generating text, including top-k, top-p, top-k top-p, beam search, Medusa, and speculative decoding.
+- [Chunked Context](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#chunked-context): Splitting the context into several chunks and batching them during the generation phase to increase overall throughput.
+- [Quantization](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#quantization): Apply quantization techniques to reduce model size and enhance inference speed.
+- [LoRA (Low-Rank Adaptation)](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#lora): Use LoRA for efficient model fine-tuning and adaptation.
+
+## Tutorials
+
+Make sure to check out the
+[tutorials](https://github.com/triton-inference-server/tutorials) repo to see
+more guides on serving popular LLM models with Triton Server and TensorRT-LLM,
+as well as deploying them on Kubernetes.
+
+## Benchmark
+
+[GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf)
+is a command line tool for measuring the throughput and latency of LLMs served
+by Triton Inference Server. Check out the
+[Quick Start](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf#quick-start)
+to learn how to use GenAI-Perf to benchmark your LLM models.
+
+## Performance Best Practices
+
+Check out the
+[Performance Best Practices guide](https://nvidia.github.io/TensorRT-LLM/performance/perf-best-practices.html)
+to learn how to optimize your TensorRT-LLM models for better performance.
+ +## Metrics + +Triton Server provides +[metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) +indicating GPU and request statistics. +See the +[Triton Metrics](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#triton-metrics) +section in the TensorRT-LLM Backend repo to learn how to query the Triton +metrics endpoint to obtain TRT-LLM statistics. + +## Ask questions or report issues + +Can't find what you're looking for, or have a question or issue? Feel free to +ask questions or report issues in the GitHub issues page: + +- [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/issues) +- [TensorRT-LLM Backend](https://github.com/triton-inference-server/tensorrtllm_backend/issues) +- [Triton Inference Server](https://github.com/triton-inference-server/server/issues) From 9ec820b0f0b8b063589ecfd140cdb8c5ea45a035 Mon Sep 17 00:00:00 2001 From: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:07:58 -0700 Subject: [PATCH 21/44] Build: Updating to allow passing DOCKER_GPU_ARGS at model generation (#7566) --- qa/common/gen_jetson_trt_models | 4 +--- qa/common/gen_qa_custom_ops | 2 +- qa/common/gen_qa_model_repository | 9 ++++++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 173d8c1efc..892b8dd383 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -48,9 +48,7 @@ HOST_MODEL_DIR=${HOST_MODEL_DIR:="${HOST_BUILD_DIR}/${TRITON_VERSION}"} HOST_SOURCE_DIR=$HOST_BUILD_DIR/gen_srcdir # Set CI specific parameters -DOCKER_GPU_ARGS="${DOCKER_GPU_ARGS:="--gpus device=$CUDA_DEVICE"}" -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) - +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} # Set model output directories diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops index c72ea49ca1..8864da69f5 100755 --- a/qa/common/gen_qa_custom_ops +++ b/qa/common/gen_qa_custom_ops @@ -44,7 +44,7 @@ PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-p CUDA_DEVICE=${NV_GPU:=0} -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) || DOCKER_GPU_ARGS="--gpus device=$CUDA_DEVICE" +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} ### HOST_BUILD_DIR=${HOST_BUILD_DIR:=/tmp} diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index be9383217b..900b8fdc03 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -63,7 +63,8 @@ TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$TRITON_VERSION-t TENSORRT_IMAGE=${TENSORRT_IMAGE:=nvcr.io/nvidia/tensorrt:$TRITON_VERSION-py3} CUDA_DEVICE=${NV_GPU:=0} -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) || DOCKER_GPU_ARGS="--gpus device=$CUDA_DEVICE" +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} +MODEL_TYPE=${MODEL_TYPE:-""} ### HOST_BUILD_DIR=${HOST_BUILD_DIR:=/tmp} @@ -360,8 +361,10 @@ python3 $SRCDIR/gen_qa_implicit_models.py --libtorch --variable --models_dir=$VA chmod -R 777 $VARIMPLICITSEQDESTDIR python3 $SRCDIR/gen_qa_dyna_sequence_models.py --libtorch --models_dir=$DYNASEQDESTDIR chmod -R 777 $DYNASEQDESTDIR 
-python3 $SRCDIR/gen_qa_torchtrt_models.py --models_dir=$TORCHTRTDESTDIR -chmod -R 777 $TORCHTRTDESTDIR +if [ -z "$MODEL_TYPE" ] || [ "$MODEL_TYPE" != "igpu" ]; then + python3 $SRCDIR/gen_qa_torchtrt_models.py --models_dir=$TORCHTRTDESTDIR + chmod -R 777 $TORCHTRTDESTDIR +fi python3 $SRCDIR/gen_qa_ragged_models.py --libtorch --models_dir=$RAGGEDDESTDIR chmod -R 777 $RAGGEDDESTDIR # Export torchvision image models to ONNX From f6021f7c48f58f7e299027e7479fb39cabfba36d Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Fri, 30 Aug 2024 18:23:03 -0500 Subject: [PATCH 22/44] feat: Python Deployment of Triton Inference Server (#7501) Co-authored-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Co-authored-by: Ryan McCormick Co-authored-by: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> --- Dockerfile.QA | 4 + docs/customization_guide/tritonfrontend.md | 113 ++++++++ qa/L0_python_api/test.sh | 11 +- qa/L0_python_api/test_kserve.py | 252 ++++++++++++++++++ .../delayed_identity/1/model.py | 51 ++++ .../delayed_identity/config.pbtxt | 52 ++++ .../test_model_repository/identity/1/model.py | 49 ++++ .../identity/config.pbtxt | 44 +++ qa/L0_python_api/testing_utils.py | 95 +++++++ src/CMakeLists.txt | 6 +- src/common.h | 60 +++++ src/grpc/grpc_server.cc | 95 +++++++ src/grpc/grpc_server.h | 18 ++ src/grpc/infer_handler.cc | 20 +- src/http_server.cc | 38 ++- src/http_server.h | 8 + src/python/CMakeLists.txt | 78 ++++++ src/python/build_wheel.py | 135 ++++++++++ src/python/examples/example.py | 85 ++++++ .../1/model.savedmodel/saved_model.pb | Bin 0 -> 531 bytes .../identity/config.pbtxt | 44 +++ src/python/setup.py | 112 ++++++++ src/python/tritonfrontend/CMakeLists.txt | 181 +++++++++++++ src/python/tritonfrontend/__init__.py | 33 +++ src/python/tritonfrontend/__init__.pyi | 1 + src/python/tritonfrontend/_api/__init__.py | 25 ++ .../tritonfrontend/_api/_error_mapping.py | 48 ++++ src/python/tritonfrontend/_api/_kservegrpc.py | 137 ++++++++++ .../tritonfrontend/_api/_kservegrpc.pyi | 74 +++++ src/python/tritonfrontend/_api/_kservehttp.py | 97 +++++++ .../tritonfrontend/_api/_kservehttp.pyi | 49 ++++ src/python/tritonfrontend/_c/__init__.py | 27 ++ src/python/tritonfrontend/_c/__init__.pyi | 27 ++ src/python/tritonfrontend/_c/tritonfrontend.h | 139 ++++++++++ .../_c/tritonfrontend_bindings.pyi | 44 +++ .../_c/tritonfrontend_pybind.cc | 76 ++++++ src/python/tritonfrontend/py.typed | 0 37 files changed, 2315 insertions(+), 13 deletions(-) create mode 100644 docs/customization_guide/tritonfrontend.md create mode 100644 qa/L0_python_api/test_kserve.py create mode 100644 qa/L0_python_api/test_model_repository/delayed_identity/1/model.py create mode 100644 qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt create mode 100644 qa/L0_python_api/test_model_repository/identity/1/model.py create mode 100644 qa/L0_python_api/test_model_repository/identity/config.pbtxt create mode 100644 qa/L0_python_api/testing_utils.py create mode 100644 src/python/CMakeLists.txt create mode 100755 src/python/build_wheel.py create mode 100644 src/python/examples/example.py create mode 100755 src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb create mode 100644 src/python/examples/example_model_repository/identity/config.pbtxt create mode 100755 src/python/setup.py create mode 100644 src/python/tritonfrontend/CMakeLists.txt create mode 100644 src/python/tritonfrontend/__init__.py create mode 100644 
src/python/tritonfrontend/__init__.pyi create mode 100644 src/python/tritonfrontend/_api/__init__.py create mode 100644 src/python/tritonfrontend/_api/_error_mapping.py create mode 100644 src/python/tritonfrontend/_api/_kservegrpc.py create mode 100644 src/python/tritonfrontend/_api/_kservegrpc.pyi create mode 100644 src/python/tritonfrontend/_api/_kservehttp.py create mode 100644 src/python/tritonfrontend/_api/_kservehttp.pyi create mode 100644 src/python/tritonfrontend/_c/__init__.py create mode 100644 src/python/tritonfrontend/_c/__init__.pyi create mode 100644 src/python/tritonfrontend/_c/tritonfrontend.h create mode 100644 src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi create mode 100644 src/python/tritonfrontend/_c/tritonfrontend_pybind.cc create mode 100644 src/python/tritonfrontend/py.typed diff --git a/Dockerfile.QA b/Dockerfile.QA index b381abfaaf..68ab519b41 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -390,6 +390,10 @@ RUN rm -fr qa/L0_copyrights qa/L0_build_variants && \ RUN find qa/pkgs/ -maxdepth 1 -type f -name \ "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade {}[all] +# Install Triton Frontend Python API +RUN find qa/pkgs/ -type f -name \ + "tritonfrontend-*.whl" | xargs -I {} pip3 install --upgrade {}[all] + ENV LD_LIBRARY_PATH /opt/tritonserver/qa/clients:${LD_LIBRARY_PATH} # DLIS-3631: Needed to run Perf Analyzer CI tests correctly diff --git a/docs/customization_guide/tritonfrontend.md b/docs/customization_guide/tritonfrontend.md new file mode 100644 index 0000000000..caaac9308d --- /dev/null +++ b/docs/customization_guide/tritonfrontend.md @@ -0,0 +1,113 @@ +### Triton Server (tritonfrontend) Bindings + +The `tritonfrontend` python package is a set of bindings to Triton's existing frontends implemented in C++. Currently, `tritonfrontend` supports starting up `KServeHttp` and `KServeGrpc` frontends. These bindings used in-combination with Triton's Python In-Process API ([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library) extend the ability to use Triton's full feature set with a couple of lines of Python. + +Let us walk through a simple example: +1. First we need to load the desired models and start the server with `tritonserver`. +```python +import tritonserver + +# Constructing path to Model Repository +model_path = f"server/src/python/examples/example_model_repository" + +server_options = tritonserver.Options( + server_id="ExampleServer", + model_repository=model_path, + log_error=True, + log_warn=True, + log_info=True, +) +server = tritonserver.Server(server_options).start(wait_until_ready=True) +``` +Note: `model_path` may need to be edited depending on your setup. + + +2. Now, to start up the respective services with `tritonfrontend` +```python +from tritonfrontend import KServeHttp, KServeGrpc +http_options = KServeHttp.Options(thread_count=5) +http_service = KServeHttp.Server(server, http_options) +http_service.start() + +# Default options (if none provided) +grpc_service = KServeGrpc.Server(server) +grpc_service.start() +``` + +3. Finally, with running services, we can use `tritonclient` or simple `curl` commands to send requests and receive responses from the frontends. 
+ +```python +import tritonclient.http as httpclient +import numpy as np # Use version numpy < 2 +model_name = "identity" # output == input +url = "localhost:8000" + +# Create a Triton client +client = httpclient.InferenceServerClient(url=url) + +# Prepare input data +input_data = np.array([["Roger Roger"]], dtype=object) + +# Create input and output objects +inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + +# Set the data for the input tensor +inputs[0].set_data_from_numpy(input_data) + +results = client.infer(model_name, inputs=inputs) + +# Get the output data +output_data = results.as_numpy("OUTPUT0") + +# Print results +print("[INFERENCE RESULTS]") +print("Output data:", output_data) + +# Stop respective services and server. +http_service.stop() +grpc_service.stop() +server.stop() +``` + +--- + +Additionally, `tritonfrontend` provides context manager support as well. So steps 2-3, could also be achieved through: +```python +from tritonfrontend import KServeHttp +import tritonclient.http as httpclient +import numpy as np # Use version numpy < 2 + +with KServeHttp.Server(server) as http_service: + # The identity model returns an exact duplicate of the input data as output + model_name = "identity" + url = "localhost:8000" + # Create a Triton client + with httpclient.InferenceServerClient(url=url) as client: + # Prepare input data + input_data = np.array(["Roger Roger"], dtype=object) + # Create input and output objects + inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + # Perform inference + results = client.infer(model_name, inputs=inputs) + # Get the output data + output_data = results.as_numpy("OUTPUT0") + # Print results + print("[INFERENCE RESULTS]") + print("Output data:", output_data) + +server.stop() +``` +With this workflow, you can avoid having to stop each service after client requests have terminated. + + +## Known Issues +- The following features are not currently supported when launching the Triton frontend services through the python bindings: + - [Tracing](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/trace.md) + - [Shared Memory](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_shared_memory.md) + - [Metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) + - [Restricted Protocols](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#limit-endpoint-access-beta) + - VertexAI + - Sagemaker +- After a running server has been stopped, if the client sends an inference request, a Segmentation Fault will occur. \ No newline at end of file diff --git a/qa/L0_python_api/test.sh b/qa/L0_python_api/test.sh index 6dc7206fe3..0d87d16771 100755 --- a/qa/L0_python_api/test.sh +++ b/qa/L0_python_api/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -49,6 +49,15 @@ if [ $? -ne 0 ]; then RET=1 fi + +FRONTEND_TEST_LOG="./python_kserve.log" +python -m pytest --junitxml=test_kserve.xml test_kserve.py > $FRONTEND_TEST_LOG 2>&1 +if [ $? 
-ne 0 ]; then + cat $FRONTEND_TEST_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + set -e if [ $RET -eq 0 ]; then diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py new file mode 100644 index 0000000000..ab77783d0c --- /dev/null +++ b/qa/L0_python_api/test_kserve.py @@ -0,0 +1,252 @@ +import time +from functools import partial + +import numpy as np +import pytest +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +import tritonserver +from testing_utils import ( + send_and_test_inference_identity, + setup_client, + setup_server, + setup_service, + teardown_client, + teardown_server, + teardown_service, +) +from tritonclient.utils import InferenceServerException +from tritonfrontend import KServeGrpc, KServeHttp + + +class TestHttpOptions: + def test_correct_http_parameters(self): + KServeHttp.Options( + address="0.0.0.1", port=8080, reuse_port=True, thread_count=16 + ) + + def test_wrong_http_parameters(self): + # Out of range + with pytest.raises(Exception): + KServeHttp.Options(port=-15) + with pytest.raises(Exception): + KServeHttp.Options(thread_count=-5) + + # Wrong data type + with pytest.raises(Exception): + KServeHttp.Options(header_forward_pattern=10) + + +class TestGrpcOptions: + def test_correct_grpc_parameters(self): + KServeGrpc.Options( + infer_compression_level=KServeGrpc.Grpc_compression_level.HIGH, + reuse_port=True, + infer_allocation_pool_size=12, + http2_max_pings_without_data=10, + ) + + def test_wrong_grpc_parameters(self): + # Out of Range + with pytest.raises(Exception): + KServeGrpc.Options(port=-5) + with pytest.raises(Exception): + KServeGrpc.Options(keepalive_timeout_ms=-20_000) + + # Wrong data type + with pytest.raises(Exception): + KServeGrpc.Options(infer_allocation_pool_size="big pool") + with pytest.raises(Exception): + KServeGrpc.Options(server_key=10) + + +HTTP_ARGS = (KServeHttp, httpclient, "localhost:8000") # Default HTTP args +GRPC_ARGS = (KServeGrpc, grpcclient, "localhost:8001") # Default GRPC args + + +class TestKServe: + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) + def test_server_ready(self, frontend, client_type, url): + server = setup_server() + service = setup_service(server, frontend) + client = setup_client(client_type, url=url) + + assert client.is_server_ready() + + teardown_client(client) + teardown_service(service) + teardown_server(server) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_service_double_start(self, frontend): + server = setup_server() + # setup_service() performs service.start() + service = setup_service(server, frontend) + + with pytest.raises( + tritonserver.AlreadyExistsError, match="server is already running." + ): + service.start() + + teardown_server(server) + teardown_service(service) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_invalid_options(self, frontend): + server = setup_server() + # Current flow is KServeHttp.Options or KServeGrpc.Options have to be + # provided to ensure type and range validation occurs. + with pytest.raises( + tritonserver.InvalidArgumentError, + match="Incorrect type for options. 
options argument must be of type", + ): + frontend.Server(server, {"port": 8001}) + + teardown_server(server) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_server_service_order(self, frontend): + server = setup_server() + service = setup_service(server, frontend) + + teardown_server(server) + teardown_service(service) + + @pytest.mark.parametrize("frontend, client_type", [HTTP_ARGS[:2], GRPC_ARGS[:2]]) + def test_service_custom_port(self, frontend, client_type): + server = setup_server() + options = frontend.Options(port=8005) + service = setup_service(server, frontend, options) + client = setup_client(client_type, url="localhost:8005") + + # Confirms that service starts at port 8005 + client.is_server_ready() + + teardown_client(client) + teardown_service(service) + teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) + def test_inference(self, frontend, client_type, url): + server = setup_server() + service = setup_service(server, frontend) + + # TODO: use common/test_infer + assert send_and_test_inference_identity(client_type, url=url) + + teardown_service(service) + teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) + def test_http_req_during_shutdown(self, frontend, client_type, url): + server = setup_server() + http_service = setup_service(server, frontend) + http_client = httpclient.InferenceServerClient(url="localhost:8000") + model_name = "delayed_identity" + delay = 2 # seconds + input_data0 = np.array([[delay]], dtype=np.float32) + + input0 = httpclient.InferInput("INPUT0", input_data0.shape, "FP32") + input0.set_data_from_numpy(input_data0) + + inputs = [input0] + outputs = [httpclient.InferRequestedOutput("OUTPUT0")] + + async_request = http_client.async_infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + # http_service.stop() does not use graceful shutdown + teardown_service(http_service) + + # So, inference request will fail as http endpoints have been stopped. + with pytest.raises( + InferenceServerException, match="failed to obtain inference response" + ): + async_request.get_result(block=True, timeout=delay) + + # http_client.close() calls join() to terminate pool of greenlets + # However, due to an unsuccessful get_result(), async_request is still + # an active thread. Hence, join stalls until greenlet timeouts. + # Does not throw an exception, but displays error in logs. + teardown_client(http_client) + + # delayed_identity will still be an active model + # Hence, server.stop() causes InternalError: Timeout. + with pytest.raises( + tritonserver.InternalError, + match="Exit timeout expired. 
Exiting immediately.", + ): + teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) + def test_grpc_req_during_shutdown(self, frontend, client_type, url): + server = setup_server() + grpc_service = setup_service(server, frontend) + grpc_client = grpcclient.InferenceServerClient(url=url) + user_data = [] + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + model_name = "delayed_identity" + delay = 2 # seconds + + input_data0 = np.array([[delay]], dtype=np.float32) + input0 = client_type.InferInput("INPUT0", input_data0.shape, "FP32") + input0.set_data_from_numpy(input_data0) + + inputs = [input0] + outputs = [client_type.InferRequestedOutput("OUTPUT0")] + + grpc_client.async_infer( + model_name=model_name, + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + teardown_service(grpc_service) + + time_out = delay + 1 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + + assert ( + len(user_data) == 1 + and isinstance(user_data[0], InferenceServerException) + and "[StatusCode.UNAVAILABLE] failed to connect to all addresses" + in str(user_data[0]) + ) + + teardown_client(grpc_client) + teardown_server(server) + + # KNOWN ISSUE: CAUSES SEGFAULT + # Created [DLIS-7231] to address at future date + # Once the server has been stopped, the underlying TRITONSERVER_Server instance + # is deleted. However, the frontend does not know the server instance + # is no longer valid. + # def test_inference_after_server_stop(self): + # server = setup_server() + # http_service = setup_service(server, KServeHttp) + # http_client = setup_client(httpclient, url="localhost:8000") + + # teardown_server(server) # Server has been stopped + + # model_name = "identity" + # input_data = np.array([["testing"]], dtype=object) + # # Create input and output objects + # inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + # outputs = [httpclient.InferRequestedOutput("OUTPUT0")] + + # # Set the data for the input tensor + # inputs[0].set_data_from_numpy(input_data) + + # results = http_client.infer(model_name, inputs=inputs, outputs=outputs) + + # teardown_client(http_client) + # teardown_service(http_service) diff --git a/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py b/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py new file mode 100644 index 0000000000..b6095cec8f --- /dev/null +++ b/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py @@ -0,0 +1,51 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + Mock Model that uses the input data to determine how long to wait + before returning identity data + """ + assert len(requests) == 1 + delay = 0 + request = requests[0] + responses = [] + + delay_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + delay_as_numpy = delay_tensor.as_numpy() + delay = float(delay_as_numpy[0][0]) + + out_tensor = pb_utils.Tensor("OUTPUT0", delay_as_numpy) + responses.append(pb_utils.InferenceResponse([out_tensor])) + + time.sleep(delay) + return responses diff --git a/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt b/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt new file mode 100644 index 0000000000..9ac8f1aaff --- /dev/null +++ b/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "delayed_identity" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] \ No newline at end of file diff --git a/qa/L0_python_api/test_model_repository/identity/1/model.py b/qa/L0_python_api/test_model_repository/identity/1/model.py new file mode 100644 index 0000000000..629b6469c9 --- /dev/null +++ b/qa/L0_python_api/test_model_repository/identity/1/model.py @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model loops through different dtypes to make sure that + serialize_byte_tensor works correctly in the Python backend. + """ + + def initialize(self, args): + self._index = 0 + self._dtypes = [np.bytes_, np.object_] + + def execute(self, requests): + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", in_0.as_numpy().astype(self._dtypes[self._index]) + ) + self._index += 1 + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + return responses diff --git a/qa/L0_python_api/test_model_repository/identity/config.pbtxt b/qa/L0_python_api/test_model_repository/identity/config.pbtxt new file mode 100644 index 0000000000..3f22e14468 --- /dev/null +++ b/qa/L0_python_api/test_model_repository/identity/config.pbtxt @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity" +backend: "python" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] \ No newline at end of file diff --git a/qa/L0_python_api/testing_utils.py b/qa/L0_python_api/testing_utils.py new file mode 100644 index 0000000000..8c63fea89b --- /dev/null +++ b/qa/L0_python_api/testing_utils.py @@ -0,0 +1,95 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
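
For orientation: the helper functions defined in testing_utils.py below are composed by the pytest suites added earlier in this patch. A minimal sketch of that flow, assuming the default KServeHttp port 8000, a hypothetical test name, and the module imported as testing_utils, could look like this:

    import tritonclient.http as httpclient
    from tritonfrontend import KServeHttp

    import testing_utils as utils


    def test_http_identity_round_trip():
        # Hypothetical illustration of the intended setup/teardown ordering.
        server = utils.setup_server()
        http_service = utils.setup_service(server, KServeHttp)
        try:
            assert utils.send_and_test_inference_identity(
                httpclient, url="localhost:8000"
            )
        finally:
            utils.teardown_service(http_service)
            utils.teardown_server(server)

The actual tests additionally parametrize the frontend, client type, and URL; this sketch only shows the happy path.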
+ +import os +from typing import Union + +import numpy as np +import tritonserver +from tritonfrontend import KServeGrpc, KServeHttp + + +def setup_server(model_repository="test_model_repository") -> tritonserver.Server: + module_directory = os.path.split(os.path.abspath(__file__))[0] + model_path = os.path.abspath(os.path.join(module_directory, model_repository)) + + # Starting Server Instance + server_options = tritonserver.Options( + server_id="TestServer", + model_repository=model_path, + log_error=True, + log_warn=True, + log_info=True, + ) + + return tritonserver.Server(server_options).start(wait_until_ready=True) + + +def teardown_server(server: tritonserver.Server) -> None: + server.stop() + + +def setup_service( + server: tritonserver.Server, + frontend: Union[KServeHttp, KServeGrpc], + options=None, +) -> Union[KServeHttp, KServeGrpc]: + service = frontend.Server(server=server, options=options) + service.start() + return service + + +def teardown_service(service: Union[KServeHttp, KServeGrpc]) -> None: + service.stop() + + +def setup_client(frontend_client, url: str): + return frontend_client.InferenceServerClient(url=url) + + +def teardown_client(client) -> None: + client.close() + + +# Sends an inference to test_model_repository/identity model and verifies input == output. +def send_and_test_inference_identity(frontend_client, url: str) -> bool: + model_name = "identity" + client = setup_client(frontend_client, url) + input_data = np.array(["testing"], dtype=object) + + # Create input and output objects + inputs = [frontend_client.InferInput("INPUT0", input_data.shape, "BYTES")] + outputs = [frontend_client.InferRequestedOutput("OUTPUT0")] + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + + # Perform inference request + results = client.infer(model_name=model_name, inputs=inputs, outputs=outputs) + + output_data = results.as_numpy("OUTPUT0") # Gather output data + + teardown_client(client) + return input_data[0] == output_data[0].decode() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cf43765dba..2e0380470a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -782,7 +782,11 @@ if (NOT WIN32) endif() # TRITON_ENABLE_GPU endif() # NOT WIN32 +# tritonfrontend python package +add_subdirectory(python) + # Currently unit tests do not build for windows... if ( NOT WIN32) add_subdirectory(test test) endif() # NOT WIN32 + diff --git a/src/common.h b/src/common.h index aa160f394f..011546d637 100644 --- a/src/common.h +++ b/src/common.h @@ -27,7 +27,11 @@ #include #include +#include #include +#include +#include +#include #include #include "triton/core/tritonserver.h" @@ -184,4 +188,60 @@ Join(const T& container, const std::string& delim) return ss.str(); } + +// Used by Python Bindings to accept arguments to initialize Frontends. 
+// Known pybind11 issue: bool has to come before int for std::variant
+using VariantType = std::variant<bool, int, std::string>;
+using UnorderedMapType = std::unordered_map<std::string, VariantType>;
+
+
+template <typename T>
+TRITONSERVER_Error*
+GetValue(const UnorderedMapType& options, const std::string& key, T* arg)
+{
+  auto curr = options.find(key);
+  bool is_present = (curr != options.end());
+  std::string msg;
+
+  if (!is_present) {
+    msg = "Key: " + key + " not found in options provided.";
+    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, msg.c_str());
+  }
+
+  bool correct_type = std::holds_alternative<T>(curr->second);
+  if (!correct_type) {
+    std::string expected;
+    std::string found;
+    VariantType value = *arg;
+    if (std::holds_alternative<int>(value)) {
+      expected = "int";
+    } else if (std::holds_alternative<bool>(value)) {
+      expected = "bool";
+    } else if (std::holds_alternative<std::string>(value)) {
+      expected = "string";
+    }
+
+    switch (curr->second.index()) {
+      case 0:
+        found = "bool";
+        break;
+      case 1:
+        found = "int";
+        break;
+      case 2:
+        found = "string";
+        break;
+    }
+
+    msg = "Key: " + key + " found, but incorrect type. Expected " + expected +
+          " Found: " + found;
+
+    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, msg.c_str());
+  }
+
+  *arg = std::get<T>(curr->second);
+  return nullptr;
+}
+
+
 }}  // namespace triton::server
diff --git a/src/grpc/grpc_server.cc b/src/grpc/grpc_server.cc
index c0a92ebd33..74ec443ae6 100644
--- a/src/grpc/grpc_server.cc
+++ b/src/grpc/grpc_server.cc
@@ -2435,6 +2435,101 @@ Server::Create(
   return nullptr;  // success
 }
 
+TRITONSERVER_Error*
+Server::Create(
+    std::shared_ptr<TRITONSERVER_Server>& server, UnorderedMapType& options,
+    triton::server::TraceManager* trace_manager,
+    const std::shared_ptr<SharedMemoryManager>& shm_manager,
+    const RestrictedFeatures& restricted_features,
+    std::unique_ptr<Server>* service)
+{
+  Options grpc_options;
+
+  RETURN_IF_ERR(GetOptions(grpc_options, options));
+
+  return Create(server, trace_manager, shm_manager, grpc_options, service);
+}
+
+TRITONSERVER_Error*
+Server::GetOptions(Options& options, UnorderedMapType& options_map)
+{
+  SocketOptions socket_selection;
+  SslOptions ssl_selection;
+  KeepAliveOptions keep_alive_selection;
+
+  RETURN_IF_ERR(GetSocketOptions(options.socket_, options_map));
+  RETURN_IF_ERR(GetSslOptions(options.ssl_, options_map));
+  RETURN_IF_ERR(GetKeepAliveOptions(options.keep_alive_, options_map));
+
+  int infer_compression_level_key;
+
+  RETURN_IF_ERR(GetValue(
+      options_map, "infer_compression_level", &infer_compression_level_key));
+
+  options.infer_compression_level_ =
+      static_cast<grpc_compression_level>(infer_compression_level_key);
+
+  RETURN_IF_ERR(GetValue(
+      options_map, "infer_allocation_pool_size",
+      &options.infer_allocation_pool_size_));
+  RETURN_IF_ERR(GetValue(
+      options_map, "forward_header_pattern", &options.forward_header_pattern_));
+
+  return nullptr;
+}
+
+TRITONSERVER_Error*
+Server::GetSocketOptions(SocketOptions& options, UnorderedMapType& options_map)
+{
+  RETURN_IF_ERR(GetValue(options_map, "address", &options.address_));
+  RETURN_IF_ERR(GetValue(options_map, "port", &options.port_));
+  RETURN_IF_ERR(GetValue(options_map, "reuse_port", &options.reuse_port_));
+
+  return nullptr;
+}
+
+TRITONSERVER_Error*
+Server::GetSslOptions(SslOptions& options, UnorderedMapType& options_map)
+{
+  RETURN_IF_ERR(GetValue(options_map, "use_ssl", &options.use_ssl_));
+  RETURN_IF_ERR(GetValue(options_map, "server_cert", &options.server_cert_));
+  RETURN_IF_ERR(GetValue(options_map, "server_key", &options.server_key_));
+  RETURN_IF_ERR(GetValue(options_map, "root_cert", &options.root_cert_));
+  
RETURN_IF_ERR( + GetValue(options_map, "use_mutual_auth", &options.use_mutual_auth_)); + + return nullptr; +} + +TRITONSERVER_Error* +Server::GetKeepAliveOptions( + KeepAliveOptions& options, UnorderedMapType& options_map) +{ + RETURN_IF_ERR( + GetValue(options_map, "keepalive_time_ms", &options.keepalive_time_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "keepalive_timeout_ms", &options.keepalive_timeout_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "keepalive_permit_without_calls", + &options.keepalive_permit_without_calls_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_max_pings_without_data", + &options.http2_max_pings_without_data_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_min_recv_ping_interval_without_data_ms", + &options.http2_min_recv_ping_interval_without_data_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_max_ping_strikes", &options.http2_max_ping_strikes_)); + RETURN_IF_ERR(GetValue( + options_map, "max_connection_age_ms", &options.max_connection_age_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "max_connection_age_grace_ms", + &options.max_connection_age_grace_ms_)); + + return nullptr; +} + + TRITONSERVER_Error* Server::Start() { diff --git a/src/grpc/grpc_server.h b/src/grpc/grpc_server.h index 8a38cdd4fe..89d8dc7388 100644 --- a/src/grpc/grpc_server.h +++ b/src/grpc/grpc_server.h @@ -29,6 +29,7 @@ #include +#include "../common.h" #include "../restricted_features.h" #include "../shared_memory_manager.h" #include "../tracer.h" @@ -100,6 +101,13 @@ class Server { const std::shared_ptr& shm_manager, const Options& server_options, std::unique_ptr* server); + static TRITONSERVER_Error* Create( + std::shared_ptr& server, UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service); + ~Server(); TRITONSERVER_Error* Start(); @@ -112,6 +120,16 @@ class Server { const std::shared_ptr& shm_manager, const Options& server_options); + static TRITONSERVER_Error* GetSocketOptions( + SocketOptions& options, UnorderedMapType& options_map); + static TRITONSERVER_Error* GetSslOptions( + SslOptions& options, UnorderedMapType& options_map); + static TRITONSERVER_Error* GetKeepAliveOptions( + KeepAliveOptions& options, UnorderedMapType& options_map); + + static TRITONSERVER_Error* GetOptions( + Options& options, UnorderedMapType& options_map); + std::shared_ptr tritonserver_; TraceManager* trace_manager_; std::shared_ptr shm_manager_; diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index 35659f4900..916230381b 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -948,12 +948,14 @@ ModelInferHandler::Execute(InferHandler::State* state) if (err == nullptr) { TRITONSERVER_InferenceTrace* triton_trace = nullptr; #ifdef TRITON_ENABLE_TRACING - GrpcServerCarrier carrier(state->context_->ctx_.get()); - auto start_options = - trace_manager_->GetTraceStartOptions(carrier, request.model_name()); - state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); - if (state->trace_ != nullptr) { - triton_trace = state->trace_->trace_; + if (trace_manager_) { + GrpcServerCarrier carrier(state->context_->ctx_.get()); + auto start_options = + trace_manager_->GetTraceStartOptions(carrier, request.model_name()); + state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); + if (state->trace_ != nullptr) { + triton_trace = state->trace_->trace_; + } } #endif // TRITON_ENABLE_TRACING @@ -982,8 +984,10 @@ 
ModelInferHandler::Execute(InferHandler::State* state) inference::ModelInferResponse error_response; #ifdef TRITON_ENABLE_TRACING - state->trace_timestamps_.emplace_back( - std::make_pair("GRPC_SEND_START", TraceManager::CaptureTimestamp())); + if (trace_manager_) { + state->trace_timestamps_.emplace_back( + std::make_pair("GRPC_SEND_START", TraceManager::CaptureTimestamp())); + } #endif // TRITON_ENABLE_TRACING state->step_ = COMPLETE; diff --git a/src/http_server.cc b/src/http_server.cc index 68b22ae649..cfd1da88ae 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -1181,6 +1181,7 @@ HTTPAPIServer::HTTPAPIServer( HTTPAPIServer::~HTTPAPIServer() { + LOG_VERBOSE(1) << "~HTTPAPIServer()"; if (server_metadata_err_ != nullptr) { TRITONSERVER_ErrorDelete(server_metadata_err_); } @@ -3586,10 +3587,12 @@ HTTPAPIServer::HandleInfer( RETURN_AND_RESPOND_IF_ERR( req, CheckTransactionPolicy(req, model_name, requested_model_version)); - // If tracing is enabled see if this request should be traced. TRITONSERVER_InferenceTrace* triton_trace = nullptr; - std::shared_ptr trace = - StartTrace(req, model_name, &triton_trace); + std::shared_ptr trace; + if (trace_manager_) { + // If tracing is enabled see if this request should be traced. + trace = StartTrace(req, model_name, &triton_trace); + } // Decompress request body if it is compressed in supported type evbuffer* decompressed_buffer = nullptr; @@ -4696,6 +4699,35 @@ HTTPAPIServer::Create( return nullptr; } + +TRITONSERVER_Error* +HTTPAPIServer::Create( + std::shared_ptr& server, + const UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service) +{ + int port; + bool reuse_port; + std::string address; + std::string header_forward_pattern; + int thread_count; + + RETURN_IF_ERR(GetValue(options, "port", &port)); + RETURN_IF_ERR(GetValue(options, "reuse_port", &reuse_port)); + RETURN_IF_ERR(GetValue(options, "address", &address)); + RETURN_IF_ERR( + GetValue(options, "header_forward_pattern", &header_forward_pattern)); + RETURN_IF_ERR(GetValue(options, "thread_count", &thread_count)); + + return Create( + server, trace_manager, shm_manager, port, reuse_port, address, + header_forward_pattern, thread_count, restricted_features, service); +} + + bool HTTPAPIServer::RespondIfRestricted( evhtp_request_t* req, const Restriction& restriction) diff --git a/src/http_server.h b/src/http_server.h index 077324cba3..3ad3d60cc4 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -196,6 +196,14 @@ class HTTPAPIServer : public HTTPServer { const RestrictedFeatures& restricted_apis, std::unique_ptr* http_server); + static TRITONSERVER_Error* Create( + std::shared_ptr& server, + const UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service); + virtual ~HTTPAPIServer(); // diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt new file mode 100644 index 0000000000..f447f7eab2 --- /dev/null +++ b/src/python/CMakeLists.txt @@ -0,0 +1,78 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required(VERSION 3.18) + +message("tritonfrontend python package build skipped when relevant frontends are disabled.") +message("In order to build tritonfrontend, the following flags are needed: -DTRITON_ENABLE_HTTP=ON -DTRITON_ENABLE_GRPC=ON") + +# [DLIS-7232] tritonfrontend package expects all supported packages to be +# built, without any check/verification for respective frontend enable flags. +# Support for partial builds(ex: HTTP but not gRPC) will be addressed later. +if(NOT (${TRITON_ENABLE_HTTP} AND ${TRITON_ENABLE_GRPC})) + return() +endif() + +add_subdirectory(tritonfrontend) + +file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION ${TRITON_VERSION}) +configure_file(../../LICENSE LICENSE.txt COPYONLY) +configure_file(setup.py setup.py @ONLY) + +set(WHEEL_DEPENDS + ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION + ${CMAKE_CURRENT_BINARY_DIR}/LICENSE.txt + ${CMAKE_CURRENT_BINARY_DIR}/setup.py + ${CMAKE_CURRENT_BINARY_DIR}/tritonfrontend + py-bindings +) + +set(wheel_stamp_file "stamp.whl") + +add_custom_command( + OUTPUT "${wheel_stamp_file}" + COMMAND python3 + ARGS + "${CMAKE_CURRENT_SOURCE_DIR}/build_wheel.py" + --dest-dir "${CMAKE_CURRENT_BINARY_DIR}/generic" + --binding-path $ + DEPENDS ${WHEEL_DEPENDS} +) + +add_custom_target( + frontend-server-wheel ALL + DEPENDS + "${wheel_stamp_file}" +) + + +# Wheel +set(WHEEL_OUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/generic/wheel/dist/") +install( + DIRECTORY + ${WHEEL_OUT_DIR} + DESTINATION "${CMAKE_INSTALL_PREFIX}/python" +) \ No newline at end of file diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py new file mode 100755 index 0000000000..875dd32a70 --- /dev/null +++ b/src/python/build_wheel.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import os +import pathlib +import re +import shutil +import subprocess +import sys +from distutils.dir_util import copy_tree +from tempfile import mkstemp + + +def fail_if(p, msg): + if p: + print("error: {}".format(msg), file=sys.stderr) + sys.exit(1) + + +def mkdir(path): + pathlib.Path(path).mkdir(parents=True, exist_ok=True) + + +def touch(path): + pathlib.Path(path).touch() + + +def cpdir(src, dest): + copy_tree(src, dest, preserve_symlinks=1) + + +def sed(pattern, replace, source, dest=None): + name = None + if dest: + name = dest + if dest is None: + fd, name = mkstemp() + + with open(source, "r") as fin, open(name, "w") as fout: + for line in fin: + out = re.sub(pattern, replace, line) + fout.write(out) + + if not dest: + shutil.copyfile(name, source) + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--dest-dir", type=str, required=True, help="Destination directory." + ) + parser.add_argument( + "--binding-path", + type=str, + required=True, + help="Path to Triton Frontend Python binding.", + ) + + FLAGS = parser.parse_args() + + FLAGS.triton_version = None + with open("TRITON_VERSION", "r") as vfile: + FLAGS.triton_version = vfile.readline().strip() + + FLAGS.whl_dir = os.path.join(FLAGS.dest_dir, "wheel") + + print("=== Building in: {}".format(os.getcwd())) + print("=== Using builddir: {}".format(FLAGS.whl_dir)) + print("Adding package files") + mkdir(os.path.join(FLAGS.whl_dir, "tritonfrontend")) + shutil.copy( + "tritonfrontend/__init__.py", os.path.join(FLAGS.whl_dir, "tritonfrontend") + ) + # Type checking marker file indicating support for type checkers. 
+ # https://peps.python.org/pep-0561/ + shutil.copy( + "tritonfrontend/py.typed", os.path.join(FLAGS.whl_dir, "tritonfrontend") + ) + cpdir("tritonfrontend/_c", os.path.join(FLAGS.whl_dir, "tritonfrontend", "_c")) + cpdir("tritonfrontend/_api", os.path.join(FLAGS.whl_dir, "tritonfrontend", "_api")) + PYBIND_LIB = os.path.basename(FLAGS.binding_path) + shutil.copyfile( + FLAGS.binding_path, + os.path.join(FLAGS.whl_dir, "tritonfrontend", "_c", PYBIND_LIB), + ) + + shutil.copyfile("LICENSE.txt", os.path.join(FLAGS.whl_dir, "LICENSE.txt")) + shutil.copyfile("setup.py", os.path.join(FLAGS.whl_dir, "setup.py")) + + os.chdir(FLAGS.whl_dir) + print("=== Building wheel") + args = ["python3", "setup.py", "bdist_wheel"] + + wenv = os.environ.copy() + wenv["VERSION"] = FLAGS.triton_version + wenv["TRITON_PYBIND"] = PYBIND_LIB + p = subprocess.Popen(args, env=wenv) + p.wait() + fail_if(p.returncode != 0, "setup.py failed") + + cpdir("dist", FLAGS.dest_dir) + + print(f"=== Output wheel file is in: {FLAGS.dest_dir}") + touch(os.path.join(FLAGS.dest_dir, "stamp.whl")) + + +if __name__ == "__main__": + main() diff --git a/src/python/examples/example.py b/src/python/examples/example.py new file mode 100644 index 0000000000..a1fba6e9d1 --- /dev/null +++ b/src/python/examples/example.py @@ -0,0 +1,85 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
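
The example script that follows demonstrates the KServeHttp frontend against the bundled example model repository. A gRPC counterpart is not part of this patch, but under the same assumptions (the same "identity" example model repository, tritonclient.grpc installed, the script run from the examples directory, and an arbitrarily chosen port 8101) a sketch of the equivalent flow could be:

    import numpy as np
    import tritonclient.grpc as grpcclient
    import tritonserver
    from tritonfrontend import KServeGrpc

    server = tritonserver.Server(
        tritonserver.Options(model_repository="./example_model_repository")
    ).start(wait_until_ready=True)

    # KServeGrpc.Server is a context manager: __enter__ starts the frontend,
    # __exit__ stops it.
    with KServeGrpc.Server(server, KServeGrpc.Options(port=8101)):
        client = grpcclient.InferenceServerClient(url="localhost:8101")
        input_data = np.array([["Roger Roger"]], dtype=object)
        inputs = [grpcclient.InferInput("INPUT0", input_data.shape, "BYTES")]
        inputs[0].set_data_from_numpy(input_data)
        result = client.infer("identity", inputs=inputs)
        print(result.as_numpy("OUTPUT0"))

    server.stop()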
+ +import pathlib + +import numpy as np +import tritonclient.http as httpclient +import tritonserver +from tritonfrontend import KServeHttp + + +def main(): + # Constructing path to Model Repository + model_path = f"{pathlib.Path(__file__).parent.resolve()}/example_model_repository" + # Selecting Server Options + server_options = tritonserver.Options( + server_id="ExampleServer", + model_repository=model_path, + log_error=True, + log_info=True, + log_warn=True, + ) + + # Creating server instance + server = tritonserver.Server(server_options).start(wait_until_ready=True) + + # Selecting Options for KServeHttp Frontend + http_options = KServeHttp.Options(port=8005) + + # or http_service = KServeHttp.Server(server, http_options) & http_service.stop() + with KServeHttp.Server(server, http_options) as http_service: + # The identity model returns an exact duplicate of the input data as output + model_name = "identity" + url = "localhost:8005" + + # Create a Triton client + client = httpclient.InferenceServerClient(url=url) + + # Prepare input data + input_data = np.array([["Roger Roger"]], dtype=object) + + # Create input and output objects + inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + + results = client.infer(model_name, inputs=inputs) + + # Get the output data + output_data = results.as_numpy("OUTPUT0") + + print("--------------------- INFERENCE RESULTS ---------------------") + print("Input data:", input_data) + print("Output data:", output_data) + print("-------------------------------------------------------------") + + server.stop() + + +if __name__ == "__main__": + main() diff --git a/src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb b/src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb new file mode 100755 index 0000000000000000000000000000000000000000..63f78fecb4d2e7a8fe857b668055ae7d7372c891 GIT binary patch literal 531 zcmb7>J5R$f5XW)Js~17hi2+%NArcaJNYesKR;Dh{A!UuC#-$OF6D1B+)x9I^d^Wxd zkSDMd;pKd~*YAI45PVVKm0$rj7?Y3-8Ob#QsF6quX%UZ_pe2Q@9jgz-Lq)Dr4!2-V z<6zF0NgVQw4!SfoEaTPz)So)tItDdg1}VqdP{r+?5KGN@_RN_!vPU=e)mSgYlSozq z-wA9$o7_!jo`XNx|_?>a|1-JuSzzez8l a-$4IWMXBT>PqIavtcDH9FwJ-v&E+Rf?!_4Z literal 0 HcmV?d00001 diff --git a/src/python/examples/example_model_repository/identity/config.pbtxt b/src/python/examples/example_model_repository/identity/config.pbtxt new file mode 100644 index 0000000000..ae83e47556 --- /dev/null +++ b/src/python/examples/example_model_repository/identity/config.pbtxt @@ -0,0 +1,44 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity" +platform: "tensorflow_savedmodel" +max_batch_size: 8 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ -1 ] + } +] diff --git a/src/python/setup.py b/src/python/setup.py new file mode 100755 index 0000000000..ee1e7c0ec4 --- /dev/null +++ b/src/python/setup.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import sys + +from setuptools import find_packages, setup + +if "--plat-name" in sys.argv: + PLATFORM_FLAG = sys.argv[sys.argv.index("--plat-name") + 1] +else: + PLATFORM_FLAG = "any" + +if "VERSION" not in os.environ: + raise Exception("envvar VERSION must be specified") + +VERSION = os.environ["VERSION"] + +try: + from wheel.bdist_wheel import bdist_wheel as _bdist_wheel + + class bdist_wheel(_bdist_wheel): + def finalize_options(self): + _bdist_wheel.finalize_options(self) + self.root_is_pure = False + + def get_tag(self): + pyver, abi, plat = "py3", "none", PLATFORM_FLAG + return pyver, abi, plat + +except ImportError: + bdist_wheel = None + +this_directory = os.path.abspath(os.path.dirname(__file__)) + +data_files = [ + ("", ["LICENSE.txt"]), +] + +# Type checking marker file indicating support for type checkers. +# https://peps.python.org/pep-0561/ +# Type hints for c extension generated by mypy +platform_package_data = [ + os.environ["TRITON_PYBIND"], + "py.typed", + "_c/__init__.pyi", + "_c/triton_bindings.pyi", +] + +gpu_extras = ["cupy-cuda12x"] +test_extras = ["pytest"] +all_extras = gpu_extras + test_extras + +setup( + name="tritonfrontend", + version=VERSION, + author="NVIDIA Inc.", + author_email="sw-dl-triton@nvidia.com", + description="Triton Inference Server In-Process Python API", + license="BSD", + url="https://developer.nvidia.com/nvidia-triton-inference-server", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: Information Technology", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Image Recognition", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Topic :: Utilities", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Environment :: Console", + "Natural Language :: English", + "Operating System :: OS Independent", + ], + packages=find_packages(), + package_data={ + "": platform_package_data, + }, + zip_safe=False, + cmdclass={"bdist_wheel": bdist_wheel}, + data_files=data_files, + install_requires=["tritonserver", "pydantic"], + extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras}, +) diff --git a/src/python/tritonfrontend/CMakeLists.txt b/src/python/tritonfrontend/CMakeLists.txt new file mode 100644 index 0000000000..e22be30602 --- /dev/null +++ b/src/python/tritonfrontend/CMakeLists.txt @@ -0,0 +1,181 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required(VERSION 3.18) + +# ================= Ensures Package is Structured Properly ================== +# Top level module entry point and typed marker +file(COPY __init__.py DESTINATION .) +file(COPY py.typed DESTINATION .) +# Copy the '__init__.py' for the '_c' module +file(COPY _c/__init__.py DESTINATION ./_c/.) +file(COPY _c/__init__.pyi DESTINATION ./_c/.) +file(COPY _c/tritonfrontend_bindings.pyi DESTINATION ./_c/.) +# Find and copy _api modules +file(GLOB PYTHON_MODULE_FILES ./_api/*.py) +file(COPY ${PYTHON_MODULE_FILES} DESTINATION ./_api/.) +# ================================= END ===================================== + + +# =================== Downloading and Installing pybind11 =================== +include(FetchContent) + +FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11.git + GIT_TAG v2.13.1 + GIT_SHALLOW ON +) + +FetchContent_MakeAvailable(pybind11) +# ================================= END ===================================== + +# ================== Collect the Dependencies =============================== +set( + PYTHON_FRONTEND_BINDING_DEPS + ../../shared_memory_manager.h + ../../shared_memory_manager.cc + ../../data_compressor.h + ../../common.h + ../../common.cc + ../../restricted_features.h + ../../tracer.h + $<$:../../tracer.cc> + ../../classification.cc +) + +set(PY_BINDING_DEPENDENCY_LIBS + triton-common-json + triton-common-logging + triton-core-serverapi + triton-core-serverstub + ) + +# Conditional Linking Based on Flags +if(${TRITON_ENABLE_HTTP}) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + http-endpoint-library + ) +endif() + +if(${TRITON_ENABLE_GRPC}) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + grpc-endpoint-library + ) +endif() + +if(${TRITON_ENABLE_GPU}) + find_package(CUDAToolkit REQUIRED) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + CUDA::cudart + ) +endif() + +if(${TRITON_ENABLE_TRACING}) + message("TRACING/STATS IS CURRENTLY NOT SUPPORTED.") + find_package(absl CONFIG REQUIRED) + find_package(CURL CONFIG REQUIRED) + find_package(nlohmann_json CONFIG REQUIRED) + find_package(opentelemetry-cpp CONFIG REQUIRED) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + tracing-library + ) +endif() + +# ===================== End of Collection =================================== + + +# ================== Create Python Frontend Bindings ======================== +set( + PYTHON_FRONTEND_BINDING_SRCS + _c/tritonfrontend.h + _c/tritonfrontend_pybind.cc +) + +pybind11_add_module( + py-bindings + MODULE + ${PYTHON_FRONTEND_BINDING_DEPS} + ${PYTHON_FRONTEND_BINDING_SRCS} +) + +target_include_directories(py-bindings PRIVATE ${CMAKE_SOURCE_DIR}/src) + +target_link_libraries( + py-bindings + PRIVATE + ${PY_BINDING_DEPENDENCY_LIBS} +) + +if(${TRITON_ENABLE_HTTP}) + 
target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_HTTP=1 + ) +endif() + +if(${TRITON_ENABLE_GRPC}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_GRPC=1 + ) +endif() + +if(${TRITON_ENABLE_GPU}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_GPU=1 + PRIVATE TRITON_MIN_COMPUTE_CAPABILITY=${TRITON_MIN_COMPUTE_CAPABILITY} + ) +endif() + +if(${TRITON_ENABLE_TRACING}) + target_include_directories( + py-bindings + PRIVATE ${OPENTELEMETRY_CPP_INCLUDE_DIRS} + ) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_TRACING=1 + ) +endif() + +if(${TRITON_ENABLE_STATS}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_STATS=1 + ) +endif() + + +set_property(TARGET py-bindings PROPERTY OUTPUT_NAME tritonfrontend_bindings) + +set_target_properties( + py-bindings + PROPERTIES + BUILD_RPATH "$ORIGIN:/opt/tritonserver/lib" +) +# ===================== End of Python Bindings ============================== diff --git a/src/python/tritonfrontend/__init__.py b/src/python/tritonfrontend/__init__.py new file mode 100644 index 0000000000..48eaf64e8b --- /dev/null +++ b/src/python/tritonfrontend/__init__.py @@ -0,0 +1,33 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# triton/server/src/python/tritonfrontend/__init__.py + +import builtins +from importlib.metadata import PackageNotFoundError, version + +from tritonfrontend._api._kservegrpc import KServeGrpc +from tritonfrontend._api._kservehttp import KServeHttp diff --git a/src/python/tritonfrontend/__init__.pyi b/src/python/tritonfrontend/__init__.pyi new file mode 100644 index 0000000000..17847e4038 --- /dev/null +++ b/src/python/tritonfrontend/__init__.pyi @@ -0,0 +1 @@ +# Need to automate stubgen process as a part of build: https://github.com/triton-inference-server/server/pull/7501#discussion_r1720135228 diff --git a/src/python/tritonfrontend/_api/__init__.py b/src/python/tritonfrontend/_api/__init__.py new file mode 100644 index 0000000000..dc1c939c66 --- /dev/null +++ b/src/python/tritonfrontend/_api/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/python/tritonfrontend/_api/_error_mapping.py b/src/python/tritonfrontend/_api/_error_mapping.py new file mode 100644 index 0000000000..39a1e9aeb1 --- /dev/null +++ b/src/python/tritonfrontend/_api/_error_mapping.py @@ -0,0 +1,48 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import tritonserver +from tritonfrontend._c.tritonfrontend_bindings import ( + AlreadyExistsError, + InternalError, + InvalidArgumentError, + NotFoundError, + TritonError, + UnavailableError, + UnknownError, + UnsupportedError, +) + +ERROR_MAPPING = { + TritonError: tritonserver.TritonError, + NotFoundError: tritonserver.NotFoundError, + UnknownError: tritonserver.UnknownError, + InternalError: tritonserver.InternalError, + InvalidArgumentError: tritonserver.InvalidArgumentError, + UnavailableError: tritonserver.UnavailableError, + AlreadyExistsError: tritonserver.AlreadyExistsError, + UnsupportedError: tritonserver.UnsupportedError, +} diff --git a/src/python/tritonfrontend/_api/_kservegrpc.py b/src/python/tritonfrontend/_api/_kservegrpc.py new file mode 100644 index 0000000000..5471613340 --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservegrpc.py @@ -0,0 +1,137 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
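
A note on how the options defined below reach the C++ frontend: KServeGrpc.Options is a pydantic dataclass that is flattened into a plain dict of bool/int/str values via __dict__ and handed to the pybind11 bindings, which read each key back with the GetValue helper added to src/common.h. A rough, illustrative snippet (not part of the patch; port 9001 is arbitrary):

    from tritonfrontend import KServeGrpc

    opts = KServeGrpc.Options(port=9001)
    options_dict = opts.__dict__
    # e.g. {"address": "0.0.0.0", "port": 9001, "reuse_port": False, ...}

    # Every value is a bool, int, or str, matching the C++ side's
    # std::variant ordering (bool, int, string); a missing key or a
    # mismatched type is reported as TRITONSERVER_ERROR_INVALID_ARG.
    assert all(isinstance(v, (bool, int, str)) for v in options_dict.values())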
+ +import sys +from enum import IntEnum +from typing import Union + +import tritonserver +from pydantic import Field +from pydantic.dataclasses import dataclass +from tritonfrontend._api._error_mapping import ERROR_MAPPING +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError, + TritonError, + TritonFrontendGrpc, +) + + +# Enum (mirroring C++ format) +class Grpc_compression_level(IntEnum): + NONE = 0 + LOW = 1 + MED = 2 + HIGH = 3 + COUNT = 4 + + +class KServeGrpc: + Grpc_compression_level = ( + Grpc_compression_level # Include the enum as a class attribute + ) + + # triton::server::grpc::Options + @dataclass + class Options: + # triton::server::grpc::SocketOptions + address: str = "0.0.0.0" + port: int = Field(8001, ge=0, le=65535) + reuse_port: bool = False + # triton::server::grpc::SslOptions + use_ssl: bool = False + server_cert: str = "" + server_key: str = "" + root_cert: str = "" + use_mutual_auth: bool = False + # triton::server::grpc::KeepAliveOptions + keepalive_time_ms: int = Field(7_200_000, ge=0) + keepalive_timeout_ms: int = Field(20_000, ge=0) + keepalive_permit_without_calls: bool = False + http2_max_pings_without_data: int = Field(2, ge=0) + http2_min_recv_ping_interval_without_data_ms: int = Field(300_000, ge=0) + http2_max_ping_strikes: int = Field(2, ge=0) + max_connection_age_ms: int = Field(0, ge=0) + max_connection_age_grace_ms: int = Field(0, ge=0) + + # triton::server::grpc::Options + + infer_compression_level: Union[ + int, Grpc_compression_level + ] = Grpc_compression_level.NONE + infer_allocation_pool_size: int = Field(8, ge=0) + forward_header_pattern: str = "" + # DLIS-7215: Add restricted protocol support + # restricted_protocols: str = "" + + def __post_init__(self): + if isinstance(self.infer_compression_level, Grpc_compression_level): + self.infer_compression_level = self.infer_compression_level.value + + class Server: + def __init__(self, server: tritonserver, options: "KServeGrpc.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer + + # If no options provided, default options are selected + if options is None: + options = KServeGrpc.Options() + + if not isinstance(options, KServeGrpc.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeGrpc.Options" + ) + + # Converts dataclass instance -> python dictionary -> unordered_map> + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + + self.triton_frontend = TritonFrontendGrpc(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... 
from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None + + def __enter__(self): + self.triton_frontend.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None + + def start(self): + try: + self.triton_frontend.start() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None + + def stop(self): + try: + self.triton_frontend.stop() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None diff --git a/src/python/tritonfrontend/_api/_kservegrpc.pyi b/src/python/tritonfrontend/_api/_kservegrpc.pyi new file mode 100644 index 0000000000..c81d3d6afc --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservegrpc.pyi @@ -0,0 +1,74 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from enum import IntEnum + +import tritonserver +from _typeshed import Incomplete +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError as InvalidArgumentError, +) +from tritonfrontend._c.tritonfrontend_bindings import ( + TritonFrontendGrpc as TritonFrontendGrpc, +) + +class Grpc_compression_level(IntEnum): + NONE = 0 + LOW = 1 + MED = 2 + HIGH = 3 + COUNT = 4 + +class KServeGrpc: + Grpc_compression_level = Grpc_compression_level + class Options: + address: str + port: int + reuse_port: bool + use_ssl: bool + server_cert: str + server_key: str + root_cert: str + use_mutual_auth: bool + keepalive_time_ms: int + keepalive_timeout_ms: int + keepalive_permit_without_calls: bool + http2_max_pings_without_data: int + http2_min_recv_ping_interval_without_data_ms: int + http2_max_ping_strikes: int + max_connection_age_ms: int + max_connection_age_grace_ms: int + infer_compression_level: int | Grpc_compression_level + infer_allocation_pool_size: int + forward_header_pattern: str + def __post_init__(self) -> None: ... 
+ class Server: + triton_frontend: Incomplete + def __init__(self, server: tritonserver, options: KServeGrpc.Options = None) -> None: ... + def __enter__(self): ... + def __exit__(self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: types.TracebackType | None) -> None: ... + def start(self): ... + def stop(self): ... diff --git a/src/python/tritonfrontend/_api/_kservehttp.py b/src/python/tritonfrontend/_api/_kservehttp.py new file mode 100644 index 0000000000..6a2524986a --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservehttp.py @@ -0,0 +1,97 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import sys +from typing import Union + +import tritonserver +from pydantic import Field +from pydantic.dataclasses import dataclass +from tritonfrontend._api._error_mapping import ERROR_MAPPING +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError, + TritonError, + TritonFrontendHttp, +) + + +class KServeHttp: + @dataclass + class Options: + address: str = "0.0.0.0" + port: int = Field(8000, ge=0, le=65535) + reuse_port: bool = False + thread_count: int = Field(8, ge=0) + header_forward_pattern: str = "" + # DLIS-7215: Add restricted protocol support + # restricted_protocols: list + + class Server: + def __init__(self, server: tritonserver, options: "KServeHttp.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer + + # If no options provided, default options are selected + if options is None: + options = KServeHttp.Options() + + if not isinstance(options, KServeHttp.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeHttp.Options" + ) + + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + # Converts dataclass instance -> python dictionary -> unordered_map> + + self.triton_frontend = TritonFrontendHttp(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... 
from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None + + def __enter__(self): + self.triton_frontend.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None + + def start(self): + try: + self.triton_frontend.start() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None + + def stop(self): + try: + self.triton_frontend.stop() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None diff --git a/src/python/tritonfrontend/_api/_kservehttp.pyi b/src/python/tritonfrontend/_api/_kservehttp.pyi new file mode 100644 index 0000000000..60f3997f39 --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservehttp.pyi @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import tritonserver +from _typeshed import Incomplete +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError as InvalidArgumentError, +) +from tritonfrontend._c.tritonfrontend_bindings import ( + TritonFrontendHttp as TritonFrontendHttp, +) + +class KServeHttp: + class Options: + address: str + port: int + reuse_port: bool + thread_count: int + header_forward_pattern: str + class Server: + triton_frontend: Incomplete + def __init__(self, server: tritonserver, options: KServeHttp.Options = None) -> None: ... + def __enter__(self): ... + def __exit__(self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: types.TracebackType | None) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... diff --git a/src/python/tritonfrontend/_c/__init__.py b/src/python/tritonfrontend/_c/__init__.py new file mode 100644 index 0000000000..3e892ede64 --- /dev/null +++ b/src/python/tritonfrontend/_c/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from .tritonfrontend_bindings import * diff --git a/src/python/tritonfrontend/_c/__init__.pyi b/src/python/tritonfrontend/_c/__init__.pyi new file mode 100644 index 0000000000..99eaf9dace --- /dev/null +++ b/src/python/tritonfrontend/_c/__init__.pyi @@ -0,0 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from tritonfrontend._c.tritonfrontend_bindings import * diff --git a/src/python/tritonfrontend/_c/tritonfrontend.h b/src/python/tritonfrontend/_c/tritonfrontend.h new file mode 100644 index 0000000000..172147f566 --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend.h @@ -0,0 +1,139 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include // For shared_ptr +#include +#include + +#include "../../../common.h" +#include "../../../restricted_features.h" +#include "../../../shared_memory_manager.h" +#include "../../../tracer.h" +#include "triton/common/logging.h" +#include "triton/core/tritonserver.h" + + +struct TRITONSERVER_Server {}; + +namespace triton { namespace server { namespace python { + +// base exception for all Triton error code +struct TritonError : public std::runtime_error { + explicit TritonError(const std::string& what) : std::runtime_error(what) {} +}; + +// triton::core::python exceptions map 1:1 to TRITONSERVER_Error_Code. 
+struct UnknownError : public TritonError { + explicit UnknownError(const std::string& what) : TritonError(what) {} +}; +struct InternalError : public TritonError { + explicit InternalError(const std::string& what) : TritonError(what) {} +}; +struct NotFoundError : public TritonError { + explicit NotFoundError(const std::string& what) : TritonError(what) {} +}; +struct InvalidArgumentError : public TritonError { + explicit InvalidArgumentError(const std::string& what) : TritonError(what) {} +}; +struct UnavailableError : public TritonError { + explicit UnavailableError(const std::string& what) : TritonError(what) {} +}; +struct UnsupportedError : public TritonError { + explicit UnsupportedError(const std::string& what) : TritonError(what) {} +}; +struct AlreadyExistsError : public TritonError { + explicit AlreadyExistsError(const std::string& what) : TritonError(what) {} +}; + +void +ThrowIfError(TRITONSERVER_Error* err) +{ + if (err == nullptr) { + return; + } + std::shared_ptr managed_err( + err, TRITONSERVER_ErrorDelete); + std::string msg = TRITONSERVER_ErrorMessage(err); + switch (TRITONSERVER_ErrorCode(err)) { + case TRITONSERVER_ERROR_INTERNAL: + throw InternalError(std::move(msg)); + case TRITONSERVER_ERROR_NOT_FOUND: + throw NotFoundError(std::move(msg)); + case TRITONSERVER_ERROR_INVALID_ARG: + throw InvalidArgumentError(std::move(msg)); + case TRITONSERVER_ERROR_UNAVAILABLE: + throw UnavailableError(std::move(msg)); + case TRITONSERVER_ERROR_UNSUPPORTED: + throw UnsupportedError(std::move(msg)); + case TRITONSERVER_ERROR_ALREADY_EXISTS: + throw AlreadyExistsError(std::move(msg)); + default: + throw UnknownError(std::move(msg)); + } +} + + +template +class TritonFrontend { + private: + std::shared_ptr server_; + std::unique_ptr service; + triton::server::RestrictedFeatures restricted_features; + // TODO: [DLIS-7194] Add support for TraceManager & SharedMemoryManager + // triton::server::TraceManager trace_manager_; + // triton::server::SharedMemoryManager shm_manager_; + + public: + TritonFrontend(uintptr_t server_mem_addr, UnorderedMapType data) + { + TRITONSERVER_Server* server_ptr = + reinterpret_cast(server_mem_addr); + + server_.reset(server_ptr, EmptyDeleter); + + ThrowIfError(FrontendServer::Create( + server_, data, nullptr /* TraceManager */, + nullptr /* SharedMemoryManager */, restricted_features, &service)); + }; + + // TODO: [DLIS-7194] Add support for TraceManager & SharedMemoryManager + // TritonFrontend( + // uintptr_t server_mem_addr, UnorderedMapType data, + // TraceManager trace_manager, SharedMemoryManager shm_manager) + + void StartService() { ThrowIfError(service->Start()); }; + void StopService() { ThrowIfError(service->Stop()); }; + + // The frontend does not own the TRITONSERVER_Server* object. + // Hence, deleting the underlying server instance, + // will cause a double-free when the core bindings attempt to + // delete the TRITONSERVER_Server instance. + static void EmptyDeleter(TRITONSERVER_Server* obj){}; +}; + +}}} // namespace triton::server::python diff --git a/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi b/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi new file mode 100644 index 0000000000..535693a5cb --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi @@ -0,0 +1,44 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from tritonfrontend import AlreadyExistsError as AlreadyExistsError +from tritonfrontend import InternalError as InternalError +from tritonfrontend import InvalidArgumentError as InvalidArgumentError +from tritonfrontend import NotFoundError as NotFoundError +from tritonfrontend import TritonError as TritonError +from tritonfrontend import UnavailableError as UnavailableError +from tritonfrontend import UnknownError as UnknownError +from tritonfrontend import UnsupportedError as UnsupportedError + +class TritonFrontendGrpc: + def __init__(self, arg0: int, arg1: dict[str, bool | int | str]) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... + +class TritonFrontendHttp: + def __init__(self, arg0: int, arg1: dict[str, bool | int | str]) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... diff --git a/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc b/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc new file mode 100644 index 0000000000..86a0ac1c41 --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include + +#include "../../../grpc/grpc_server.h" +#include "../../../http_server.h" +#include "triton/core/tritonserver.h" +#include "tritonfrontend.h" + + +namespace py = pybind11; + +namespace triton { namespace server { namespace python { + + +PYBIND11_MODULE(tritonfrontend_bindings, m) +{ + m.doc() = "Python bindings for Triton Inference Server Frontend Endpoints"; + + auto tfe = py::register_exception(m, "TritonError"); + py::register_exception(m, "UnknownError", tfe.ptr()); + py::register_exception(m, "InternalError", tfe.ptr()); + py::register_exception(m, "NotFoundError", tfe.ptr()); + py::register_exception( + m, "InvalidArgumentError", tfe.ptr()); + py::register_exception(m, "UnavailableError", tfe.ptr()); + py::register_exception(m, "UnsupportedError", tfe.ptr()); + py::register_exception( + m, "AlreadyExistsError", tfe.ptr()); + + + py::class_>(m, "TritonFrontendHttp") + .def(py::init()) + .def("start", &TritonFrontend::StartService) + .def("stop", &TritonFrontend::StopService); + + py::class_>( + m, "TritonFrontendGrpc") + .def(py::init()) + .def( + "start", &TritonFrontend< + triton::server::grpc::Server, + triton::server::grpc::Server>::StartService) + .def( + "stop", &TritonFrontend< + triton::server::grpc::Server, + triton::server::grpc::Server>::StopService); +} + +}}} // namespace triton::server::python diff --git a/src/python/tritonfrontend/py.typed b/src/python/tritonfrontend/py.typed new file mode 100644 index 0000000000..e69de29bb2 From a6fff975a214ff00221790dd0a5521fb05ce3ac9 Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:53:56 -0500 Subject: [PATCH 23/44] fix: Adding copyright info (#7591) --- docs/customization_guide/tritonfrontend.md | 27 ++++++++++++++++++++++ qa/L0_python_api/test_kserve.py | 26 +++++++++++++++++++++ src/python/tritonfrontend/__init__.pyi | 26 +++++++++++++++++++++ 3 files changed, 79 insertions(+) diff --git a/docs/customization_guide/tritonfrontend.md b/docs/customization_guide/tritonfrontend.md index caaac9308d..0ec4b32749 100644 --- a/docs/customization_guide/tritonfrontend.md +++ b/docs/customization_guide/tritonfrontend.md @@ -1,3 +1,30 @@ + ### Triton Server (tritonfrontend) Bindings The `tritonfrontend` python package is a set of bindings to Triton's existing frontends implemented in C++. Currently, `tritonfrontend` supports starting up `KServeHttp` and `KServeGrpc` frontends. 
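For illustration, a minimal sketch of how these frontends can be driven from Python is shown below. This example is not part of the patch: the `tritonserver.Server` constructor arguments, the chained `start(wait_until_ready=True)` call, and `/path/to/models` are assumptions based on the in-process API, and the option values simply mirror the fields defined in `_kservehttp.py` / `_kservegrpc.py` earlier in this series.

```python
import tritonserver
from tritonfrontend import KServeGrpc, KServeHttp

# Start the in-process Triton core. The keyword arguments here are an
# assumption based on the tritonserver in-process API; "/path/to/models"
# is a placeholder for a real model repository.
server = tritonserver.Server(model_repository="/path/to/models").start(
    wait_until_ready=True
)

# Options mirror the dataclasses shown in _kservehttp.py / _kservegrpc.py.
http_options = KServeHttp.Options(port=8000, thread_count=8)
grpc_options = KServeGrpc.Options(port=8001)

# The Server classes are context managers: start() on __enter__, stop() on __exit__.
with KServeHttp.Server(server, http_options), KServeGrpc.Server(server, grpc_options):
    ...  # issue requests with tritonclient over HTTP or GRPC

server.stop()
```

Because `KServeHttp.Server` and `KServeGrpc.Server` implement `__enter__`/`__exit__`, the `with` block starts both endpoints and stops them on exit.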
These bindings used in-combination with Triton's Python In-Process API ([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library) extend the ability to use Triton's full feature set with a couple of lines of Python. diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py index ab77783d0c..703d86ca43 100644 --- a/qa/L0_python_api/test_kserve.py +++ b/qa/L0_python_api/test_kserve.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import time from functools import partial diff --git a/src/python/tritonfrontend/__init__.pyi b/src/python/tritonfrontend/__init__.pyi index 17847e4038..0afb0cb886 100644 --- a/src/python/tritonfrontend/__init__.pyi +++ b/src/python/tritonfrontend/__init__.pyi @@ -1 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + # Need to automate stubgen process as a part of build: https://github.com/triton-inference-server/server/pull/7501#discussion_r1720135228 From ca8ae28bbf1e1de0d7b1d1cb71f930fccdac5a84 Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:54:43 -0700 Subject: [PATCH 24/44] test: Refactor core input size checks (#7592) --- qa/L0_input_validation/input_validation_test.py | 4 ++-- qa/L0_input_validation/test.sh | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/qa/L0_input_validation/input_validation_test.py b/qa/L0_input_validation/input_validation_test.py index 33360b7a08..8e7f58bb0c 100755 --- a/qa/L0_input_validation/input_validation_test.py +++ b/qa/L0_input_validation/input_validation_test.py @@ -195,7 +195,7 @@ def get_input_array(input_size, np_dtype): triton_client.infer(model_name=model_name, inputs=inputs) err_str = str(e.exception) self.assertIn( - f"expected {input_size} string elements for inference input 'INPUT1', got {input_size-2}", + f"expected {input_size} string elements for inference input 'INPUT1' for model '{model_name}', got {input_size-2}", err_str, ) @@ -208,7 +208,7 @@ def get_input_array(input_size, np_dtype): triton_client.infer(model_name=model_name, inputs=inputs) err_str = str(e.exception) self.assertIn( - f"expected {input_size} string elements for inference input 'INPUT1', got {input_size+2}", + f"unexpected number of string elements {input_size+1} for inference input 'INPUT1' for model '{model_name}', expecting {input_size}", err_str, ) diff --git a/qa/L0_input_validation/test.sh b/qa/L0_input_validation/test.sh index fc70abd969..22e0560959 100755 --- a/qa/L0_input_validation/test.sh +++ b/qa/L0_input_validation/test.sh @@ -68,7 +68,9 @@ set +e python3 -m pytest --junitxml="input_validation.report.xml" $TEST_PY::InputValTest >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** input_validation_test.py FAILED. \n***" + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** input_validation_test.py::InputValTest FAILED. \n***" RET=1 fi set -e @@ -138,7 +140,9 @@ set +e python3 -m pytest --junitxml="input_shape_validation.report.xml" $TEST_PY::InputShapeTest >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** input_validation_test.py FAILED. \n***" + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** input_validation_test.py::InputShapeTest FAILED. \n***" RET=1 fi set -e @@ -147,10 +151,13 @@ kill $SERVER_PID wait $SERVER_PID # input_byte_size_test +cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/{savedmodel_zero_1_float32,savedmodel_zero_1_object} ./models + set +e -LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >>$TEST_LOG 2>&1 +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >> $TEST_LOG 2>&1 if [ $? 
-ne 0 ]; then - echo -e "\n***\n*** Query Unit Test Failed\n***" + cat $TEST_LOG + echo -e "\n***\n*** input_byte_size_test FAILED\n***" RET=1 fi set -e @@ -158,8 +165,6 @@ set -e if [ $RET -eq 0 ]; then echo -e "\n***\n*** Input Validation Test Passed\n***" else - cat $CLIENT_LOG - cat $SERVER_LOG echo -e "\n***\n*** Input Validation Test FAILED\n***" fi From be557b6ffc8d180b86ddbd0e1ddad615dd913df2 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Fri, 6 Sep 2024 17:13:23 -0700 Subject: [PATCH 25/44] Don't Build `tritonfrontend` for Windows. (#7599) Don't Build `tritonfrontend` for Windows. --- src/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2e0380470a..9488fc6233 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -782,8 +782,11 @@ if (NOT WIN32) endif() # TRITON_ENABLE_GPU endif() # NOT WIN32 -# tritonfrontend python package -add_subdirectory(python) +# DLIS-7292: Extend tritonfrontend to build for Windows +if (NOT WIN32) + # tritonfrontend python package + add_subdirectory(python) +endif (NOT WIN32) # Currently unit tests do not build for windows... if ( NOT WIN32) From edd0ac1b02f415a658758410903900fc5017e4f8 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Wed, 11 Sep 2024 09:08:29 +0530 Subject: [PATCH 26/44] fix: Add reference count tracking for shared memory regions (#7567) Co-authored-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com> --- .../cuda_shared_memory_test.py | 312 ++++++++++++++---- qa/L0_cuda_shared_memory/test.sh | 41 +++ qa/L0_shared_memory/shared_memory_test.py | 290 ++++++++++++---- qa/L0_shared_memory/test.sh | 40 +++ qa/L0_trt_shape_tensors/test.sh | 2 +- .../execute_delayed_model/config.pbtxt | 55 +++ .../execute_delayed_model/model.py | 72 ++++ src/grpc/infer_handler.cc | 54 +-- src/grpc/infer_handler.h | 32 +- src/grpc/stream_infer_handler.cc | 36 +- src/http_server.cc | 13 +- src/http_server.h | 15 + src/shared_memory_manager.cc | 47 ++- src/shared_memory_manager.h | 99 +++--- 14 files changed, 886 insertions(+), 222 deletions(-) create mode 100644 qa/python_models/execute_delayed_model/config.pbtxt create mode 100644 qa/python_models/execute_delayed_model/model.py diff --git a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py index 07f9c05a88..51137e8934 100755 --- a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py +++ b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py @@ -31,18 +31,20 @@ sys.path.append("../common") import os +import time import unittest +from functools import partial import infer_util as iu import numpy as np import test_util as tu import tritonclient.grpc as grpcclient import tritonclient.http as httpclient -import tritonshmutils.cuda_shared_memory as cshm +import tritonclient.utils.cuda_shared_memory as cshm from tritonclient.utils import * -class CudaSharedMemoryTest(tu.TestResultCollector): +class CudaSharedMemoryTestBase(tu.TestResultCollector): DEFAULT_SHM_BYTE_SIZE = 64 def setUp(self): @@ -61,76 +63,6 @@ def _setup_client(self): self.url, verbose=True ) - def test_invalid_create_shm(self): - # Raises error since tried to create invalid cuda shared memory region - try: - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - except Exception as ex: - self.assertEqual(str(ex), "unable to create cuda shared memory handle") - - def test_valid_create_set_register(self): - # Create a valid 
cuda shared memory region, fill data in it and register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - cshm.set_shared_memory_region( - shm_op0_handle, [np.array([1, 2], dtype=np.float32)] - ) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 1) - else: - self.assertEqual(len(shm_status.regions), 1) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_unregister_before_register(self): - # Create a valid cuda shared memory region and unregister before register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.unregister_cuda_shared_memory("dummy_data") - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 0) - else: - self.assertEqual(len(shm_status.regions), 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_unregister_after_register(self): - # Create a valid cuda shared memory region and unregister after register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - self.triton_client.unregister_cuda_shared_memory("dummy_data") - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 0) - else: - self.assertEqual(len(shm_status.regions), 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_reregister_after_register(self): - # Create a valid cuda shared memory region and unregister after register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - try: - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - except Exception as ex: - self.assertIn( - "shared memory region 'dummy_data' already in manager", str(ex) - ) - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 1) - else: - self.assertEqual(len(shm_status.regions), 1) - cshm.destroy_shared_memory_region(shm_op0_handle) - def _configure_server( self, create_byte_size=DEFAULT_SHM_BYTE_SIZE, @@ -205,6 +137,78 @@ def _cleanup_server(self, shm_handles): for shm_handle in shm_handles: cshm.destroy_shared_memory_region(shm_handle) + +class CudaSharedMemoryTest(CudaSharedMemoryTestBase): + def test_invalid_create_shm(self): + # Raises error since tried to create invalid cuda shared memory region + try: + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + except Exception as ex: + self.assertEqual(str(ex), "unable to create cuda shared memory handle") + + def test_valid_create_set_register(self): + # Create a valid cuda shared memory region, fill data in it and register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + cshm.set_shared_memory_region( + shm_op0_handle, [np.array([1, 2], dtype=np.float32)] + ) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + shm_status = self.triton_client.get_cuda_shared_memory_status() + if 
self.protocol == "http": + self.assertEqual(len(shm_status), 1) + else: + self.assertEqual(len(shm_status.regions), 1) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_before_register(self): + # Create a valid cuda shared memory region and unregister before register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.unregister_cuda_shared_memory("dummy_data") + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 0) + else: + self.assertEqual(len(shm_status.regions), 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_after_register(self): + # Create a valid cuda shared memory region and unregister after register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + self.triton_client.unregister_cuda_shared_memory("dummy_data") + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 0) + else: + self.assertEqual(len(shm_status.regions), 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_reregister_after_register(self): + # Create a valid cuda shared memory region and unregister after register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + try: + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + except Exception as ex: + self.assertIn( + "shared memory region 'dummy_data' already in manager", str(ex) + ) + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 1) + else: + self.assertEqual(len(shm_status.regions), 1) + cshm.destroy_shared_memory_region(shm_op0_handle) + def test_unregister_after_inference(self): # Unregister after inference error_msg = [] @@ -396,5 +400,169 @@ def test_infer_byte_size_out_of_bound(self): self._cleanup_server(shm_handles) +class TestCudaSharedMemoryUnregister(CudaSharedMemoryTestBase): + def _test_unregister_shm_fail(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory() + self.assertIn( + "Failed to unregister the following cuda shared memory regions: input0_data ,input1_data ,output0_data ,output1_data", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("input0_data") + self.assertIn( + "Cannot unregister shared memory region 'input0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("input1_data") + self.assertIn( + "Cannot unregister shared memory region 'input1_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("output0_data") + self.assertIn( + "Cannot unregister shared memory region 'output0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + 
second_client.unregister_cuda_shared_memory("output1_data") + self.assertIn( + "Cannot unregister shared memory region 'output1_data', it is currently in use.", + str(ex.exception), + ) + + def _test_shm_not_found(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("input0_data") + self.assertIn( + "Unable to find cuda shared memory region: 'input0_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("input1_data") + self.assertIn( + "Unable to find cuda shared memory region: 'input1_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("output0_data") + self.assertIn( + "Unable to find cuda shared memory region: 'output0_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("output1_data") + self.assertIn( + "Unable to find cuda shared memory region: 'output1_data'", + str(ex.exception), + ) + + def test_unregister_shm_during_inference_http(self): + try: + self.triton_client.unregister_cuda_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + httpclient.InferInput("INPUT0", [1, 16], "INT32"), + httpclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0", binary_data=True), + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + async_request = self.triton_client.async_infer( + model_name="simple", inputs=inputs, outputs=outputs + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Blocking call + async_request.get_result() + + # Try unregister shm regions after inference + self.triton_client.unregister_cuda_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + def test_unregister_shm_during_inference_grpc(self): + try: + self.triton_client.unregister_cuda_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + grpcclient.InferRequestedOutput("OUTPUT0"), + grpcclient.InferRequestedOutput("OUTPUT1"), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + user_data = [] + + self.triton_client.async_infer( + model_name="simple", + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Wait until the results are available in 
user_data + time_out = 20 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + time.sleep(2) + + # Try unregister shm regions after inference + self.triton_client.unregister_cuda_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_cuda_shared_memory/test.sh b/qa/L0_cuda_shared_memory/test.sh index 02857b2153..b7126a9295 100755 --- a/qa/L0_cuda_shared_memory/test.sh +++ b/qa/L0_cuda_shared_memory/test.sh @@ -84,6 +84,47 @@ for i in \ done done +mkdir -p python_models/simple/1/ +cp ../python_models/execute_delayed_model/model.py ./python_models/simple/1/ +cp ../python_models/execute_delayed_model/config.pbtxt ./python_models/simple/ +sed -i 's/KIND_CPU/KIND_GPU/g' ./python_models/simple/config.pbtxt + +for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/python_models --log-verbose=1 ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./unregister_shm.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + CLIENT_LOG="./unregister_shm.$client_type.client.log" + set +e + python3 $SHM_TEST TestCudaSharedMemoryUnregister.test_unregister_shm_during_inference_$client_type >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + kill $SERVER_PID + wait $SERVER_PID + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Server shut down non-gracefully\n***" + RET=1 + fi + set -e + done + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else diff --git a/qa/L0_shared_memory/shared_memory_test.py b/qa/L0_shared_memory/shared_memory_test.py index c38ecb4814..871fca9b2a 100755 --- a/qa/L0_shared_memory/shared_memory_test.py +++ b/qa/L0_shared_memory/shared_memory_test.py @@ -31,7 +31,9 @@ sys.path.append("../common") import os +import time import unittest +from functools import partial import infer_util as iu import numpy as np @@ -43,7 +45,7 @@ from tritonclient import utils -class SharedMemoryTest(tu.TestResultCollector): +class SystemSharedMemoryTestBase(tu.TestResultCollector): DEFAULT_SHM_BYTE_SIZE = 64 def setUp(self): @@ -62,6 +64,68 @@ def _setup_client(self): self.url, verbose=True ) + def _configure_server( + self, + create_byte_size=DEFAULT_SHM_BYTE_SIZE, + register_byte_size=DEFAULT_SHM_BYTE_SIZE, + register_offset=0, + ): + """Creates and registers shared memory regions for testing. + + Parameters + ---------- + create_byte_size: int + Size of each system shared memory region to create. + NOTE: This should be sufficiently large to hold the inputs/outputs + stored in shared memory. + + register_byte_size: int + Size of each system shared memory region to register with server. + NOTE: The (offset + register_byte_size) should be less than or equal + to the create_byte_size. Otherwise an exception will be raised for + an invalid set of registration args. + + register_offset: int + Offset into the shared memory object to start the registered region. 
+ + """ + shm_ip0_handle = shm.create_shared_memory_region( + "input0_data", "/input0_data", create_byte_size + ) + shm_ip1_handle = shm.create_shared_memory_region( + "input1_data", "/input1_data", create_byte_size + ) + shm_op0_handle = shm.create_shared_memory_region( + "output0_data", "/output0_data", create_byte_size + ) + shm_op1_handle = shm.create_shared_memory_region( + "output1_data", "/output1_data", create_byte_size + ) + # Implicit assumption that input and output byte_sizes are 64 bytes for now + input0_data = np.arange(start=0, stop=16, dtype=np.int32) + input1_data = np.ones(shape=16, dtype=np.int32) + shm.set_shared_memory_region(shm_ip0_handle, [input0_data]) + shm.set_shared_memory_region(shm_ip1_handle, [input1_data]) + self.triton_client.register_system_shared_memory( + "input0_data", "/input0_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "input1_data", "/input1_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "output0_data", "/output0_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "output1_data", "/output1_data", register_byte_size, offset=register_offset + ) + return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] + + def _cleanup_server(self, shm_handles): + for shm_handle in shm_handles: + shm.destroy_shared_memory_region(shm_handle) + + +class SharedMemoryTest(SystemSharedMemoryTestBase): def test_invalid_create_shm(self): # Raises error since tried to create invalid system shared memory region try: @@ -128,66 +192,6 @@ def test_reregister_after_register(self): self.assertTrue(len(shm_status.regions) == 1) shm.destroy_shared_memory_region(shm_op0_handle) - def _configure_server( - self, - create_byte_size=DEFAULT_SHM_BYTE_SIZE, - register_byte_size=DEFAULT_SHM_BYTE_SIZE, - register_offset=0, - ): - """Creates and registers shared memory regions for testing. - - Parameters - ---------- - create_byte_size: int - Size of each system shared memory region to create. - NOTE: This should be sufficiently large to hold the inputs/outputs - stored in shared memory. - - register_byte_size: int - Size of each system shared memory region to register with server. - NOTE: The (offset + register_byte_size) should be less than or equal - to the create_byte_size. Otherwise an exception will be raised for - an invalid set of registration args. - - register_offset: int - Offset into the shared memory object to start the registered region. 
- - """ - shm_ip0_handle = shm.create_shared_memory_region( - "input0_data", "/input0_data", create_byte_size - ) - shm_ip1_handle = shm.create_shared_memory_region( - "input1_data", "/input1_data", create_byte_size - ) - shm_op0_handle = shm.create_shared_memory_region( - "output0_data", "/output0_data", create_byte_size - ) - shm_op1_handle = shm.create_shared_memory_region( - "output1_data", "/output1_data", create_byte_size - ) - # Implicit assumption that input and output byte_sizes are 64 bytes for now - input0_data = np.arange(start=0, stop=16, dtype=np.int32) - input1_data = np.ones(shape=16, dtype=np.int32) - shm.set_shared_memory_region(shm_ip0_handle, [input0_data]) - shm.set_shared_memory_region(shm_ip1_handle, [input1_data]) - self.triton_client.register_system_shared_memory( - "input0_data", "/input0_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "input1_data", "/input1_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "output0_data", "/output0_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "output1_data", "/output1_data", register_byte_size, offset=register_offset - ) - return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] - - def _cleanup_server(self, shm_handles): - for shm_handle in shm_handles: - shm.destroy_shared_memory_region(shm_handle) - def test_unregister_after_inference(self): # Unregister after inference error_msg = [] @@ -443,5 +447,169 @@ def test_python_client_leak(self): ) +class TestSharedMemoryUnregister(SystemSharedMemoryTestBase): + def _test_unregister_shm_fail(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory() + self.assertIn( + "Failed to unregister the following system shared memory regions: input0_data ,input1_data ,output0_data ,output1_data", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("input0_data") + self.assertIn( + "Cannot unregister shared memory region 'input0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("input1_data") + self.assertIn( + "Cannot unregister shared memory region 'input1_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("output0_data") + self.assertIn( + "Cannot unregister shared memory region 'output0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("output1_data") + self.assertIn( + "Cannot unregister shared memory region 'output1_data', it is currently in use.", + str(ex.exception), + ) + + def _test_shm_not_found(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("input0_data") + self.assertIn( + "Unable to find system shared memory region: 'input0_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + 
second_client.get_system_shared_memory_status("input1_data") + self.assertIn( + "Unable to find system shared memory region: 'input1_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("output0_data") + self.assertIn( + "Unable to find system shared memory region: 'output0_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("output1_data") + self.assertIn( + "Unable to find system shared memory region: 'output1_data'", + str(ex.exception), + ) + + def test_unregister_shm_during_inference_http(self): + try: + self.triton_client.unregister_system_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + httpclient.InferInput("INPUT0", [1, 16], "INT32"), + httpclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0", binary_data=True), + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + async_request = self.triton_client.async_infer( + model_name="simple", inputs=inputs, outputs=outputs + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Blocking call + async_request.get_result() + + # Try unregister shm regions after inference + self.triton_client.unregister_system_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + def test_unregister_shm_during_inference_grpc(self): + try: + self.triton_client.unregister_system_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + grpcclient.InferRequestedOutput("OUTPUT0"), + grpcclient.InferRequestedOutput("OUTPUT1"), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + user_data = [] + + self.triton_client.async_infer( + model_name="simple", + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Wait until the results are available in user_data + time_out = 20 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + time.sleep(2) + + # Try unregister shm regions after inference + self.triton_client.unregister_system_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_shared_memory/test.sh b/qa/L0_shared_memory/test.sh index ba6a2fa8f2..e711de9cff 100755 --- a/qa/L0_shared_memory/test.sh +++ b/qa/L0_shared_memory/test.sh @@ -95,6 +95,46 @@ for i in \ 
done done +mkdir -p python_models/simple/1/ +cp ../python_models/execute_delayed_model/model.py ./python_models/simple/1/ +cp ../python_models/execute_delayed_model/config.pbtxt ./python_models/simple/ + +for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/python_models --log-verbose=1 ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./unregister_shm.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + CLIENT_LOG="./unregister_shm.$client_type.client.log" + set +e + python3 $SHM_TEST TestSharedMemoryUnregister.test_unregister_shm_during_inference_$client_type >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + kill $SERVER_PID + wait $SERVER_PID + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Server shut down non-gracefully\n***" + RET=1 + fi + set -e + done + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else diff --git a/qa/L0_trt_shape_tensors/test.sh b/qa/L0_trt_shape_tensors/test.sh index f08ed339b0..548ebb55af 100755 --- a/qa/L0_trt_shape_tensors/test.sh +++ b/qa/L0_trt_shape_tensors/test.sh @@ -45,7 +45,7 @@ CLIENT_LOG="./client.log" SHAPE_TENSOR_TEST=trt_shape_tensor_test.py SERVER=/opt/tritonserver/bin/tritonserver -SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" SERVER_LOG="./inference_server.log" source ../common/util.sh diff --git a/qa/python_models/execute_delayed_model/config.pbtxt b/qa/python_models/execute_delayed_model/config.pbtxt new file mode 100644 index 0000000000..0a4ee59d3e --- /dev/null +++ b/qa/python_models/execute_delayed_model/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "simple" +backend: "python" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] + +instance_group [ { kind: KIND_CPU }] diff --git a/qa/python_models/execute_delayed_model/model.py b/qa/python_models/execute_delayed_model/model.py new file mode 100644 index 0000000000..055b321a93 --- /dev/null +++ b/qa/python_models/execute_delayed_model/model.py @@ -0,0 +1,72 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
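+# Test helper model: execute() below sleeps for 15 seconds before returning
+# OUTPUT0 = INPUT0 + INPUT1 and OUTPUT1 = INPUT0 - INPUT1, giving the
+# shared-memory unregister tests a window in which a request is still in
+# flight and the regions it uses should not yet be unregisterable.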
+ +import json +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + responses = [] + + time.sleep(15) + + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1] + ) + responses.append(inference_response) + + return responses + + def finalize(self): + print("Cleaning up...") diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index 916230381b..c4ba9338cb 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -158,18 +158,6 @@ InferResponseFree( return nullptr; // Success } -TRITONSERVER_Error* InferGRPCToInputHelper( - const std::string& input_name, const std::string& model_name, - const TRITONSERVER_DataType tensor_dt, const TRITONSERVER_DataType input_dt, - const size_t binary_data_byte_size); - -TRITONSERVER_Error* InferGRPCToInput( - const std::shared_ptr& tritonserver, - const std::shared_ptr& shm_manager, - const inference::ModelInferRequest& request, - std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request); - TRITONSERVER_Error* InferGRPCToInputHelper( const std::string& input_name, const std::string& model_name, @@ -391,7 +379,9 @@ InferGRPCToInput( const std::shared_ptr& shm_manager, const inference::ModelInferRequest& request, std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request) + TRITONSERVER_InferenceRequest* inference_request, + std::vector>* + shm_regions_info) { // Verify that the batch-byte-size of each input matches the size of // the provided tensor data (provided raw or from shared memory) @@ -432,9 +422,14 @@ InferGRPCToInput( .c_str()); } void* tmp; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager->GetMemoryInfo( - region_name, offset, byte_size, &tmp, &memory_type, &memory_type_id)); + region_name, offset, byte_size, &tmp, &memory_type, &memory_type_id, + &shm_info)); base = tmp; + shm_regions_info->emplace_back(shm_info); + if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU RETURN_IF_ERR(shm_manager->GetCUDAHandle( @@ -911,18 +906,32 @@ ModelInferHandler::Execute(InferHandler::State* state) // tensors are present in the request. std::list serialized_data; + // Maintain shared pointers(read-only reference) to the shared memory block's + // information for the shared memory regions used by the request. These + // pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. 
This vector must be cleared in the + // `InferResponseComplete` callback (after inference) to decrease the count + // and permit unregistration. The vector will be included in + // `response_release_payload` for the callback. + std::vector> + shm_regions_info; + if (err == nullptr) { err = InferGRPCToInput( - tritonserver_, shm_manager_, request, &serialized_data, irequest); + tritonserver_, shm_manager_, request, &serialized_data, irequest, + &shm_regions_info); } if (err == nullptr) { err = InferAllocatorPayload( tritonserver_, shm_manager_, request, std::move(serialized_data), - response_queue, &state->alloc_payload_); + response_queue, &state->alloc_payload_, &shm_regions_info); } auto request_release_payload = std::make_unique(state->inference_request_); + auto response_release_payload = std::make_unique( + state, std::move(shm_regions_info)); + if (err == nullptr) { err = TRITONSERVER_InferenceRequestSetReleaseCallback( irequest, InferRequestComplete, @@ -932,7 +941,8 @@ ModelInferHandler::Execute(InferHandler::State* state) err = TRITONSERVER_InferenceRequestSetResponseCallback( irequest, allocator_, &state->alloc_payload_ /* response_allocator_userp */, - InferResponseComplete, reinterpret_cast(state)); + InferResponseComplete, + response_release_payload.get() /* response_userp */); } // Get request ID for logging in case of error. const char* request_id = ""; @@ -970,8 +980,9 @@ ModelInferHandler::Execute(InferHandler::State* state) // to handle gRPC stream cancellation. if (err == nullptr) { state->context_->InsertInflightState(state); - // The payload will be cleaned in request release callback. + // The payload will be cleaned in release callback. request_release_payload.release(); + response_release_payload.release(); } else { // If error go immediately to COMPLETE. LOG_VERBOSE(1) << "[request id: " << request_id << "] " @@ -1000,7 +1011,9 @@ ModelInferHandler::InferResponseComplete( TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags, void* userp) { - State* state = reinterpret_cast(userp); + ResponseReleasePayload* response_release_payload( + static_cast(userp)); + auto state = response_release_payload->state_; // There are multiple handlers registered in the gRPC service // Hence, we would need to properly synchronize this thread @@ -1042,6 +1055,7 @@ ModelInferHandler::InferResponseComplete( // in the next cycle. 
state->context_->PutTaskBackToQueue(state); + delete response_release_payload; return; } @@ -1104,6 +1118,8 @@ ModelInferHandler::InferResponseComplete( if (response_created) { delete response; } + + delete response_release_payload; } }}} // namespace triton::server::grpc diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 51307d4ae0..87536dd173 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -299,7 +299,9 @@ InferAllocatorPayload( const inference::ModelInferRequest& request, std::list&& serialized_data, std::shared_ptr> response_queue, - AllocPayload* alloc_payload) + AllocPayload* alloc_payload, + std::vector>* + shm_regions_info) { alloc_payload->response_queue_ = response_queue; alloc_payload->shm_map_.clear(); @@ -335,9 +337,12 @@ InferAllocatorPayload( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager->GetMemoryInfo( - region_name, offset, byte_size, &base, &memory_type, - &memory_type_id)); + region_name, offset, byte_size, &base, &memory_type, &memory_type_id, + &shm_info)); + shm_regions_info->emplace_back(shm_info); if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU @@ -373,7 +378,9 @@ TRITONSERVER_Error* InferGRPCToInput( const std::shared_ptr& shm_manager, const inference::ModelInferRequest& request, std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request); + TRITONSERVER_InferenceRequest* inference_request, + std::vector>* + shm_regions_info); TRITONSERVER_Error* ResponseAllocatorHelper( TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name, @@ -1263,6 +1270,23 @@ class InferHandler : public HandlerBase { delete state; } + // Simple structure that carries the payload needed for + // response release callback. + struct ResponseReleasePayload final { + State* state_; + std::vector> + shm_regions_info_; + + ResponseReleasePayload( + State* state, + std::vector< + std::shared_ptr>&& + shm_regions_info) + : state_(state), shm_regions_info_(std::move(shm_regions_info)) + { + } + }; + virtual void StartNewRequest() = 0; virtual bool Process(State* state, bool rpc_ok) = 0; bool ExecutePrecondition(InferHandler::State* state); diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 6651eca813..1f554db83c 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -282,18 +282,32 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // tensors are present in the request. std::list serialized_data; + // Maintain shared pointers(read-only reference) to the shared memory + // block's information for the shared memory regions used by the request. + // These pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. This vector must be cleared in the + // `StreamInferResponseComplete` callback (after inference) to decrease the + // count and permit unregistration. The vector will be included in + // `response_release_payload` for the callback. 
+ std::vector> + shm_regions_info; + if (err == nullptr) { err = InferGRPCToInput( - tritonserver_, shm_manager_, request, &serialized_data, irequest); + tritonserver_, shm_manager_, request, &serialized_data, irequest, + &shm_regions_info); } if (err == nullptr) { err = InferAllocatorPayload( tritonserver_, shm_manager_, request, std::move(serialized_data), - response_queue_, &state->alloc_payload_); + response_queue_, &state->alloc_payload_, &shm_regions_info); } auto request_release_payload = std::make_unique(state->inference_request_); + auto response_release_payload = std::make_unique( + state, std::move(shm_regions_info)); + if (err == nullptr) { err = TRITONSERVER_InferenceRequestSetReleaseCallback( irequest, InferRequestComplete, @@ -303,7 +317,8 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) err = TRITONSERVER_InferenceRequestSetResponseCallback( irequest, allocator_, &state->alloc_payload_ /* response_allocator_userp */, - StreamInferResponseComplete, reinterpret_cast(state)); + StreamInferResponseComplete, + response_release_payload.get() /* response_userp */); } if (err == nullptr) { @@ -330,8 +345,9 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // irequest to handle gRPC stream cancellation. if (err == nullptr) { state->context_->InsertInflightState(state); - // The payload will be cleaned in request release callback. + // The payload will be cleaned in release callback. request_release_payload.release(); + response_release_payload.release(); } else { // If there was an error then enqueue the error response and show // it to be ready for writing. @@ -594,7 +610,10 @@ ModelStreamInferHandler::StreamInferResponseComplete( TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags, void* userp) { - State* state = reinterpret_cast(userp); + ResponseReleasePayload* response_release_payload( + static_cast(userp)); + auto state = response_release_payload->state_; + // Ignore Response from CORE in case GRPC Strict as we dont care about if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { std::lock_guard lock(state->context_->mu_); @@ -648,6 +667,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( if (is_complete) { state->step_ = Steps::CANCELLED; state->context_->PutTaskBackToQueue(state); + delete response_release_payload; } state->complete_ = is_complete; @@ -695,6 +715,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_TRITONSERVER_ERROR( TRITONSERVER_InferenceResponseDelete(iresponse), "deleting GRPC inference response"); + delete response_release_payload; return; } } @@ -774,6 +795,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( if (is_complete) { state->step_ = Steps::CANCELLED; state->context_->PutTaskBackToQueue(state); + delete response_release_payload; } state->complete_ = is_complete; @@ -818,6 +840,10 @@ ModelStreamInferHandler::StreamInferResponseComplete( } state->complete_ = is_complete; } + + if (is_complete) { + delete response_release_payload; + } } // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, diff --git a/src/http_server.cc b/src/http_server.cc index cfd1da88ae..2fa395fc98 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -2681,9 +2681,13 @@ HTTPAPIServer::ParseJsonTritonIO( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager_->GetMemoryInfo( shm_region, shm_offset, byte_size, &base, &memory_type, - &memory_type_id)); + 
&memory_type_id, &shm_info)); + infer_req->AddShmRegionInfo(shm_info); + if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU cudaIpcMemHandle_t* cuda_handle; @@ -2796,9 +2800,12 @@ HTTPAPIServer::ParseJsonTritonIO( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager_->GetMemoryInfo( - shm_region, offset, byte_size, &base, &memory_type, - &memory_type_id)); + shm_region, offset, byte_size, &base, &memory_type, &memory_type_id, + &shm_info)); + infer_req->AddShmRegionInfo(shm_info); if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU diff --git a/src/http_server.h b/src/http_server.h index 3ad3d60cc4..3949f97e27 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -311,6 +311,13 @@ class HTTPAPIServer : public HTTPServer { static void ReplyCallback(evthr_t* thr, void* arg, void* shared); + void AddShmRegionInfo( + const std::shared_ptr& + shm_info) + { + shm_regions_info_.push_back(shm_info); + } + protected: TRITONSERVER_Server* server_{nullptr}; evhtp_request_t* req_{nullptr}; @@ -330,6 +337,14 @@ class HTTPAPIServer : public HTTPServer { // TRITONSERVER_ServerInferAsync (except for cancellation). std::shared_ptr triton_request_{nullptr}; + // Maintain shared pointers(read-only reference) to the shared memory + // block's information for the shared memory regions used by the request. + // These pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. This vector must be cleared when no + // longer needed to decrease the count and permit unregistration. + std::vector> + shm_regions_info_; + evhtp_res response_code_{EVHTP_RES_OK}; }; diff --git a/src/shared_memory_manager.cc b/src/shared_memory_manager.cc index 1f4a77e887..7b845709a1 100644 --- a/src/shared_memory_manager.cc +++ b/src/shared_memory_manager.cc @@ -69,7 +69,8 @@ TRITONSERVER_Error* SharedMemoryManager::GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id) + int64_t* device_id, + std::shared_ptr* shm_info) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_UNSUPPORTED, @@ -408,9 +409,9 @@ SharedMemoryManager::RegisterSystemSharedMemory( } shared_memory_map_.insert(std::make_pair( - name, std::unique_ptr(new SharedMemoryInfo( + name, std::make_shared( name, shm_key, offset, byte_size, shm_fd, mapped_addr, - TRITONSERVER_MEMORY_CPU, 0)))); + TRITONSERVER_MEMORY_CPU, 0))); return nullptr; // success } @@ -444,9 +445,9 @@ SharedMemoryManager::RegisterCUDASharedMemory( name, reinterpret_cast(mapped_addr), byte_size)); shared_memory_map_.insert(std::make_pair( - name, std::unique_ptr(new CUDASharedMemoryInfo( + name, std::make_shared( name, "", 0, byte_size, 0, mapped_addr, TRITONSERVER_MEMORY_GPU, - device_id, cuda_shm_handle)))); + device_id, cuda_shm_handle))); return nullptr; // success } @@ -456,7 +457,8 @@ TRITONSERVER_Error* SharedMemoryManager::GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id) + int64_t* device_id, + std::shared_ptr* shm_info) { // protect shared_memory_map_ from concurrent access std::lock_guard lock(mu_); @@ -494,6 +496,10 @@ SharedMemoryManager::GetMemoryInfo( .c_str()); } + if (shm_info != nullptr) { + *shm_info = std::static_pointer_cast(it->second); + } + if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) { 
*shm_mapped_addr = (void*)((uint8_t*)it->second->mapped_addr_ + it->second->offset_ + offset); @@ -561,11 +567,19 @@ SharedMemoryManager::GetStatus( } else { auto it = shared_memory_map_.find(name); if (it == shared_memory_map_.end()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - std::string( - "Unable to find system shared memory region: '" + name + "'") - .c_str()); + if (memory_type == TRITONSERVER_MEMORY_GPU) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + std::string( + "Unable to find cuda shared memory region: '" + name + "'") + .c_str()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + std::string( + "Unable to find system shared memory region: '" + name + "'") + .c_str()); + } } if (it->second->kind_ != memory_type) { @@ -632,6 +646,7 @@ SharedMemoryManager::UnregisterAll(TRITONSERVER_MemoryType memory_type) TRITONSERVER_Error* err = UnregisterHelper(it->first, memory_type); if (err != nullptr) { unregister_fails.push_back(it->first); + LOG_VERBOSE(1) << TRITONSERVER_ErrorMessage(err); } } } @@ -645,6 +660,7 @@ SharedMemoryManager::UnregisterAll(TRITONSERVER_MemoryType memory_type) ; if (err != nullptr) { unregister_fails.push_back(it->first); + LOG_VERBOSE(1) << TRITONSERVER_ErrorMessage(err); } } } @@ -669,6 +685,15 @@ SharedMemoryManager::UnregisterHelper( // Must hold the lock on register_mu_ while calling this function. auto it = shared_memory_map_.find(name); if (it != shared_memory_map_.end() && it->second->kind_ == memory_type) { + if (it->second.use_count() > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "Cannot unregister shared memory region '" + name + + "', it is currently in use.") + .c_str()); + } + if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) { RETURN_IF_ERR( UnmapSharedMemory(it->second->mapped_addr_, it->second->byte_size_)); diff --git a/src/shared_memory_manager.h b/src/shared_memory_manager.h index 51eb0f0786..393fd29128 100644 --- a/src/shared_memory_manager.h +++ b/src/shared_memory_manager.h @@ -1,4 +1,4 @@ -// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -50,6 +50,48 @@ class SharedMemoryManager { SharedMemoryManager() = default; ~SharedMemoryManager(); + /// A struct that records the shared memory regions registered by the shared + /// memory manager. 
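+  /// Instances are owned through std::shared_ptr: request handlers may hold
+  /// additional read-only references while an inference that uses the region
+  /// is in flight, and UnregisterHelper() refuses to unregister a region whose
+  /// use count shows such outstanding references.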
+ struct SharedMemoryInfo { + SharedMemoryInfo( + const std::string& name, const std::string& shm_key, + const size_t offset, const size_t byte_size, int shm_fd, + void* mapped_addr, const TRITONSERVER_MemoryType kind, + const int64_t device_id) + : name_(name), shm_key_(shm_key), offset_(offset), + byte_size_(byte_size), shm_fd_(shm_fd), mapped_addr_(mapped_addr), + kind_(kind), device_id_(device_id) + { + } + + std::string name_; + std::string shm_key_; + size_t offset_; + size_t byte_size_; + int shm_fd_; + void* mapped_addr_; + TRITONSERVER_MemoryType kind_; + int64_t device_id_; + }; + +#ifdef TRITON_ENABLE_GPU + struct CUDASharedMemoryInfo : SharedMemoryInfo { + CUDASharedMemoryInfo( + const std::string& name, const std::string& shm_key, + const size_t offset, const size_t byte_size, int shm_fd, + void* mapped_addr, const TRITONSERVER_MemoryType kind, + const int64_t device_id, const cudaIpcMemHandle_t* cuda_ipc_handle) + : SharedMemoryInfo( + name, shm_key, offset, byte_size, shm_fd, mapped_addr, kind, + device_id), + cuda_ipc_handle_(*cuda_ipc_handle) + { + } + + cudaIpcMemHandle_t cuda_ipc_handle_; + }; +#endif + /// Add a shared memory block representing shared memory in system /// (CPU) memory to the manager. Return TRITONSERVER_ERROR_ALREADY_EXISTS /// if a shared memory block of the same name already exists in the manager. @@ -90,11 +132,18 @@ class SharedMemoryManager { /// \param memory_type Returns the type of the memory /// \param device_id Returns the device id associated with the /// memory block - /// \return a TRITONSERVER_Error indicating success or failure. + /// \param shm_info Returns a shared pointer reference(read-only) to the + /// shared memory block's information. + /// This pointer will automatically increase the usage count, preventing + /// unregistration while the reference is held. The reference must be cleared + /// or set to nullptr when no longer needed, to decrease the count and allow + /// unregistration. + /// \return a TRITONSERVER_Error indicating success or + /// failure. TRITONSERVER_Error* GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id); + int64_t* device_id, std::shared_ptr* shm_info); #ifdef TRITON_ENABLE_GPU /// Get the CUDA memory handle associated with the block name. @@ -139,50 +188,8 @@ class SharedMemoryManager { TRITONSERVER_Error* UnregisterHelper( const std::string& name, TRITONSERVER_MemoryType memory_type); - /// A struct that records the shared memory regions registered by the shared - /// memory manager. 
- struct SharedMemoryInfo { - SharedMemoryInfo( - const std::string& name, const std::string& shm_key, - const size_t offset, const size_t byte_size, int shm_fd, - void* mapped_addr, const TRITONSERVER_MemoryType kind, - const int64_t device_id) - : name_(name), shm_key_(shm_key), offset_(offset), - byte_size_(byte_size), shm_fd_(shm_fd), mapped_addr_(mapped_addr), - kind_(kind), device_id_(device_id) - { - } - - std::string name_; - std::string shm_key_; - size_t offset_; - size_t byte_size_; - int shm_fd_; - void* mapped_addr_; - TRITONSERVER_MemoryType kind_; - int64_t device_id_; - }; - -#ifdef TRITON_ENABLE_GPU - struct CUDASharedMemoryInfo : SharedMemoryInfo { - CUDASharedMemoryInfo( - const std::string& name, const std::string& shm_key, - const size_t offset, const size_t byte_size, int shm_fd, - void* mapped_addr, const TRITONSERVER_MemoryType kind, - const int64_t device_id, const cudaIpcMemHandle_t* cuda_ipc_handle) - : SharedMemoryInfo( - name, shm_key, offset, byte_size, shm_fd, mapped_addr, kind, - device_id), - cuda_ipc_handle_(*cuda_ipc_handle) - { - } - - cudaIpcMemHandle_t cuda_ipc_handle_; - }; -#endif - using SharedMemoryStateMap = - std::map>; + std::map>; // A map between the name and the details of the associated // shared memory block SharedMemoryStateMap shared_memory_map_; From 363bcdcd03cddcd00979c7fd3315557328221c6d Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Wed, 11 Sep 2024 16:27:28 -0700 Subject: [PATCH 27/44] build/test: RHEL8 EA3 (#7595) --- build.py | 11 +++++++---- qa/L0_sequence_batcher/test.sh | 21 ++++++++++++++++++--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/build.py b/build.py index 3195c50cbb..4d4d911468 100755 --- a/build.py +++ b/build.py @@ -1374,12 +1374,15 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach if enable_gpu: df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine) - df += """ + # This segment will break the RHEL SBSA build. Need to determine whether + # this is necessary to incorporate. + if target_platform() != "rhel": + df += """ # Extra defensive wiring for CUDA Compat lib RUN ln -sf ${_CUDA_COMPAT_PATH}/lib.real ${_CUDA_COMPAT_PATH}/lib \\ - && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\ - && ldconfig \\ - && rm -f ${_CUDA_COMPAT_PATH}/lib + && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\ + && ldconfig \\ + && rm -f ${_CUDA_COMPAT_PATH}/lib """ else: df += add_cpu_libs_to_linux_dockerfile(backends, target_machine) diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh index 23ee387b55..ac34458b4e 100755 --- a/qa/L0_sequence_batcher/test.sh +++ b/qa/L0_sequence_batcher/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -183,6 +183,16 @@ export USE_SINGLE_BUFFER # models4 - four instances with batch-size 1 rm -fr *.log models{0,1,2,4} queue_delay_models && mkdir models{0,1,2,4} queue_delay_models +# Search BACKENDS to determine if a backend should be tested +function should_test_backend() { + local target_backend=$1 + if [[ $(echo "${BACKENDS[@]}" | grep -c "${target_backend}") -ne 0 ]]; then + echo "true" + return + fi + echo "false" +} + # Get the datatype to use based on the backend function get_datatype () { local dtype="int32 bool" @@ -827,8 +837,13 @@ fi ### Start Preserve Ordering Tests ### -# Test only supported on windows currently due to use of python backend models -if [ ${WINDOWS} -ne 1 ]; then +# FIXME: Test only supported on windows currently due to use of python backend models. +# Now that Windows supports the PYBE, we should check that this tests works once Windows +# CI is stable. + +# These subtests use python models. They should not be executed if 'python' is not one +# of the backends under test. +if [[ $(should_test_backend "python") == "true" && !( -v WSL_DISTRO_NAME || -v MSYSTEM )]]; then # Test preserve ordering true/false and decoupled/non-decoupled TEST_CASE=SequenceBatcherPreserveOrderingTest MODEL_PATH=preserve_ordering_models From 68d4c01e4491e6bb033a4063b67eb41b55cb4ea4 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Tue, 17 Sep 2024 23:37:30 +0530 Subject: [PATCH 28/44] Fix: Add mutex lock for state completion check in gRPC streaming to prevent race condition (#7617) --- src/grpc/stream_infer_handler.cc | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 1f554db83c..cf788b1e09 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -537,15 +537,18 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) } else if (state->step_ == Steps::WRITEREADY) { // Finish the state if all the transactions associated with // the state have completed. - if (state->IsComplete()) { - state->context_->DecrementRequestCounter(); - finished = Finish(state); - } else { - LOG_ERROR << "Should not print this! Decoupled should NOT write via " - "WRITEREADY!"; - // Remove the state from the completion queue - std::lock_guard lock(state->step_mtx_); - state->step_ = Steps::ISSUED; + std::lock_guard lk1(state->context_->mu_); + { + if (state->IsComplete()) { + state->context_->DecrementRequestCounter(); + finished = Finish(state); + } else { + LOG_ERROR << "Should not print this! 
Decoupled should NOT write via " + "WRITEREADY!"; + // Remove the state from the completion queue + std::lock_guard lock(state->step_mtx_); + state->step_ = Steps::ISSUED; + } } } } From 7dde2688c3f320d8314bba158efc59b727abc666 Mon Sep 17 00:00:00 2001 From: vd-nv Date: Fri, 20 Sep 2024 06:30:00 +0800 Subject: [PATCH 29/44] Update fetch_models.sh (#7621) --- docs/examples/fetch_models.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/fetch_models.sh b/docs/examples/fetch_models.sh index 5594878b3e..f5aaed85aa 100755 --- a/docs/examples/fetch_models.sh +++ b/docs/examples/fetch_models.sh @@ -37,4 +37,4 @@ mv /tmp/inception_v3_2016_08_28_frozen.pb model_repository/inception_graphdef/1/ # ONNX densenet mkdir -p model_repository/densenet_onnx/1 wget -O model_repository/densenet_onnx/1/model.onnx \ - https://contentmamluswest001.blob.core.windows.net/content/14b2744cf8d6418c87ffddc3f3127242/9502630827244d60a1214f250e3bbca7/08aed7327d694b8dbaee2c97b8d0fcba/densenet121-1.2.onnx + https://github.com/onnx/models/raw/main/validated/vision/classification/densenet-121/model/densenet-7.onnx From dbb064ff930c876b5de46088f675bd60756c2969 Mon Sep 17 00:00:00 2001 From: Harshini Komali <157742537+lkomali@users.noreply.github.com> Date: Fri, 20 Sep 2024 14:47:08 -0700 Subject: [PATCH 30/44] ci: Set stability factor to a higher value (#7634) --- qa/L0_perf_analyzer_capi/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_perf_analyzer_capi/test.sh b/qa/L0_perf_analyzer_capi/test.sh index d031e2cacf..3e3f9e4af6 100755 --- a/qa/L0_perf_analyzer_capi/test.sh +++ b/qa/L0_perf_analyzer_capi/test.sh @@ -56,7 +56,7 @@ SHAPETENSORADTAFILE=`pwd`/../common/perf_analyzer_input_data_json/shape_tensor_d ERROR_STRING="error | Request count: 0 | : 0 infer/sec" -STABILITY_THRESHOLD="15" +STABILITY_THRESHOLD="9999" source ../common/util.sh From 92255d7a754679985e12649b5d8887259031e9d6 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:08:40 -0700 Subject: [PATCH 31/44] [docs] Removed vLLM meetup announcement (#7673) --- README.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/README.md b/README.md index da80cc3a2b..63b23a9c15 100644 --- a/README.md +++ b/README.md @@ -28,17 +28,6 @@ # Triton Inference Server -📣 **vLLM x Triton Meetup at Fort Mason on Sept 9th 4:00 - 9:00 pm** - -We are excited to announce that we will be hosting our Triton user meetup with the vLLM team at -[Fort Mason](https://maps.app.goo.gl/9Lr3fxRssrpQCGK58) on Sept 9th 4:00 - 9:00 pm. Join us for this -exclusive event where you will learn about the newest vLLM and Triton features, get a -glimpse into the roadmaps, and connect with fellow users, the NVIDIA Triton and vLLM teams. Seating is limited and registration confirmation -is required to attend - please register [here](https://lu.ma/87q3nvnh) to join -the meetup. - -___ - [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) [!WARNING] From bfe2a2bec753e4e9002183b7817c849596b65dbc Mon Sep 17 00:00:00 2001 From: pvijayakrish Date: Wed, 25 Sep 2024 16:33:21 -0700 Subject: [PATCH 32/44] Update the versions post 24.09 release. 
--- build.py | 2 +- deploy/aws/values.yaml | 2 +- deploy/fleetcommand/Chart.yaml | 2 +- deploy/fleetcommand/values.yaml | 6 +++--- deploy/gcp/values.yaml | 2 +- .../perf-analyzer-script/triton_client.yaml | 2 +- .../server-deployer/build_and_push.sh | 6 +++--- .../server-deployer/chart/triton/Chart.yaml | 4 ++-- .../server-deployer/chart/triton/values.yaml | 6 +++--- .../server-deployer/data-test/schema.yaml | 2 +- .../server-deployer/schema.yaml | 4 ++-- .../gke-marketplace-app/trt-engine/README.md | 6 +++--- deploy/k8s-onprem/values.yaml | 2 +- deploy/oci/values.yaml | 2 +- docs/customization_guide/build.md | 6 +++--- docs/customization_guide/compose.md | 18 +++++++++--------- docs/customization_guide/test.md | 2 +- docs/generate_docs.py | 4 ++-- docs/user_guide/custom_operations.md | 6 +++--- docs/user_guide/performance_tuning.md | 4 ++-- qa/common/gen_jetson_trt_models | 2 +- qa/common/gen_qa_custom_ops | 2 +- qa/common/gen_qa_model_repository | 2 +- 23 files changed, 47 insertions(+), 47 deletions(-) diff --git a/build.py b/build.py index 4d4d911468..36aaa161bc 100755 --- a/build.py +++ b/build.py @@ -72,7 +72,7 @@ TRITON_VERSION_MAP = { "2.50.0dev": ( "24.09dev", # triton container - "24.08", # upstream container + "24.09", # upstream container "1.18.1", # ORT "2024.0.0", # ORT OpenVINO "2024.0.0", # Standalone OpenVINO diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml index 67ecba6c53..bd8ae0fe3b 100644 --- a/deploy/aws/values.yaml +++ b/deploy/aws/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.08-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml index 68aaf8f405..8feee92b3c 100644 --- a/deploy/fleetcommand/Chart.yaml +++ b/deploy/fleetcommand/Chart.yaml @@ -26,7 +26,7 @@ apiVersion: v1 # appVersion is the Triton version; update when changing release -appVersion: "2.49.0" +appVersion: "2.50.0" description: Triton Inference Server (Fleet Command) name: triton-inference-server # version is the Chart version; update when changing anything in the chart diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml index f3036d5bee..dc5f37ca3b 100644 --- a/deploy/fleetcommand/values.yaml +++ b/deploy/fleetcommand/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.08-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent numGpus: 1 serverCommand: tritonserver @@ -47,13 +47,13 @@ image: # # To set model control mode, uncomment and configure below # TODO: Fix the following url, it is invalid - # See https://github.com/triton-inference-server/server/blob/r24.08/docs/model_management.md + # See https://github.com/triton-inference-server/server/blob/r24.09/docs/model_management.md # for more details #- --model-control-mode=explicit|poll|none # # Additional server args # - # see https://github.com/triton-inference-server/server/blob/r24.08/README.md + # see https://github.com/triton-inference-server/server/blob/r24.09/README.md # for more details service: diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml index c25bcf58ce..c5427c151e 100644 --- a/deploy/gcp/values.yaml +++ b/deploy/gcp/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.08-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 
pullPolicy: IfNotPresent modelRepositoryPath: gs://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml index 4b896a1ac7..a63a12ce34 100644 --- a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml +++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml @@ -33,7 +33,7 @@ metadata: namespace: default spec: containers: - - image: nvcr.io/nvidia/tritonserver:24.08-py3-sdk + - image: nvcr.io/nvidia/tritonserver:24.09-py3-sdk imagePullPolicy: Always name: nv-triton-client securityContext: diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh index cc5fa998b4..19d84816a0 100755 --- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh +++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh @@ -27,9 +27,9 @@ export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/') export APP_NAME=tritonserver -export MAJOR_VERSION=2.49 -export MINOR_VERSION=2.49.0 -export NGC_VERSION=24.08-py3 +export MAJOR_VERSION=2.50 +export MINOR_VERSION=2.50.0 +export NGC_VERSION=24.09-py3 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml index 41e2e8cdb9..e9f8880a0b 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. apiVersion: v1 -appVersion: "2.49" +appVersion: "2.50" description: Triton Inference Server name: triton-inference-server -version: 2.49.0 +version: 2.50.0 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml index 7f8a96608f..450d8f735c 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml @@ -31,14 +31,14 @@ maxReplicaCount: 3 tritonProtocol: HTTP # HPA GPU utilization autoscaling target HPATargetAverageValue: 85 -modelRepositoryPath: gs://triton_sample_models/24.08 -publishedVersion: '2.49.0' +modelRepositoryPath: gs://triton_sample_models/24.09 +publishedVersion: '2.50.0' gcpMarketplace: true image: registry: gcr.io repository: nvidia-ngc-public/tritonserver - tag: 24.08-py3 + tag: 24.09-py3 pullPolicy: IfNotPresent # modify the model repository here to match your GCP storage bucket numGpus: 1 diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml index 356b3cce0f..16494b5261 100644 --- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.49.0' + publishedVersion: '2.50.0' publishedVersionMetadata: releaseNote: >- Initial release. 
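The chart keys bumped above (`image.tag`, `publishedVersion`, `modelRepositoryPath`) are ordinary Helm values, so they can also be pinned at install time. A minimal sketch only; the release name, namespace and bucket below are made-up placeholders, not part of this patch:

```bash
# Hypothetical overrides; key names mirror server-deployer/chart/triton/values.yaml
helm install triton-server ./chart/triton \
  --namespace triton \
  --set image.tag=24.09-py3 \
  --set publishedVersion=2.50.0 \
  --set modelRepositoryPath=gs://my-model-bucket/model_repository
```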
diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml index 076f62e05b..f3525a52f1 100644 --- a/deploy/gke-marketplace-app/server-deployer/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.49.0' + publishedVersion: '2.50.0' publishedVersionMetadata: releaseNote: >- Initial release. @@ -89,7 +89,7 @@ properties: modelRepositoryPath: type: string title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. - default: gs://triton_sample_models/24.08 + default: gs://triton_sample_models/24.09 image.ldPreloadPath: type: string title: Leave this empty by default. Triton allows users to create custom layers for backend such as TensorRT plugin or Tensorflow custom ops, the compiled shared library must be provided via LD_PRELOAD environment variable. diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md index aa8fa2a399..0c8012eb68 100644 --- a/deploy/gke-marketplace-app/trt-engine/README.md +++ b/deploy/gke-marketplace-app/trt-engine/README.md @@ -33,7 +33,7 @@ ``` docker run --gpus all -it --network host \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ - -v ~:/scripts nvcr.io/nvidia/tensorrt:24.08-py3 + -v ~:/scripts nvcr.io/nvidia/tensorrt:24.09-py3 pip install onnx six torch tf2onnx tensorflow @@ -57,7 +57,7 @@ mkdir -p engines python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh -gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.08/bert/1/model.plan +gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.09/bert/1/model.plan ``` -For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.08/` should be updated accordingly with the correct version. +For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.09/` should be updated accordingly with the correct version. 
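The trt-engine README above hard-codes the release twice, once in the TensorRT container tag and once in the `gs://triton_sample_models/...` path, which is why it must be edited on every upgrade. A rough sketch of the same two steps with the version factored into a variable; `NGC_VERSION` is an assumed placeholder, not something defined by this patch:

```bash
# NGC_VERSION is a hypothetical variable, e.g. 24.09
export NGC_VERSION=24.09

docker run --gpus all -it --network host \
    --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
    -v ~:/scripts nvcr.io/nvidia/tensorrt:${NGC_VERSION}-py3

# ... build bert_large_int8_bs1_s128.engine inside the container as described above ...

gsutil cp bert_large_int8_bs1_s128.engine \
    gs://triton_sample_models/${NGC_VERSION}/bert/1/model.plan
```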
diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml index f0f28b68e1..ccee5e9c24 100644 --- a/deploy/k8s-onprem/values.yaml +++ b/deploy/k8s-onprem/values.yaml @@ -29,7 +29,7 @@ tags: loadBalancing: true image: - imageName: nvcr.io/nvidia/tritonserver:24.08-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent modelRepositoryServer: < Replace with the IP Address of your file server > modelRepositoryPath: /srv/models diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml index bf83490db4..55b8193ee2 100644 --- a/deploy/oci/values.yaml +++ b/deploy/oci/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.08-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository numGpus: 1 diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index f0f3bd99e2..56e5875776 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -173,7 +173,7 @@ $ ./build.py ... --repo-tag=common: --repo-tag=core:` will default to the branch name. For example, if you are building on the -r24.08 branch, `` will default to r24.08. If you are +r24.09 branch, `` will default to r24.09. If you are building on any other branch (including the *main* branch) then `` will default to "main". Therefore, you typically do not need to provide `` at all (nor the preceding @@ -334,8 +334,8 @@ python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild If you are building on *main* branch then `` will default to "main". If you are building on a release branch then `` will default to the branch name. For example, if you -are building on the r24.08 branch, `` will default to -r24.08. Therefore, you typically do not need to provide `` will default to +r24.09. Therefore, you typically do not need to provide `` at all (nor the preceding colon). You can use a different `` for a component to instead use the corresponding branch/tag in the build. For example, if you have a branch called diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md index 563061c317..0c6afc1e0b 100644 --- a/docs/customization_guide/compose.md +++ b/docs/customization_guide/compose.md @@ -46,8 +46,8 @@ The `compose.py` script can be found in the Simply clone the repository and run `compose.py` to create a custom container. Note: Created container version will depend on the branch that was cloned. For example branch - [r24.08](https://github.com/triton-inference-server/server/tree/r24.08) -should be used to create a image based on the NGC 24.08 Triton release. + [r24.09](https://github.com/triton-inference-server/server/tree/r24.09) +should be used to create a image based on the NGC 24.09 Triton release. `compose.py` provides `--backend`, `--repoagent` options that allow you to specify which backends and repository agents to include in the custom image. 
@@ -79,20 +79,20 @@ For example, running ``` python3 compose.py --backend pytorch --repoagent checksum ``` -on branch [r24.08](https://github.com/triton-inference-server/server/tree/r24.08) pulls: -- `min` container `nvcr.io/nvidia/tritonserver:24.08-py3-min` -- `full` container `nvcr.io/nvidia/tritonserver:24.08-py3` +on branch [r24.09](https://github.com/triton-inference-server/server/tree/r24.09) pulls: +- `min` container `nvcr.io/nvidia/tritonserver:24.09-py3-min` +- `full` container `nvcr.io/nvidia/tritonserver:24.09-py3` Alternatively, users can specify the version of Triton container to pull from any branch by either: 1. Adding flag `--container-version ` to branch ``` -python3 compose.py --backend pytorch --repoagent checksum --container-version 24.08 +python3 compose.py --backend pytorch --repoagent checksum --container-version 24.09 ``` 2. Specifying `--image min, --image full,`. The user is responsible for specifying compatible `min` and `full` containers. ``` -python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.08-py3-min --image full,nvcr.io/nvidia/tritonserver:24.08-py3 +python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.09-py3-min --image full,nvcr.io/nvidia/tritonserver:24.09-py3 ``` Method 1 and 2 will result in the same composed container. Furthermore, `--image` flag overrides the `--container-version` flag when both are specified. @@ -103,8 +103,8 @@ Note: 2. vLLM and TensorRT-LLM backends are currently not supported backends for `compose.py`. If you want to build additional backends on top of these backends, it would be better to [build it yourself](#build-it-yourself) by using -`nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3` or -`nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3` as a `min` container. +`nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3` or +`nvcr.io/nvidia/tritonserver:24.09-trtllm-python-py3` as a `min` container. ### CPU-only container composition diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md index 898267e34f..8487e6e3ad 100644 --- a/docs/customization_guide/test.md +++ b/docs/customization_guide/test.md @@ -49,7 +49,7 @@ $ ./gen_qa_custom_ops ``` This will create multiple model repositories in /tmp/\/qa_* -(for example /tmp/24.08/qa_model_repository). The TensorRT models +(for example /tmp/24.09/qa_model_repository). The TensorRT models will be created for the GPU on the system that CUDA considers device 0 (zero). If you have multiple GPUs on your system see the documentation in the scripts for how to target a specific GPU. diff --git a/docs/generate_docs.py b/docs/generate_docs.py index 3cb9de4bc6..cb7ed02d9f 100755 --- a/docs/generate_docs.py +++ b/docs/generate_docs.py @@ -43,11 +43,11 @@ """ TODO: Needs to handle cross-branch linkage. -For example, server/docs/user_guide/architecture.md on branch 24.08 links to +For example, server/docs/user_guide/architecture.md on branch 24.09 links to server/docs/user_guide/model_analyzer.md on main branch. In this case, the hyperlink of model_analyzer.md should be a URL instead of relative path. -Another example can be server/docs/user_guide/model_analyzer.md on branch 24.08 +Another example can be server/docs/user_guide/model_analyzer.md on branch 24.09 links to a file in server repo with relative path. Currently all URLs are hardcoded to main branch. We need to make sure that the URL actually points to the correct branch. 
We also need to handle cases like deprecated or removed files from diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md index 6fa3cee3dc..88a7037c7f 100644 --- a/docs/user_guide/custom_operations.md +++ b/docs/user_guide/custom_operations.md @@ -64,7 +64,7 @@ simple way to ensure you are using the correct version of TensorRT is to use the [NGC TensorRT container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) corresponding to the Triton container. For example, if you are using -the 24.08 version of Triton, use the 24.08 version of the TensorRT +the 24.09 version of Triton, use the 24.09 version of the TensorRT container. ## TensorFlow @@ -123,7 +123,7 @@ simple way to ensure you are using the correct version of TensorFlow is to use the [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) corresponding to the Triton container. For example, if you are using -the 24.08 version of Triton, use the 24.08 version of the TensorFlow +the 24.09 version of Triton, use the 24.09 version of the TensorFlow container. ## PyTorch @@ -167,7 +167,7 @@ simple way to ensure you are using the correct version of PyTorch is to use the [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) corresponding to the Triton container. For example, if you are using -the 24.08 version of Triton, use the 24.08 version of the PyTorch +the 24.09 version of Triton, use the 24.09 version of the PyTorch container. ## ONNX diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index 70e76cd5ef..efea32a63b 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -235,7 +235,7 @@ with a `tritonserver` binary. 
```bash # Start server container -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.08-py3 +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.09-py3 # Start serving your models tritonserver --model-repository=/mnt/models @@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u ```bash # Start the SDK container interactively -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.08-py3-sdk +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.09-py3-sdk # Benchmark model being served from step 3 perf_analyzer -m densenet_onnx --concurrency-range 1:4 diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 892b8dd383..21e9fe53ff 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -34,7 +34,7 @@ # Make all generated files accessible outside of container umask 0000 # Set the version of the models -TRITON_VERSION=${TRITON_VERSION:=24.08} +TRITON_VERSION=${TRITON_VERSION:=24.09} # Set the CUDA device to use CUDA_DEVICE=${RUNNER_ID:=0} # Set TensorRT image diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops index 8864da69f5..286052914b 100755 --- a/qa/common/gen_qa_custom_ops +++ b/qa/common/gen_qa_custom_ops @@ -37,7 +37,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.08} +TRITON_VERSION=${TRITON_VERSION:=24.09} NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION} TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$NVIDIA_UPSTREAM_VERSION-tf2-py3} PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3} diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index 900b8fdc03..f26ba863ce 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -48,7 +48,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.08} +TRITON_VERSION=${TRITON_VERSION:=24.09} # ONNX. Use ONNX_OPSET 0 to use the default for ONNX version ONNX_VERSION=1.13.0 From 8a66af03642fd23b11566346d697f2bd0a0e4f4f Mon Sep 17 00:00:00 2001 From: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com> Date: Tue, 10 Sep 2024 22:38:50 -0700 Subject: [PATCH 33/44] Build: Update triton version in Map (#7610) --- build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.py b/build.py index 36aaa161bc..35d14a98f4 100755 --- a/build.py +++ b/build.py @@ -70,8 +70,8 @@ # incorrectly load the other version of the openvino libraries. 
# TRITON_VERSION_MAP = { - "2.50.0dev": ( - "24.09dev", # triton container + "2.50.0": ( + "24.09", # triton container "24.09", # upstream container "1.18.1", # ORT "2024.0.0", # ORT OpenVINO From 535445802752b49069f69f49ddcc1abb7fbf2f29 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Fri, 6 Sep 2024 17:47:33 -0700 Subject: [PATCH 34/44] Update versions post 24.09 --- Dockerfile.sdk | 2 +- README.md | 8 ++++---- TRITON_VERSION | 2 +- build.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile.sdk b/Dockerfile.sdk index c7a68fc6af..5ddaf7274f 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -29,7 +29,7 @@ # # Base image on the minimum Triton container -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.08-py3-min +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.09-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo diff --git a/README.md b/README.md index 63b23a9c15..fb347652fd 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ ##### LATEST RELEASE You are currently on the `main` branch which tracks under-development progress towards the next release. -The current release is version [2.49.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.08 container release on NVIDIA GPU Cloud (NGC). +The current release is version [2.50.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.09 container release on NVIDIA GPU Cloud (NGC). Triton Inference Server is an open source inference serving software that streamlines AI inferencing. Triton enables teams to deploy any AI model from @@ -92,16 +92,16 @@ Inference Server with the ```bash # Step 1: Create the example model repository -git clone -b r24.08 https://github.com/triton-inference-server/server.git +git clone -b r24.09 https://github.com/triton-inference-server/server.git cd server/docs/examples ./fetch_models.sh # Step 2: Launch triton from the NGC Triton container -docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.08-py3 tritonserver --model-repository=/models +docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.09-py3 tritonserver --model-repository=/models # Step 3: Sending an Inference Request # In a separate console, launch the image_client example from the NGC Triton SDK container -docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.08-py3-sdk +docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.09-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg # Inference should return the following diff --git a/TRITON_VERSION b/TRITON_VERSION index 5db7ab5ba3..44f28a05f6 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.50.0dev \ No newline at end of file +2.50.0 \ No newline at end of file diff --git a/build.py b/build.py index 35d14a98f4..3a3310845e 100755 --- a/build.py +++ b/build.py @@ -73,7 +73,7 @@ "2.50.0": ( "24.09", # triton container "24.09", # upstream container - "1.18.1", # ORT + "1.19.2", # ORT "2024.0.0", # ORT OpenVINO "2024.0.0", # Standalone OpenVINO "3.2.6", # DCGM version From 8cfb3b00c98783959ec417b7ca8098d984e68c36 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Tue, 24 Sep 2024 11:46:51 -0700 Subject: [PATCH 35/44] Dockerfile.win10.min - Update dependency versions (#7633) --- Dockerfile.win10.min | 14 +++++++------- 1 file changed, 7 
insertions(+), 7 deletions(-) diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min index 29d2c2a43a..dec972eaf3 100644 --- a/Dockerfile.win10.min +++ b/Dockerfile.win10.min @@ -37,9 +37,9 @@ RUN choco install unzip -y # # Installing TensorRT # -ARG TENSORRT_VERSION=10.3.0.26 -ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.5.zip" -ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/zip/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5.zip +ARG TENSORRT_VERSION=10.4.0.26 +ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows.win10.cuda-12.6.zip" +ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip # COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP} ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP} RUN unzip /tmp/%TENSORRT_ZIP% @@ -51,9 +51,9 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" # # Installing cuDNN # -ARG CUDNN_VERSION=9.3.0.75 +ARG CUDNN_VERSION=9.4.0.58 ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip -ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.2.1.18_cuda12-archive.zip +ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.4.0.58_cuda12-archive.zip ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} RUN unzip /tmp/%CUDNN_ZIP% RUN move cudnn-* cudnn @@ -175,7 +175,7 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" -ARG CUDNN_VERSION=9.3.0.75 +ARG CUDNN_VERSION=9.4.0.58 ENV CUDNN_VERSION ${CUDNN_VERSION} COPY --from=dependency_base /cudnn /cudnn RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." @@ -183,7 +183,7 @@ RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\." LABEL CUDNN_VERSION="${CUDNN_VERSION}" -ARG TENSORRT_VERSION=10.3.0.26 +ARG TENSORRT_VERSION=10.4.0.26 ENV TRT_VERSION ${TENSORRT_VERSION} COPY --from=dependency_base /TensorRT /TensorRT RUN setx PATH "c:\TensorRT\lib;%PATH%" From 8709680833c8a262c8ca2d533e76f334fbde952c Mon Sep 17 00:00:00 2001 From: pvijayakrish Date: Thu, 26 Sep 2024 00:09:35 -0700 Subject: [PATCH 36/44] Update server versions post 24.09 --- TRITON_VERSION | 2 +- build.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/TRITON_VERSION b/TRITON_VERSION index 44f28a05f6..124ddb483d 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.50.0 \ No newline at end of file +2.51.0dev \ No newline at end of file diff --git a/build.py b/build.py index 3a3310845e..fdb7b47554 100755 --- a/build.py +++ b/build.py @@ -70,8 +70,8 @@ # incorrectly load the other version of the openvino libraries. 
# TRITON_VERSION_MAP = { - "2.50.0": ( - "24.09", # triton container + "2.51.0dev": ( + "24.10dev", # triton container "24.09", # upstream container "1.19.2", # ORT "2024.0.0", # ORT OpenVINO From dea11317abb92241829e3c62ce2b062ea80c0fab Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Wed, 2 Oct 2024 12:34:54 -0700 Subject: [PATCH 37/44] ci: Reducing flakiness of `L0_python_api` (#7674) --- qa/L0_python_api/test_kserve.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py index 703d86ca43..f9af5b3b22 100644 --- a/qa/L0_python_api/test_kserve.py +++ b/qa/L0_python_api/test_kserve.py @@ -241,11 +241,19 @@ def callback(user_data, result, error): time_out = time_out - 1 time.sleep(1) + # Depending on when gRPC frontend shut down StatusCode can vary + acceptable_failure_msgs = [ + "[StatusCode.CANCELLED] CANCELLED", + "[StatusCode.UNAVAILABLE] failed to connect to all addresses", + ] + assert ( len(user_data) == 1 and isinstance(user_data[0], InferenceServerException) - and "[StatusCode.UNAVAILABLE] failed to connect to all addresses" - in str(user_data[0]) + and any( + failure_msg in str(user_data[0]) + for failure_msg in acceptable_failure_msgs + ) ) teardown_client(grpc_client) From 19f76842966bcd6a59c938b375ecea67b27768c3 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Wed, 2 Oct 2024 17:09:46 -0700 Subject: [PATCH 38/44] [doc]Adjusted formatting of the warning (#7675) --- README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fb347652fd..36ef51f279 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,10 @@ [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) -[!WARNING] - -##### LATEST RELEASE -You are currently on the `main` branch which tracks under-development progress towards the next release. -The current release is version [2.50.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.09 container release on NVIDIA GPU Cloud (NGC). +>[!WARNING] +>You are currently on the `main` branch which tracks under-development progress +>towards the next release. The current release is version [2.50.0](https://github.com/triton-inference-server/server/releases/latest) +>and corresponds to the 24.09 container release on NVIDIA GPU Cloud (NGC). Triton Inference Server is an open source inference serving software that streamlines AI inferencing. 
Triton enables teams to deploy any AI model from From 6edd5c650e20c79293d86a2a44914e42cd5bf483 Mon Sep 17 00:00:00 2001 From: v-shobhit <161510941+v-shobhit@users.noreply.github.com> Date: Sun, 6 Oct 2024 20:39:47 -0700 Subject: [PATCH 39/44] fix: usage of ReadDataFromJson in array tensors (#7624) Co-authored-by: Sai Kiran Polisetty --- qa/L0_http/generate_endpoint_test.py | 31 ++++++++++++++++--- .../generate_models/mock_llm/config.pbtxt | 8 ++++- qa/L0_http/test.sh | 2 +- src/http_server.cc | 2 ++ 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py index a9a972e02a..3eb0b6ea5f 100755 --- a/qa/L0_http/generate_endpoint_test.py +++ b/qa/L0_http/generate_endpoint_test.py @@ -142,6 +142,21 @@ def test_generate(self): self.assertIn("TEXT", data) self.assertEqual(text, data["TEXT"]) + def test_generate_with_all_inputs(self): + # Setup text-based input + text = "hello world" + inputs = {"PROMPT": text, "STREAM": False, "input_ids": [100, 200]} + + r = self.generate(self._model_name, inputs) + r.raise_for_status() + + self.assertIn("Content-Type", r.headers) + self.assertEqual(r.headers["Content-Type"], "application/json") + + data = r.json() + self.assertIn("TEXT", data) + self.assertEqual(text, data["TEXT"]) + def test_request_id(self): # Setup text based input text = "hello world" @@ -220,18 +235,26 @@ def test_missing_inputs(self): ] for inputs in missing_all_inputs: self.generate_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 0" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 0", ) self.generate_stream_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 0" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 0", ) for inputs in missing_one_input: self.generate_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 1" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 1", ) self.generate_stream_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 1" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 1", ) def test_invalid_input_types(self): diff --git a/qa/L0_http/generate_models/mock_llm/config.pbtxt b/qa/L0_http/generate_models/mock_llm/config.pbtxt index 6871661525..74a306052a 100644 --- a/qa/L0_http/generate_models/mock_llm/config.pbtxt +++ b/qa/L0_http/generate_models/mock_llm/config.pbtxt @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -41,6 +41,12 @@ input [ name: "STREAM" data_type: TYPE_BOOL dims: [ 1, 1 ] + }, + { + name: "input_ids" + data_type: TYPE_INT32 + dims: [ 1, -1 ] + optional: true } ] diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh index 81ae4c254c..572c527ba4 100755 --- a/qa/L0_http/test.sh +++ b/qa/L0_http/test.sh @@ -662,7 +662,7 @@ fi ## Python Unit Tests TEST_RESULT_FILE='test_results.txt' PYTHON_TEST=generate_endpoint_test.py -EXPECTED_NUM_TESTS=16 +EXPECTED_NUM_TESTS=17 set +e python $PYTHON_TEST >$CLIENT_LOG 2>&1 if [ $? 
-ne 0 ]; then diff --git a/src/http_server.cc b/src/http_server.cc index 2fa395fc98..156c114b77 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -3557,6 +3557,8 @@ HTTPAPIServer::GenerateRequestClass::ExactMappingInput( } } + // get original element count back + element_cnt = tensor_data.IsArray() ? tensor_data.ArraySize() : 1; serialized_data_.emplace_back(); std::vector& serialized = serialized_data_.back(); serialized.resize(byte_size); From b247eb59b1826193f78b76b659e30a85e5fe3b78 Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Mon, 7 Oct 2024 10:06:40 -0700 Subject: [PATCH 40/44] fix: `tritonfrontend` gRPC Streaming Segmentation Fault (#7671) --- docs/customization_guide/tritonfrontend.md | 12 ++- qa/L0_python_api/test_kserve.py | 112 ++++++++++++--------- qa/L0_python_api/testing_utils.py | 80 +++++++++++++++ src/grpc/stream_infer_handler.cc | 14 +-- src/http_server.cc | 11 +- 5 files changed, 168 insertions(+), 61 deletions(-) diff --git a/docs/customization_guide/tritonfrontend.md b/docs/customization_guide/tritonfrontend.md index 0ec4b32749..3b47e4dbee 100644 --- a/docs/customization_guide/tritonfrontend.md +++ b/docs/customization_guide/tritonfrontend.md @@ -25,9 +25,15 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> -### Triton Server (tritonfrontend) Bindings - -The `tritonfrontend` python package is a set of bindings to Triton's existing frontends implemented in C++. Currently, `tritonfrontend` supports starting up `KServeHttp` and `KServeGrpc` frontends. These bindings used in-combination with Triton's Python In-Process API ([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library) extend the ability to use Triton's full feature set with a couple of lines of Python. +### Triton Server (tritonfrontend) Bindings (Beta) + +The `tritonfrontend` python package is a set of bindings to Triton's existing +frontends implemented in C++. Currently, `tritonfrontend` supports starting up +`KServeHttp` and `KServeGrpc` frontends. These bindings used in-combination +with Triton's Python In-Process API +([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) +and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library) +extend the ability to use Triton's full feature set with a few lines of Python. Let us walk through a simple example: 1. First we need to load the desired models and start the server with `tritonserver`. 
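
For context on step 1 of that walkthrough: creating the in-process server is only a few lines with the `tritonserver` bindings. The snippet below is a minimal sketch, not the guide's own listing; the `/workspace/model_repository` path is an assumption, and as the guide itself notes, `model_path` may need to be edited for your setup.

```python
import tritonserver

# Hypothetical local path; point this at your own model repository.
model_path = "/workspace/model_repository"

# Create the in-process server and block until its models are loaded.
server = tritonserver.Server(model_repository=model_path)
server.start(wait_until_ready=True)
```

The `server` handle created here is what the `KServeHttp` and `KServeGrpc` frontends are bound to in the rest of the walkthrough.
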
diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py index f9af5b3b22..021ce9be17 100644 --- a/qa/L0_python_api/test_kserve.py +++ b/qa/L0_python_api/test_kserve.py @@ -29,18 +29,10 @@ import numpy as np import pytest +import testing_utils as utils import tritonclient.grpc as grpcclient import tritonclient.http as httpclient import tritonserver -from testing_utils import ( - send_and_test_inference_identity, - setup_client, - setup_server, - setup_service, - teardown_client, - teardown_server, - teardown_service, -) from tritonclient.utils import InferenceServerException from tritonfrontend import KServeGrpc, KServeHttp @@ -93,33 +85,33 @@ def test_wrong_grpc_parameters(self): class TestKServe: @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) def test_server_ready(self, frontend, client_type, url): - server = setup_server() - service = setup_service(server, frontend) - client = setup_client(client_type, url=url) + server = utils.setup_server() + service = utils.setup_service(server, frontend) + client = utils.setup_client(client_type, url=url) assert client.is_server_ready() - teardown_client(client) - teardown_service(service) - teardown_server(server) + utils.teardown_client(client) + utils.teardown_service(service) + utils.teardown_server(server) @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) def test_service_double_start(self, frontend): - server = setup_server() + server = utils.setup_server() # setup_service() performs service.start() - service = setup_service(server, frontend) + service = utils.setup_service(server, frontend) with pytest.raises( tritonserver.AlreadyExistsError, match="server is already running." ): service.start() - teardown_server(server) - teardown_service(service) + utils.teardown_server(server) + utils.teardown_service(service) @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) def test_invalid_options(self, frontend): - server = setup_server() + server = utils.setup_server() # Current flow is KServeHttp.Options or KServeGrpc.Options have to be # provided to ensure type and range validation occurs. 
with pytest.raises( @@ -128,45 +120,65 @@ def test_invalid_options(self, frontend): ): frontend.Server(server, {"port": 8001}) - teardown_server(server) + utils.teardown_server(server) @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) def test_server_service_order(self, frontend): - server = setup_server() - service = setup_service(server, frontend) + server = utils.setup_server() + service = utils.setup_service(server, frontend) - teardown_server(server) - teardown_service(service) + utils.teardown_server(server) + utils.teardown_service(service) @pytest.mark.parametrize("frontend, client_type", [HTTP_ARGS[:2], GRPC_ARGS[:2]]) def test_service_custom_port(self, frontend, client_type): - server = setup_server() + server = utils.setup_server() options = frontend.Options(port=8005) - service = setup_service(server, frontend, options) - client = setup_client(client_type, url="localhost:8005") + service = utils.setup_service(server, frontend, options) + client = utils.setup_client(client_type, url="localhost:8005") # Confirms that service starts at port 8005 client.is_server_ready() - teardown_client(client) - teardown_service(service) - teardown_server(server) + utils.teardown_client(client) + utils.teardown_service(service) + utils.teardown_server(server) @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) def test_inference(self, frontend, client_type, url): - server = setup_server() - service = setup_service(server, frontend) + server = utils.setup_server() + service = utils.setup_service(server, frontend) # TODO: use common/test_infer - assert send_and_test_inference_identity(client_type, url=url) + assert utils.send_and_test_inference_identity(client_type, url=url) - teardown_service(service) - teardown_server(server) + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) + def test_streaming_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + assert utils.send_and_test_stream_inference(client_type, url) + + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) + def test_http_generate_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + assert utils.send_and_test_generate_inference() + + utils.teardown_service(service) + utils.teardown_server(server) @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) def test_http_req_during_shutdown(self, frontend, client_type, url): - server = setup_server() - http_service = setup_service(server, frontend) + server = utils.setup_server() + http_service = utils.setup_service(server, frontend) http_client = httpclient.InferenceServerClient(url="localhost:8000") model_name = "delayed_identity" delay = 2 # seconds @@ -182,7 +194,7 @@ def test_http_req_during_shutdown(self, frontend, client_type, url): model_name=model_name, inputs=inputs, outputs=outputs ) # http_service.stop() does not use graceful shutdown - teardown_service(http_service) + utils.teardown_service(http_service) # So, inference request will fail as http endpoints have been stopped. with pytest.raises( @@ -194,7 +206,7 @@ def test_http_req_during_shutdown(self, frontend, client_type, url): # However, due to an unsuccessful get_result(), async_request is still # an active thread. 
Hence, join stalls until greenlet timeouts. # Does not throw an exception, but displays error in logs. - teardown_client(http_client) + utils.teardown_client(http_client) # delayed_identity will still be an active model # Hence, server.stop() causes InternalError: Timeout. @@ -202,12 +214,12 @@ def test_http_req_during_shutdown(self, frontend, client_type, url): tritonserver.InternalError, match="Exit timeout expired. Exiting immediately.", ): - teardown_server(server) + utils.teardown_server(server) @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) def test_grpc_req_during_shutdown(self, frontend, client_type, url): - server = setup_server() - grpc_service = setup_service(server, frontend) + server = utils.setup_server() + grpc_service = utils.setup_service(server, frontend) grpc_client = grpcclient.InferenceServerClient(url=url) user_data = [] @@ -234,7 +246,7 @@ def callback(user_data, result, error): callback=partial(callback, user_data), ) - teardown_service(grpc_service) + utils.teardown_service(grpc_service) time_out = delay + 1 while (len(user_data) == 0) and time_out > 0: @@ -256,8 +268,8 @@ def callback(user_data, result, error): ) ) - teardown_client(grpc_client) - teardown_server(server) + utils.teardown_client(grpc_client) + utils.teardown_server(server) # KNOWN ISSUE: CAUSES SEGFAULT # Created [DLIS-7231] to address at future date @@ -265,8 +277,8 @@ def callback(user_data, result, error): # is deleted. However, the frontend does not know the server instance # is no longer valid. # def test_inference_after_server_stop(self): - # server = setup_server() - # http_service = setup_service(server, KServeHttp) + # server = utils.setup_server() + # http_service = utils.setup_service(server, KServeHttp) # http_client = setup_client(httpclient, url="localhost:8000") # teardown_server(server) # Server has been stopped @@ -282,5 +294,5 @@ def callback(user_data, result, error): # results = http_client.infer(model_name, inputs=inputs, outputs=outputs) - # teardown_client(http_client) - # teardown_service(http_service) + # utils.teardown_client(http_client) + # utils.teardown_service(http_service) diff --git a/qa/L0_python_api/testing_utils.py b/qa/L0_python_api/testing_utils.py index 8c63fea89b..4f81c373db 100644 --- a/qa/L0_python_api/testing_utils.py +++ b/qa/L0_python_api/testing_utils.py @@ -25,12 +25,18 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import os +import queue from typing import Union import numpy as np +import requests import tritonserver +from tritonclient.utils import InferenceServerException from tritonfrontend import KServeGrpc, KServeHttp +# TODO: Re-Format documentation to fit: +# https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings + def setup_server(model_repository="test_model_repository") -> tritonserver.Server: module_directory = os.path.split(os.path.abspath(__file__))[0] @@ -93,3 +99,77 @@ def send_and_test_inference_identity(frontend_client, url: str) -> bool: teardown_client(client) return input_data[0] == output_data[0].decode() + + +# Sends a streaming inference request to test_model_repository/identity model +# and verifies input == output +def send_and_test_stream_inference(frontend_client, url: str) -> bool: + model_name = "identity" + + # Setting up the gRPC client stream + results = queue.Queue() + callback = lambda error, result: results.put(error or result) + client = frontend_client.InferenceServerClient(url=url) + + client.start_stream(callback=callback) + + # Preparing Input Data + text_input = "testing" + input_tensor = frontend_client.InferInput( + name="INPUT0", shape=[1], datatype="BYTES" + ) + input_tensor.set_data_from_numpy(np.array([text_input.encode()], dtype=np.object_)) + + # Sending Streaming Inference Request + client.async_stream_infer( + model_name=model_name, inputs=[input_tensor], enable_empty_final_response=True + ) + + # Looping through until exception thrown or request completed + completed_requests, num_requests = 0, 1 + text_output, is_final = None, None + while completed_requests != num_requests: + result = results.get() + if isinstance(result, InferenceServerException): + if result.status() == "StatusCode.CANCELLED": + completed_requests += 1 + raise result + + # Processing Response + text_output = result.as_numpy("OUTPUT0")[0].decode() + + triton_final_response = result.get_response().parameters.get( + "triton_final_response", {} + ) + + is_final = False + if triton_final_response.HasField("bool_param"): + is_final = triton_final_response.bool_param + + # Request Completed + if is_final: + completed_requests += 1 + + # Tearing down gRPC client stream + client.stop_stream(cancel_requests=True) + + return is_final and (text_input == text_output) + + +def send_and_test_generate_inference() -> bool: + model_name = "identity" + url = f"http://localhost:8000/v2/models/{model_name}/generate" + input_text = "testing" + data = { + "INPUT0": input_text, + } + + response = requests.post(url, json=data, stream=True) + if response.status_code == 200: + result = response.json() + output_text = result.get("OUTPUT0", "") + + if output_text == input_text: + return True + + return False diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index cf788b1e09..e912e1512c 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -324,12 +324,14 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) if (err == nullptr) { TRITONSERVER_InferenceTrace* triton_trace = nullptr; #ifdef TRITON_ENABLE_TRACING - GrpcServerCarrier carrier(state->context_->ctx_.get()); - auto start_options = - trace_manager_->GetTraceStartOptions(carrier, request.model_name()); - state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); - if (state->trace_ != nullptr) { - triton_trace = state->trace_->trace_; + if (trace_manager_ != nullptr) { + GrpcServerCarrier carrier(state->context_->ctx_.get()); + auto 
start_options = + trace_manager_->GetTraceStartOptions(carrier, request.model_name()); + state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); + if (state->trace_ != nullptr) { + triton_trace = state->trace_->trace_; + } } #endif // TRITON_ENABLE_TRACING diff --git a/src/http_server.cc b/src/http_server.cc index 156c114b77..99aed411b5 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -1810,6 +1810,10 @@ HTTPAPIServer::HandleTrace(evhtp_request_t* req, const std::string& model_name) } #ifdef TRITON_ENABLE_TRACING + if (trace_manager_ == nullptr) { + return; + } + TRITONSERVER_InferenceTraceLevel level = TRITONSERVER_TRACE_LEVEL_DISABLED; uint32_t rate; int32_t count; @@ -3233,8 +3237,11 @@ HTTPAPIServer::HandleGenerate( // If tracing is enabled see if this request should be traced. TRITONSERVER_InferenceTrace* triton_trace = nullptr; - std::shared_ptr trace = - StartTrace(req, model_name, &triton_trace); + std::shared_ptr trace; + if (trace_manager_) { + // If tracing is enabled see if this request should be traced. + trace = StartTrace(req, model_name, &triton_trace); + } std::map input_metadata; triton::common::TritonJson::Value meta_data_root; From d19c6abf85a45133c124222875b5f8d4ea85b094 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Mon, 7 Oct 2024 15:30:32 -0700 Subject: [PATCH 41/44] test: Enhance Python gRPC streaming test to send multiple requests (#7684) --- qa/L0_python_api/testing_utils.py | 72 +++++++++++-------------------- 1 file changed, 25 insertions(+), 47 deletions(-) diff --git a/qa/L0_python_api/testing_utils.py b/qa/L0_python_api/testing_utils.py index 4f81c373db..79901f7411 100644 --- a/qa/L0_python_api/testing_utils.py +++ b/qa/L0_python_api/testing_utils.py @@ -26,6 +26,7 @@ import os import queue +from functools import partial from typing import Union import numpy as np @@ -101,59 +102,36 @@ def send_and_test_inference_identity(frontend_client, url: str) -> bool: return input_data[0] == output_data[0].decode() -# Sends a streaming inference request to test_model_repository/identity model -# and verifies input == output +# Sends multiple streaming requests to "delayed_identity" model with negligible delays, +# and verifies the inputs matches outputs and the ordering is preserved. 
def send_and_test_stream_inference(frontend_client, url: str) -> bool: - model_name = "identity" - - # Setting up the gRPC client stream - results = queue.Queue() - callback = lambda error, result: results.put(error or result) - client = frontend_client.InferenceServerClient(url=url) - - client.start_stream(callback=callback) - - # Preparing Input Data - text_input = "testing" - input_tensor = frontend_client.InferInput( - name="INPUT0", shape=[1], datatype="BYTES" - ) - input_tensor.set_data_from_numpy(np.array([text_input.encode()], dtype=np.object_)) + num_requests = 100 + requests = [] + for i in range(num_requests): + input0_np = np.array([[float(i) / 1000]], dtype=np.float32) + inputs = [frontend_client.InferInput("INPUT0", input0_np.shape, "FP32")] + inputs[0].set_data_from_numpy(input0_np) + requests.append(inputs) - # Sending Streaming Inference Request - client.async_stream_infer( - model_name=model_name, inputs=[input_tensor], enable_empty_final_response=True - ) - - # Looping through until exception thrown or request completed - completed_requests, num_requests = 0, 1 - text_output, is_final = None, None - while completed_requests != num_requests: - result = results.get() - if isinstance(result, InferenceServerException): - if result.status() == "StatusCode.CANCELLED": - completed_requests += 1 - raise result - - # Processing Response - text_output = result.as_numpy("OUTPUT0")[0].decode() + responses = [] - triton_final_response = result.get_response().parameters.get( - "triton_final_response", {} - ) + def callback(responses, result, error): + responses.append({"result": result, "error": error}) - is_final = False - if triton_final_response.HasField("bool_param"): - is_final = triton_final_response.bool_param - - # Request Completed - if is_final: - completed_requests += 1 + client = frontend_client.InferenceServerClient(url=url) + client.start_stream(partial(callback, responses)) + for inputs in requests: + client.async_stream_infer("delayed_identity", inputs) + client.stop_stream() + teardown_client(client) - # Tearing down gRPC client stream - client.stop_stream(cancel_requests=True) + assert len(responses) == num_requests + for i in range(len(responses)): + assert responses[i]["error"] is None + output0_np = responses[i]["result"].as_numpy(name="OUTPUT0") + assert np.allclose(output0_np, [[float(i) / 1000]]) - return is_final and (text_input == text_output) + return True # test passed def send_and_test_generate_inference() -> bool: From 4dbb1b9d6803036fa48fad7c2dfef9a0be86125d Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:09:39 -0700 Subject: [PATCH 42/44] refactor: Removing `Server` subclass from `tritonfrontend` (#7683) --- docs/customization_guide/tritonfrontend.md | 6 +- qa/L0_python_api/test_kserve.py | 2 +- qa/L0_python_api/testing_utils.py | 2 +- src/python/examples/example.py | 3 +- src/python/tritonfrontend/_api/_kservegrpc.py | 83 +++++++++---------- src/python/tritonfrontend/_api/_kservehttp.py | 73 ++++++++-------- 6 files changed, 83 insertions(+), 86 deletions(-) diff --git a/docs/customization_guide/tritonfrontend.md b/docs/customization_guide/tritonfrontend.md index 3b47e4dbee..763ab82fb9 100644 --- a/docs/customization_guide/tritonfrontend.md +++ b/docs/customization_guide/tritonfrontend.md @@ -59,11 +59,11 @@ Note: `model_path` may need to be edited depending on your setup. 
```python from tritonfrontend import KServeHttp, KServeGrpc http_options = KServeHttp.Options(thread_count=5) -http_service = KServeHttp.Server(server, http_options) +http_service = KServeHttp(server, http_options) http_service.start() # Default options (if none provided) -grpc_service = KServeGrpc.Server(server) +grpc_service = KServeGrpc(server) grpc_service.start() ``` @@ -110,7 +110,7 @@ from tritonfrontend import KServeHttp import tritonclient.http as httpclient import numpy as np # Use version numpy < 2 -with KServeHttp.Server(server) as http_service: +with KServeHttp(server) as http_service: # The identity model returns an exact duplicate of the input data as output model_name = "identity" url = "localhost:8000" diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py index 021ce9be17..9e8b82eb43 100644 --- a/qa/L0_python_api/test_kserve.py +++ b/qa/L0_python_api/test_kserve.py @@ -118,7 +118,7 @@ def test_invalid_options(self, frontend): tritonserver.InvalidArgumentError, match="Incorrect type for options. options argument must be of type", ): - frontend.Server(server, {"port": 8001}) + frontend(server, {"port": 8001}) utils.teardown_server(server) diff --git a/qa/L0_python_api/testing_utils.py b/qa/L0_python_api/testing_utils.py index 79901f7411..48cb3ccc37 100644 --- a/qa/L0_python_api/testing_utils.py +++ b/qa/L0_python_api/testing_utils.py @@ -64,7 +64,7 @@ def setup_service( frontend: Union[KServeHttp, KServeGrpc], options=None, ) -> Union[KServeHttp, KServeGrpc]: - service = frontend.Server(server=server, options=options) + service = frontend(server=server, options=options) service.start() return service diff --git a/src/python/examples/example.py b/src/python/examples/example.py index a1fba6e9d1..2d2ca78920 100644 --- a/src/python/examples/example.py +++ b/src/python/examples/example.py @@ -51,7 +51,7 @@ def main(): http_options = KServeHttp.Options(port=8005) # or http_service = KServeHttp.Server(server, http_options) & http_service.stop() - with KServeHttp.Server(server, http_options) as http_service: + with KServeHttp(server, http_options) as http_service: # The identity model returns an exact duplicate of the input data as output model_name = "identity" url = "localhost:8005" @@ -74,7 +74,6 @@ def main(): output_data = results.as_numpy("OUTPUT0") print("--------------------- INFERENCE RESULTS ---------------------") - print("Input data:", input_data) print("Output data:", output_data) print("-------------------------------------------------------------") diff --git a/src/python/tritonfrontend/_api/_kservegrpc.py b/src/python/tritonfrontend/_api/_kservegrpc.py index 5471613340..b8f199ac53 100644 --- a/src/python/tritonfrontend/_api/_kservegrpc.py +++ b/src/python/tritonfrontend/_api/_kservegrpc.py @@ -90,48 +90,47 @@ def __post_init__(self): if isinstance(self.infer_compression_level, Grpc_compression_level): self.infer_compression_level = self.infer_compression_level.value - class Server: - def __init__(self, server: tritonserver, options: "KServeGrpc.Options" = None): - try: - server_ptr = server._ptr() # TRITONSERVER_Server pointer - - # If no options provided, default options are selected - if options is None: - options = KServeGrpc.Options() - - if not isinstance(options, KServeGrpc.Options): - raise InvalidArgumentError( - "Incorrect type for options. 
options argument must be of type KServeGrpc.Options" - ) - - # Converts dataclass instance -> python dictionary -> unordered_map> - options_dict: dict[str, Union[int, bool, str]] = options.__dict__ - - self.triton_frontend = TritonFrontendGrpc(server_ptr, options_dict) - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - # raise ... from None masks the tritonfrontend Error from being added in traceback - raise ERROR_MAPPING[exc_type](exc_value) from None - - def __enter__(self): + def __init__(self, server: tritonserver, options: "KServeGrpc.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer + + # If no options provided, default options are selected + if options is None: + options = KServeGrpc.Options() + + if not isinstance(options, KServeGrpc.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeGrpc.Options" + ) + + # Converts dataclass instance -> python dictionary -> unordered_map> + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + + self.triton_frontend = TritonFrontendGrpc(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None + + def __enter__(self): + self.triton_frontend.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None + + def start(self): + try: self.triton_frontend.start() - return self + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None - def __exit__(self, exc_type, exc_value, traceback): + def stop(self): + try: self.triton_frontend.stop() - if exc_type: - raise ERROR_MAPPING[exc_type](exc_value) from None - - def start(self): - try: - self.triton_frontend.start() - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - raise ERROR_MAPPING[exc_type](exc_value) from None - - def stop(self): - try: - self.triton_frontend.stop() - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - raise ERROR_MAPPING[exc_type](exc_value) from None + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None diff --git a/src/python/tritonfrontend/_api/_kservehttp.py b/src/python/tritonfrontend/_api/_kservehttp.py index 6a2524986a..4a5abef4a3 100644 --- a/src/python/tritonfrontend/_api/_kservehttp.py +++ b/src/python/tritonfrontend/_api/_kservehttp.py @@ -50,48 +50,47 @@ class Options: # DLIS-7215: Add restricted protocol support # restricted_protocols: list - class Server: - def __init__(self, server: tritonserver, options: "KServeHttp.Options" = None): - try: - server_ptr = server._ptr() # TRITONSERVER_Server pointer + def __init__(self, server: tritonserver, options: "KServeHttp.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer - # If no options provided, default options are selected - if options is None: - options = KServeHttp.Options() + # If no options provided, default options are selected + if options is None: + options = KServeHttp.Options() - if not isinstance(options, KServeHttp.Options): - raise InvalidArgumentError( - "Incorrect type for options. 
options argument must be of type KServeHttp.Options" - ) + if not isinstance(options, KServeHttp.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeHttp.Options" + ) - options_dict: dict[str, Union[int, bool, str]] = options.__dict__ - # Converts dataclass instance -> python dictionary -> unordered_map> + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + # Converts dataclass instance -> python dictionary -> unordered_map> - self.triton_frontend = TritonFrontendHttp(server_ptr, options_dict) - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - # raise ... from None masks the tritonfrontend Error from being added in traceback - raise ERROR_MAPPING[exc_type](exc_value) from None + self.triton_frontend = TritonFrontendHttp(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None - def __enter__(self): - self.triton_frontend.start() - return self + def __enter__(self): + self.triton_frontend.start() + return self - def __exit__(self, exc_type, exc_value, traceback): - self.triton_frontend.stop() - if exc_type: - raise ERROR_MAPPING[exc_type](exc_value) from None + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None - def start(self): - try: - self.triton_frontend.start() - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - raise ERROR_MAPPING[exc_type](exc_value) from None + def start(self): + try: + self.triton_frontend.start() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None - def stop(self): - try: - self.triton_frontend.stop() - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - raise ERROR_MAPPING[exc_type](exc_value) from None + def stop(self): + try: + self.triton_frontend.stop() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None From da05094930edfdd1c7bf557fb1957e831eccc79f Mon Sep 17 00:00:00 2001 From: pranavm-nvidia <49246958+pranavm-nvidia@users.noreply.github.com> Date: Tue, 8 Oct 2024 14:50:03 -0400 Subject: [PATCH 43/44] feat: Add copyright hook (#7666) --- .pre-commit-config.yaml | 12 +- LICENSE | 2 +- tools/add_copyright.py | 365 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 377 insertions(+), 2 deletions(-) create mode 100644 tools/add_copyright.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f44f815351..663a36d631 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -72,3 +72,13 @@ repos: - id: mixed-line-ending - id: requirements-txt-fixer - id: trailing-whitespace + +- repo: local + hooks: + - id: add-license + name: Add License + entry: python tools/add_copyright.py + language: python + stages: [pre-commit] + verbose: true + require_serial: true diff --git a/LICENSE b/LICENSE index 5529809efc..914565ec7d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. +Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/tools/add_copyright.py b/tools/add_copyright.py new file mode 100644 index 0000000000..34432bb0c6 --- /dev/null +++ b/tools/add_copyright.py @@ -0,0 +1,365 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import argparse +import os +import re +import subprocess +import sys +from datetime import datetime +from typing import Callable, Dict, Optional, Sequence + +current_year = str(datetime.now().year) + +ROOT_DIR = os.path.join(os.path.dirname(__file__), os.path.pardir) + +LICENSE_PATH = os.path.join(ROOT_DIR, "LICENSE") + +COPYRIGHT_YEAR_PAT = re.compile( + r"Copyright( \(c\))? (\d{4})?-?(\d{4}), NVIDIA CORPORATION" +) + + +def has_copyright(content: str) -> bool: + return COPYRIGHT_YEAR_PAT.search(content) + + +def update_copyright_year( + path: str, content: Optional[str] = None, disallow_range: bool = False +) -> str: + """ + Updates the copyright year in the provided file. + If the copyright is not present in the file, this function has no effect. + """ + if content is None: + with open(path, "r") as f: + content = f.read() + + match = COPYRIGHT_YEAR_PAT.search(content) + min_year = match.groups()[1] or match.groups()[2] + + new_copyright = f"Copyright{match.groups()[0] or ''} " + if min_year < current_year and not disallow_range: + new_copyright += f"{min_year}-{current_year}" + else: + new_copyright += f"{current_year}" + new_copyright += ", NVIDIA CORPORATION" + + updated_content = COPYRIGHT_YEAR_PAT.sub(new_copyright, content) + + if content != updated_content: + with open(path, "w") as f: + f.write(updated_content) + + +def update_and_get_license() -> str: + """ + Updates the copyright year in the LICENSE file if necessary and then + returns its contents. 
+ """ + # TODO: Check if this is right - if the license file needs to have a range, + # we need to remove the range before returning the license text. + # + # License file should always have the current year. + update_copyright_year(LICENSE_PATH, disallow_range=True) + + with open(LICENSE_PATH, "r") as license_file: + return license_file.read() + + +LICENSE_TEXT = update_and_get_license() + +# +# Header manipulation helpers +# + + +def prefix_lines(content: str, prefix: str) -> str: + # NOTE: This could have been done via `textwrap.indent`, but we're not actually indenting, + # so it seems semantically wrong to do that. + return prefix + f"\n{prefix}".join(content.splitlines()) + + +def insert_after(regex: str) -> Callable[[str], str]: + """ + Builds a callback that will insert a provided header after + the specified regular expression. If the expression is not + found in the file contents, the header will be inserted at the + beginning of the file. + + Args: + regex: The regular expression to match. + + Returns: + A callable that can be used as the `add_header` argument to `update_or_add_header`. + """ + + def add_header(header: str, content: str) -> str: + match = re.match(regex, content) + + if match is None: + return header + "\n" + content + + insertion_point = match.span()[-1] + + return content[:insertion_point] + f"{header}\n" + content[insertion_point:] + + return add_header + + +def update_or_add_header( + path: str, header: str, add_header: Optional[Callable[[str, str], str]] = None +): + """ + Updates in place or adds a new copyright header to the specified file. + + Args: + path: The path of the file. + header: The contents of the copyright header. + add_header: A callback that receives the copyright header and file contents and + controls how the contents of the file are updated. By default, the copyright + header is prepended to the file. + """ + with open(path, "r") as f: + content = f.read() + + if has_copyright(content): + update_copyright_year(path, content) + return + + add_header = add_header or (lambda header, content: header + "\n" + content) + + content = add_header(header, content) + + # As a sanity check, make sure we didn't accidentally add the copyright header + # twice, or add a new header when one was already present. + if content.count("Copyright (c)") != 1: + print( + f"WARNING: Something went wrong while processing: {path}!\n" + "Please check if the copyright header was included twice or wasn't added at all. " + ) + + with open(path, "w") as f: + f.write(content) + + +# Each file type requires slightly different handling when inserting the copyright +# header. For example, for C++ files, the header must be prefixed with `//` and for +# shell scripts, it must be prefixed with `#` and must be inserted *after* the shebang. +# +# This mapping stores callables that return whether a handler wants to process a specified +# file based on the path along with callables that will accept the file path and update +# it with the copyright header. +FILE_TYPE_HANDLERS: Dict[Callable[[str], bool], Callable[[str], None]] = {} + + +# +# Path matching callables +# These allow registered functions to more easily specify what kinds of +# paths they should be applied to. 
+# +def has_ext(exts: Sequence[str]): + def has_ext_impl(path: str): + _, ext = os.path.splitext(path) + return ext in exts + + return has_ext_impl + + +def basename_is(expected_path: str): + return lambda path: os.path.basename(path) == expected_path + + +def path_contains(expected: str): + return lambda path: expected in path + + +def any_of(*funcs: Sequence[Callable[[str], bool]]): + return lambda path: any(func(path) for func in funcs) + + +# +# File handlers for different types of files. +# Many types of files require very similar handling - those are combined where possible. +# + + +def register(match: Callable[[str], bool]): + def register_impl(func): + FILE_TYPE_HANDLERS[match] = func + return func + + return register_impl + + +@register( + any_of( + has_ext([".py", ".sh", ".bash", ".yaml", ".pbtxt"]), + basename_is("CMakeLists.txt"), + path_contains("Dockerfile"), + ) +) +def py_or_shell_like(path): + update_or_add_header( + path, + prefix_lines(LICENSE_TEXT, "# "), + # Insert the header *after* the shebang. + # NOTE: This could break if there is a shebang-like pattern elsewhere in the file. + # In that case, this could be edited to check only the first line of the file (after removing whitespace). + insert_after(r"#!(.*)\n"), + ) + + +@register(has_ext([".cc", ".h"])) +def cpp(path): + update_or_add_header(path, prefix_lines(LICENSE_TEXT, "// ")) + + +@register(has_ext([".tpl"])) +def tpl(path): + update_or_add_header(path, "{{/*\n" + prefix_lines(LICENSE_TEXT, "# ") + "\n*/}}") + + +@register(has_ext([".html", ".md"])) +def html_md(path): + update_or_add_header(path, "") + + +def add_copyrights(paths): + for path in paths: + for match, handler in FILE_TYPE_HANDLERS.items(): + if match(path): + handler(path) + break + else: + print( + f"WARNING: No handler registered for file: {path}. Please add a new handler to {__file__}!" + ) + + subprocess.run(["git", "add"] + paths) + + print(f"Processed copyright headers for {len(paths)} file(s).") + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Adds copyright headers to source files" + ) + parser.add_argument("files", nargs="*") + + args, _ = parser.parse_known_args() + add_copyrights(args.files) + return 0 + + +if __name__ == "__main__": + # sys.exit is important here to avoid the test-related imports below during normal execution. + sys.exit(main()) + + +# +# Integration Tests +# +import tempfile + +import pytest + + +# Processes provided text through the copyright hook by writing it to a temporary file. +def process_text(content, extension): + with tempfile.NamedTemporaryFile("w+", suffix=extension) as f: + f.write(content) + f.flush() + + add_copyrights([f.name]) + + f.seek(0) + return f.read() + + +# We use this slightly weird hack to make sure the copyright hook does not do a text replacement +# of the parameters in the test, since they look exactly like copyright headers. +def make_copyright_text(text): + return f"Copyright {text}" + + +@pytest.mark.parametrize( + "content, expected", + [ + # Convert to range if the year that's already present is older than the current year. 
+ ( + make_copyright_text("(c) 2018, NVIDIA CORPORATION"), + make_copyright_text(f"(c) 2018-{current_year}, NVIDIA CORPORATION"), + ), + ( + make_copyright_text("2018, NVIDIA CORPORATION"), + make_copyright_text(f"2018-{current_year}, NVIDIA CORPORATION"), + ), + # No effect if the year is current: + ( + make_copyright_text(f"(c) {current_year}, NVIDIA CORPORATION"), + make_copyright_text(f"(c) {current_year}, NVIDIA CORPORATION"), + ), + ( + make_copyright_text(f"{current_year}, NVIDIA CORPORATION"), + make_copyright_text(f"{current_year}, NVIDIA CORPORATION"), + ), + # If there is already a range, update the upper bound of the range: + ( + make_copyright_text("(c) 2018-2023, NVIDIA CORPORATION"), + make_copyright_text(f"(c) 2018-{current_year}, NVIDIA CORPORATION"), + ), + ], +) +def test_copyright_update(content, expected): + # We don't really care about the extension here - just needs to be something the hook will recognize. + assert process_text(content, ".py") == expected + + +@pytest.mark.parametrize( + "content, extension, expected", + [ + ("", ".cc", f"// {make_copyright_text(f'(c) {current_year}')}"), + ("", ".h", f"// {make_copyright_text(f'(c) {current_year}')}"), + ("", ".py", f"# {make_copyright_text(f'(c) {current_year}')}"), + ("", ".sh", f"# {make_copyright_text(f'(c) {current_year}')}"), + # Make sure copyright comes after shebangs + ( + "#!/bin/python\n", + ".py", + f"#!/bin/python\n# {make_copyright_text(f'(c) {current_year}')}", + ), + ( + "#!/bin/bash\n", + ".sh", + f"#!/bin/bash\n# {make_copyright_text(f'(c) {current_year}')}", + ), + ], +) +def test_adding_new_copyrights(content, extension, expected): + assert process_text(content, extension).startswith(expected) + + +def test_license_has_no_range(): + assert LICENSE_TEXT.startswith(f"Copyright (c) {current_year},") From fde6e5887775b1236e25192601b6cd1d7abe3620 Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Tue, 8 Oct 2024 17:10:48 -0700 Subject: [PATCH 44/44] build: Adding `tritonfrontend` to `build.py` (#7681) Co-authored-by: Ryan McCormick --- build.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build.py b/build.py index fdb7b47554..14301f843d 100755 --- a/build.py +++ b/build.py @@ -1849,11 +1849,11 @@ def core_build( os.path.join(repo_install_dir, "lib", "libtritonserver.so"), os.path.join(install_dir, "lib"), ) - # [FIXME] Placing the Triton server wheel file in 'python' for now, should - # have been upload to pip registry and be able to install directly + # [FIXME] Placing the tritonserver and tritonfrontend wheel files in 'python' for now, + # should be uploaded to pip registry to be able to install directly cmake_script.mkdir(os.path.join(install_dir, "python")) cmake_script.cp( - os.path.join(repo_install_dir, "python", "tritonserver*.whl"), + os.path.join(repo_install_dir, "python", "triton*.whl"), os.path.join(install_dir, "python"), )
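
With the `tritonfrontend` wheel now staged next to the `tritonserver` wheel by `build.py`, the pieces touched across these patches can be exercised end to end from Python. The sketch below is a hedged illustration rather than the project's documented example: the model repository path, the `identity` model name, and the default gRPC port of 8001 are assumptions drawn from the QA tests in this series.

```python
import numpy as np
import tritonclient.grpc as grpcclient
import tritonserver
from tritonfrontend import KServeGrpc

# Assumed repository containing the "identity" model used by the QA tests.
server = tritonserver.Server(model_repository="/workspace/model_repository")
server.start(wait_until_ready=True)

# Attach the gRPC frontend; default options are assumed to listen on port 8001.
grpc_service = KServeGrpc(server)
grpc_service.start()

# Drive a simple inference through the standard tritonclient gRPC client.
client = grpcclient.InferenceServerClient(url="localhost:8001")
input_tensor = grpcclient.InferInput("INPUT0", [1], "BYTES")
input_tensor.set_data_from_numpy(np.array(["hello".encode()], dtype=np.object_))
result = client.infer(model_name="identity", inputs=[input_tensor])
print(result.as_numpy("OUTPUT0")[0].decode())

client.close()
grpc_service.stop()
server.stop()
```

Both frontends also support the context-manager form shown in the tritonfrontend guide, which calls `start()` on entry and `stop()` on exit automatically.
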