From e1816629dd8bc1b05748450739420639543bcb11 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Wed, 31 Jul 2024 12:52:56 -0700 Subject: [PATCH 01/44] ci: Return custom exit code to indicate known shm leak failure in L0_backend_python bls test (#7485) --- .../argument_validation/test.sh | 2 +- qa/L0_backend_python/bls/test.sh | 41 +++++++++++-------- qa/L0_backend_python/custom_metrics/test.sh | 2 +- .../request_rescheduling/test.sh | 2 +- .../setup_python_enviroment.sh | 2 +- qa/L0_backend_python/test.sh | 29 +++++++++++-- ...hon_unittest.py => test_infer_shm_leak.py} | 31 ++++++++------ qa/L0_dlpack_multi_gpu/test.sh | 6 +-- qa/L0_warmup/test.sh | 6 +-- qa/common/shm_util.py | 5 ++- 10 files changed, 81 insertions(+), 45 deletions(-) rename qa/L0_backend_python/{python_unittest.py => test_infer_shm_leak.py} (75%) diff --git a/qa/L0_backend_python/argument_validation/test.sh b/qa/L0_backend_python/argument_validation/test.sh index b14ba4abb3..90cbef89b5 100755 --- a/qa/L0_backend_python/argument_validation/test.sh +++ b/qa/L0_backend_python/argument_validation/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./arg_validation_client.log" TEST_RESULT_FILE='test_results.txt' SERVER_ARGS="--model-repository=${MODELDIR}/argument_validation/models --backend-directory=${BACKEND_DIR} --log-verbose=1" diff --git a/qa/L0_backend_python/bls/test.sh b/qa/L0_backend_python/bls/test.sh index 204af7e2ba..46d1f40818 100755 --- a/qa/L0_backend_python/bls/test.sh +++ b/qa/L0_backend_python/bls/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./bls_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh @@ -33,7 +33,7 @@ source ../../common/util.sh TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:=http://github.com/triton-inference-server} RET=0 -rm -fr *.log ./models *.txt +rm -fr *.log ./models *.txt *.xml # FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU if [[ ${TEST_WINDOWS} == 0 ]]; then @@ -119,30 +119,35 @@ if [[ ${TEST_WINDOWS} == 0 ]]; then for MODEL_NAME in bls bls_memory bls_memory_async bls_async; do export MODEL_NAME=${MODEL_NAME} - - python3 -m pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then + # Run with pytest to capture the return code correctly + pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 + EXIT_CODE=$? + if [ $EXIT_CODE -ne 0 ]; then echo -e "\n***\n*** ${MODEL_NAME} ${BLS_KIND} test FAILED. \n***" + RET=$EXIT_CODE cat $SERVER_LOG cat $CLIENT_LOG - RET=1 fi done - set -e - kill_server - # Check for bls 'test_timeout' to ensure timeout value is being correctly passed - if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then - echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***" - cat $SERVER_LOG - RET=1 + set -e + + # Only check the timeout value if there is no error since the test + # may fail before the test_timeout case gets run. 
+ if [ $RET -eq 0 ]; then + # Check for bls 'test_timeout' to ensure timeout value is being correctly passed + if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***" + cat $SERVER_LOG + RET=1 + fi fi - if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 128 ]]; then + if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 256 ]]; then if [ `grep -c "Failed to allocate memory from CUDA memory pool" $SERVER_LOG` != "0" ]; then - echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMOY_POOL_SIZE_MB is 128 MB for 'bls' $BLS_KIND test\n***" + echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMORY_POOL_SIZE_MB is 256 MB for 'bls' $BLS_KIND test\n***" cat $SERVER_LOG RET=1 fi @@ -342,10 +347,10 @@ set -e kill_server -if [ $RET -eq 1 ]; then - echo -e "\n***\n*** BLS test FAILED. \n***" -else +if [ $RET -eq 0 ]; then echo -e "\n***\n*** BLS test PASSED. \n***" +else + echo -e "\n***\n*** BLS test FAILED. \n***" fi exit $RET diff --git a/qa/L0_backend_python/custom_metrics/test.sh b/qa/L0_backend_python/custom_metrics/test.sh index 4491d9e030..9020c7ebfd 100755 --- a/qa/L0_backend_python/custom_metrics/test.sh +++ b/qa/L0_backend_python/custom_metrics/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./custom_metrics_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh diff --git a/qa/L0_backend_python/request_rescheduling/test.sh b/qa/L0_backend_python/request_rescheduling/test.sh index 6fd6fe09e5..31ba6692d9 100755 --- a/qa/L0_backend_python/request_rescheduling/test.sh +++ b/qa/L0_backend_python/request_rescheduling/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY="../python_unittest.py" +CLIENT_PY="../test_infer_shm_leak.py" CLIENT_LOG="./request_rescheduling_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh diff --git a/qa/L0_backend_python/setup_python_enviroment.sh b/qa/L0_backend_python/setup_python_enviroment.sh index 88baccc4f6..a2171e02da 100755 --- a/qa/L0_backend_python/setup_python_enviroment.sh +++ b/qa/L0_backend_python/setup_python_enviroment.sh @@ -151,7 +151,7 @@ apt-get update && apt-get -y install \ libboost-dev rm -f /usr/bin/python3 && \ ln -s "/usr/bin/python3.${PYTHON_ENV_VERSION}" /usr/bin/python3 -pip3 install --upgrade install requests numpy virtualenv protobuf +pip3 install --upgrade requests numpy virtualenv protobuf find /opt/tritonserver/qa/pkgs/ -maxdepth 1 -type f -name \ "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \ xargs pip3 install --upgrade diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh index 65767419f2..f6d4b7b445 100755 --- a/qa/L0_backend_python/test.sh +++ b/qa/L0_backend_python/test.sh @@ -425,11 +425,20 @@ if [ "$TEST_JETSON" == "0" ]; then # between dependencies. setup_virtualenv + set +e (cd ${TEST} && bash -ex test.sh) - if [ $? -ne 0 ]; then + EXIT_CODE=$? + if [ $EXIT_CODE -ne 0 ]; then echo "Subtest ${TEST} FAILED" - RET=1 + RET=$EXIT_CODE + + # In bls test, it is allowed to fail with a strict memory leak of 480 bytes with exit code '123'. 
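+            # The custom exit code '123' is ALLOWED_FAILURE_EXIT_CODE, defined in
+            # test_infer_shm_leak.py and returned via pytest.exit() when the shm probe
+            # reports the known 480-byte leak.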
+ # Propagate the exit code to make sure it's not overwritten by other tests. + if [[ ${TEST} == "bls" ]] && [[ $EXIT_CODE -ne 1 ]] ; then + BLS_RET=$RET + fi fi + set -e deactivate_virtualenv done @@ -438,11 +447,13 @@ if [ "$TEST_JETSON" == "0" ]; then if [[ ${PYTHON_ENV_VERSION} = "10" ]] && [[ ${TEST_WINDOWS} == 0 ]]; then # In 'env' test we use miniconda for dependency management. No need to run # the test in a virtual environment. + set +e (cd env && bash -ex test.sh) if [ $? -ne 0 ]; then echo "Subtest env FAILED" RET=1 fi + set -e fi fi @@ -459,12 +470,14 @@ for TEST in ${SUBTESTS}; do # between dependencies. setup_virtualenv + set +e (cd ${TEST} && bash -ex test.sh) if [ $? -ne 0 ]; then echo "Subtest ${TEST} FAILED" RET=1 fi + set -e deactivate_virtualenv done @@ -475,4 +488,14 @@ else echo -e "\n***\n*** Test FAILED\n***" fi -exit $RET +# Exit with RET if it is 1, meaning that the test failed. +# Otherwise, exit with BLS_RET if it is set, meaning that the known memory leak is captured. +if [ $RET -eq 1 ]; then + exit $RET +else + if [ -z "$BLS_RET" ]; then + exit $RET + else + exit $BLS_RET + fi +fi diff --git a/qa/L0_backend_python/python_unittest.py b/qa/L0_backend_python/test_infer_shm_leak.py similarity index 75% rename from qa/L0_backend_python/python_unittest.py rename to qa/L0_backend_python/test_infer_shm_leak.py index 4b94996976..966243e86e 100755 --- a/qa/L0_backend_python/python_unittest.py +++ b/qa/L0_backend_python/test_infer_shm_leak.py @@ -33,6 +33,7 @@ import os import unittest +import pytest import shm_util import tritonclient.grpc as grpcclient from tritonclient.utils import * @@ -41,11 +42,13 @@ # we overwrite the IP address with the TRITONSERVER_IPADDR envvar _tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") +# The exit code 123 is used to indicate that the shm leak probe detected a 480 +# bytes leak in the bls sub-test. Any leak other than 480 bytes will cause the +# test to fail with the default exit code 1. +ALLOWED_FAILURE_EXIT_CODE = 123 -class PythonUnittest(unittest.TestCase): - def setUp(self): - self._shm_leak_detector = shm_util.ShmLeakDetector() +class TestInferShmLeak: def _run_unittest(self, model_name): with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client: # No input is required @@ -54,15 +57,17 @@ def _run_unittest(self, model_name): # The model returns 1 if the tests were successfully passed. # Otherwise, it will return 0. 
- self.assertEqual( - output0, [1], f"python_unittest failed for model {model_name}" - ) - - def test_python_unittest(self): - model_name = os.environ["MODEL_NAME"] - with self._shm_leak_detector.Probe() as shm_probe: - self._run_unittest(model_name) + assert output0 == [1], f"python_unittest failed for model {model_name}" + def test_shm_leak(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + model_name = os.environ.get("MODEL_NAME", "default_model") -if __name__ == "__main__": - unittest.main() + try: + with self._shm_leak_detector.Probe() as shm_probe: + self._run_unittest(model_name) + except AssertionError as e: + if "Known shared memory leak of 480 bytes detected" in str(e): + pytest.exit(str(e), returncode=ALLOWED_FAILURE_EXIT_CODE) + else: + raise e diff --git a/qa/L0_dlpack_multi_gpu/test.sh b/qa/L0_dlpack_multi_gpu/test.sh index 996f062f42..ae72daa7d0 100755 --- a/qa/L0_dlpack_multi_gpu/test.sh +++ b/qa/L0_dlpack_multi_gpu/test.sh @@ -27,7 +27,7 @@ SERVER=/opt/tritonserver/bin/tritonserver SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" -CLIENT_PY=./python_unittest.py +CLIENT_PY=./test_infer_shm_leak.py CLIENT_LOG="./client.log" EXPECTED_NUM_TESTS="1" TEST_RESULT_FILE='test_results.txt' @@ -52,8 +52,8 @@ rm -fr *.log ./models mkdir -p models/dlpack_test/1/ cp ../python_models/dlpack_test/model.py models/dlpack_test/1/ cp ../python_models/dlpack_test/config.pbtxt models/dlpack_test -cp ../L0_backend_python/python_unittest.py . -sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py +cp ../L0_backend_python/test_infer_shm_leak.py . +sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py run_server if [ "$SERVER_PID" == "0" ]; then diff --git a/qa/L0_warmup/test.sh b/qa/L0_warmup/test.sh index aeed873b25..a535aed25b 100755 --- a/qa/L0_warmup/test.sh +++ b/qa/L0_warmup/test.sh @@ -42,7 +42,7 @@ export CUDA_VISIBLE_DEVICES=0 CLIENT=../clients/image_client CLIENT_LOG="./client.log" -CLIENT_PY=./python_unittest.py +CLIENT_PY=./test_infer_shm_leak.py EXPECTED_NUM_TESTS="1" TEST_RESULT_FILE='test_results.txt' @@ -449,8 +449,8 @@ mkdir -p models/bls_onnx_warmup/1/ cp ../python_models/bls_onnx_warmup/model.py models/bls_onnx_warmup/1/ cp ../python_models/bls_onnx_warmup/config.pbtxt models/bls_onnx_warmup/. -cp ../L0_backend_python/python_unittest.py . -sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py +cp ../L0_backend_python/test_infer_shm_leak.py . +sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py run_server if [ "$SERVER_PID" == "0" ]; then diff --git a/qa/common/shm_util.py b/qa/common/shm_util.py index 16e5ce4e45..0e533bcdbb 100755 --- a/qa/common/shm_util.py +++ b/qa/common/shm_util.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -441,6 +441,9 @@ def __exit__(self, type, value, traceback): print( f"Shared memory leak detected [{shm_region}]: {curr_shm_free_size} (curr free) < {prev_shm_free_size} (prev free)." ) + # FIXME DLIS-7122: Known shared memory leak of 480 bytes in BLS test. 
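+                # 1007056 (prev free) - 1006576 (curr free) == 480 bytes, i.e. exactly
+                # the size of the known leak tracked by DLIS-7122.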
+ if curr_shm_free_size == 1006576 and prev_shm_free_size == 1007056: + assert False, f"Known shared memory leak of 480 bytes detected." assert not shm_leak_detected, f"Shared memory leak detected." def _get_shm_free_sizes(self, delay_sec=0): From dc90a5260ddad1cef54b6a2523533209bbe373e1 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Fri, 2 Aug 2024 08:19:21 -0700 Subject: [PATCH 02/44] Including 'tritonserver.lib' into final package (#7491) --- build.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/build.py b/build.py index 6ab8a58515..2c95cbded3 100755 --- a/build.py +++ b/build.py @@ -1647,6 +1647,10 @@ def core_build( os.path.join(repo_install_dir, "bin", "tritonserver.dll"), os.path.join(install_dir, "bin"), ) + cmake_script.cp( + os.path.join(repo_install_dir, "lib", "tritonserver.lib"), + os.path.join(install_dir, "bin"), + ) else: cmake_script.mkdir(os.path.join(install_dir, "bin")) cmake_script.cp( From cca12f9ddee928b7ad4b089598e4b7a98132c9f8 Mon Sep 17 00:00:00 2001 From: Alex Zhang Date: Mon, 5 Aug 2024 09:56:38 +0800 Subject: [PATCH 03/44] build: Add default value for argument 'TRITON_REPO_ORGANIZATION' from sdk Dockerfile (#7437) --- Dockerfile.sdk | 1 + docs/customization_guide/build.md | 12 ++++++------ docs/customization_guide/test.md | 13 ++++++++----- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 1524b5ead3..0748277d52 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -33,6 +33,7 @@ ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo +ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server ARG TRITON_COMMON_REPO_TAG=main ARG TRITON_CORE_REPO_TAG=main ARG TRITON_CLIENT_REPO_TAG=main diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index db16b65c6b..0c1cc08a41 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -331,13 +331,13 @@ invocation builds all features and backends available on windows. python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild --no-container-pull --image=base,win10-py3-min --enable-logging --enable-stats --enable-tracing --enable-gpu --endpoint=grpc --endpoint=http --repo-tag=common: --repo-tag=core: --repo-tag=backend: --repo-tag=thirdparty: --backend=ensemble --backend=tensorrt: --backend=onnxruntime: --backend=openvino: ``` -If you are building on *main* branch then '' will +If you are building on *main* branch then `` will default to "main". If you are building on a release branch then -'' will default to the branch name. For example, if you -are building on the r24.07 branch, '' will default to -r24.07. Therefore, you typically do not need to provide '' at all (nor the preceding colon). You can use a different -'' for a component to instead use the corresponding +`` will default to the branch name. For example, if you +are building on the r24.07 branch, `` will default to +r24.07. Therefore, you typically do not need to provide `` at all (nor the preceding colon). You can use a different +`` for a component to instead use the corresponding branch/tag in the build. 
For example, if you have a branch called "mybranch" in the [onnxruntime_backend](https://github.com/triton-inference-server/onnxruntime_backend) diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md index d664a139d3..e066d31493 100644 --- a/docs/customization_guide/test.md +++ b/docs/customization_guide/test.md @@ -48,7 +48,7 @@ $ ./gen_qa_model_repository $ ./gen_qa_custom_ops ``` -This will create multiple model repositories in /tmp//qa_* +This will create multiple model repositories in /tmp/\/qa_* (for example /tmp/24.07/qa_model_repository). The TensorRT models will be created for the GPU on the system that CUDA considers device 0 (zero). If you have multiple GPUs on your system see the documentation @@ -57,14 +57,17 @@ in the scripts for how to target a specific GPU. ## Build SDK Image Build the *tritonserver_sdk* image that contains the client -libraries, model analyzer, and examples using the following -commands. You must first checkout the branch of the -*client* repo into the clientrepo/ subdirectory. Typically you want to -set to be the same as your current server branch. +libraries, model analyzer, perf analyzer and examples using the following +commands. You must first checkout the `` branch of the +*client* repo into the clientrepo/ subdirectory and the `` +branch of the *perf_analyzer* repo into the perfanalyzerrepo/ subdirectory +respectively. Typically you want to set both `` and `` +to be the same as your current server branch. ``` $ cd $ git clone --single-branch --depth=1 -b https://github.com/triton-inference-server/client.git clientrepo +$ git clone --single-branch --depth=1 -b https://github.com/triton-inference-server/perf_analyzer.git perfanalyzerrepo $ docker build -t tritonserver_sdk -f Dockerfile.sdk . ``` From 9ad856c4f67e62226eff40d757e6181ddf97c9a2 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Tue, 6 Aug 2024 16:32:29 -0700 Subject: [PATCH 04/44] chore:Purge PA from Client Repo (#7488) * PA Migration: Update server docs and tests --- README.md | 2 +- deploy/gke-marketplace-app/README.md | 4 ++-- deploy/k8s-onprem/README.md | 4 ++-- docs/README.md | 6 ++--- docs/contents.md | 31 ++++++++++++++++---------- docs/examples/jetson/README.md | 6 ++--- docs/generate_docs.py | 4 ++++ docs/user_guide/debugging_guide.md | 4 ++-- docs/user_guide/faq.md | 4 ++-- docs/user_guide/jetson.md | 2 +- docs/user_guide/model_analyzer.md | 4 ++-- docs/user_guide/model_configuration.md | 2 +- docs/user_guide/optimization.md | 4 ++-- docs/user_guide/perf_analyzer.md | 4 ++-- docs/user_guide/performance_tuning.md | 4 ++-- qa/L0_perf_analyzer_doc_links/test.sh | 10 ++++----- 16 files changed, 53 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 17628b4f03..2200886a20 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ configuration](docs/user_guide/model_configuration.md) for the model. [Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md) to learn which backends are supported on your target platform. 
- Learn how to [optimize performance](docs/user_guide/optimization.md) using the - [Performance Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) and [Model Analyzer](https://github.com/triton-inference-server/model_analyzer) - Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in diff --git a/deploy/gke-marketplace-app/README.md b/deploy/gke-marketplace-app/README.md index e99b9efbae..595d4634ab 100644 --- a/deploy/gke-marketplace-app/README.md +++ b/deploy/gke-marketplace-app/README.md @@ -1,5 +1,5 @@ Perf Analyzer documentation has been relocated to -[here](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md). +[here](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md). diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index 49cad9e637..446534da99 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -73,7 +73,7 @@ For additional material, see the verify that we can run inference requests and get a baseline performance benchmark of your model. Triton's - [Perf Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) tool specifically fits this purpose. Here is a simplified output for demonstration purposes: @@ -103,7 +103,7 @@ For additional material, see the There are many variables that can be tweaked just within your model configuration (`config.pbtxt`) to obtain different results. - As your model, config, or use case evolves, - [Perf Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) is a great tool to quickly verify model functionality and performance. 3. How can I improve my model performance? diff --git a/qa/L0_perf_analyzer_doc_links/test.sh b/qa/L0_perf_analyzer_doc_links/test.sh index db80e84974..d0757bca9e 100755 --- a/qa/L0_perf_analyzer_doc_links/test.sh +++ b/qa/L0_perf_analyzer_doc_links/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -35,10 +35,10 @@ python3 -m pip install mkdocs-htmlproofer-plugin==0.10.3 #Download perf_analyzer docs TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} -TRITON_CLIENT_REPO_TAG="${TRITON_CLIENT_REPO_TAG:=main}" -git clone -b ${TRITON_CLIENT_REPO_TAG} ${TRITON_REPO_ORGANIZATION}/client.git -cp `pwd`/client/src/c++/perf_analyzer/README.md . -cp -rf `pwd`/client/src/c++/perf_analyzer/docs . +TRITON_PERF_ANALYZER_REPO_TAG="${TRITON_PERF_ANALYZER_REPO_TAG:=main}" +git clone -b ${TRITON_PERF_ANALYZER_REPO_TAG} ${TRITON_REPO_ORGANIZATION}/perf_analyzer.git +cp `pwd`/perf_analyzer/README.md . +cp -rf `pwd`/perf_analyzer/docs . # Need to remove all links that start with -- or -. Mkdocs converts all -- to - for anchor links. # This breaks all links to cli commands throughout the docs. 
This will iterate over all From a4285ff0d68643bb4c959e5cb7287de427d006d5 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Wed, 7 Aug 2024 10:01:11 -0700 Subject: [PATCH 05/44] PA Migration: Update L0_client_build_variants (#7505) * PA Migration: Update L0_client_build_variants --- Dockerfile.sdk | 1 + qa/L0_client_build_variants/test.sh | 84 ++++++----------------------- 2 files changed, 16 insertions(+), 69 deletions(-) diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 0748277d52..7897c2a215 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -218,6 +218,7 @@ WORKDIR /workspace COPY TRITON_VERSION . COPY NVIDIA_Deep_Learning_Container_License.pdf . COPY --from=sdk_build /workspace/client/ client/ +COPY --from=sdk_build /workspace/perf_analyzer/ perf_analyzer/ COPY --from=sdk_build /workspace/install/ install/ RUN cd install && \ export VERSION=`cat /workspace/TRITON_VERSION` && \ diff --git a/qa/L0_client_build_variants/test.sh b/qa/L0_client_build_variants/test.sh index c31c55e310..9dc1c4c85d 100755 --- a/qa/L0_client_build_variants/test.sh +++ b/qa/L0_client_build_variants/test.sh @@ -58,10 +58,6 @@ TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-i -DTRITON_ENABLE_PYTHON_HTTP=ON \ -DTRITON_ENABLE_PYTHON_GRPC=ON \ -DTRITON_ENABLE_JAVA_HTTP=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=OFF \ @@ -90,10 +86,6 @@ fi -DTRITON_ENABLE_CC_GRPC=ON \ -DTRITON_ENABLE_PYTHON_HTTP=OFF \ -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ @@ -121,10 +113,6 @@ fi -DTRITON_ENABLE_CC_GRPC=OFF \ -DTRITON_ENABLE_PYTHON_HTTP=ON \ -DTRITON_ENABLE_PYTHON_GRPC=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ @@ -141,59 +129,27 @@ else exit 1 fi -# -# Build without Perf Analyzer -# -(cd /workspace/build && \ - rm -fr cc-clients python-clients && \ - cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ - -DTRITON_ENABLE_CC_HTTP=ON \ - -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ - -DTRITON_ENABLE_GPU=ON \ - -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ - -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ - -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) -if [ $? 
-eq 0 ]; then - echo -e "\n***\n*** No-Perf-Analyzer Passed\n***" -else - echo -e "\n***\n*** No-Perf-Analyzer FAILED\n***" - exit 1 -fi - +# TODO: TPRD-342 These tests should be PA CI test +# cases not Triton test cases +rm -fr /workspace/build +mkdir -p /workspace/build # # Build without C API in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=OFF \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? -eq 0 ]; then echo -e "\n***\n*** No-CAPI Passed\n***" else @@ -205,25 +161,20 @@ fi # Build without TensorFlow Serving in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ + rm -fr cc_clients perf_analyzer && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? -eq 0 ]; then echo -e "\n***\n*** No-TF-Serving Passed\n***" else @@ -235,25 +186,20 @@ fi # Build without TorchServe in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ + rm -fr cc_clients perf_analyzer && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? 
-eq 0 ]; then echo -e "\n***\n*** No-TorchServe Passed\n***" else From 6636fc9fc007e913566d562cefe0b882cd568dd3 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:58:18 -0700 Subject: [PATCH 06/44] test: Add test for sending response after sending complete final flag (#7504) --- .../response_sender_complete_final_test.py | 77 +++++++++++++++++++ qa/L0_backend_python/response_sender/test.sh | 31 ++++++++ .../config.pbtxt | 47 +++++++++++ .../response_sender_complete_final/model.py | 63 +++++++++++++++ 4 files changed, 218 insertions(+) create mode 100644 qa/L0_backend_python/response_sender/response_sender_complete_final_test.py create mode 100644 qa/python_models/response_sender_complete_final/config.pbtxt create mode 100644 qa/python_models/response_sender_complete_final/model.py diff --git a/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py b/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py new file mode 100644 index 0000000000..386a54e3d3 --- /dev/null +++ b/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py @@ -0,0 +1,77 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import time +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient + + +class ResponseSenderTest(unittest.TestCase): + def _generate_streaming_callback_and_responses_pair(self): + responses = [] # [{"result": result, "error": error}, ...] 
+ + def callback(result, error): + responses.append({"result": result, "error": error}) + + return callback, responses + + def test_respond_after_complete_final(self): + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Test Passed", server_log) + + model_name = "response_sender_complete_final" + shape = [1, 1] + inputs = [grpcclient.InferInput("INPUT0", shape, "FP32")] + input0_np = np.array([[123.45]], np.float32) + inputs[0].set_data_from_numpy(input0_np) + + callback, responses = self._generate_streaming_callback_and_responses_pair() + with grpcclient.InferenceServerClient("localhost:8001") as client: + client.start_stream(callback) + client.async_stream_infer(model_name, inputs) + client.stop_stream() + + self.assertEqual(len(responses), 1) + for response in responses: + output0_np = response["result"].as_numpy(name="OUTPUT0") + self.assertTrue(np.allclose(input0_np, output0_np)) + self.assertIsNone(response["error"]) + + time.sleep(1) # make sure the logs are written before checking + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Unexpected request length", server_log) + self.assertNotIn("Expected exception not raised", server_log) + self.assertNotIn("Test FAILED", server_log) + self.assertIn("Test Passed", server_log) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/response_sender/test.sh b/qa/L0_backend_python/response_sender/test.sh index 33db46edbb..cca7e7acfa 100755 --- a/qa/L0_backend_python/response_sender/test.sh +++ b/qa/L0_backend_python/response_sender/test.sh @@ -97,6 +97,37 @@ set -e kill $SERVER_PID wait $SERVER_PID +# +# Test response sender to raise exception on response after complete final flag +# +rm -rf models && mkdir models +mkdir -p models/response_sender_complete_final/1 && \ + cp ../../python_models/response_sender_complete_final/model.py models/response_sender_complete_final/1 && \ + cp ../../python_models/response_sender_complete_final/config.pbtxt models/response_sender_complete_final + +TEST_LOG="response_sender_complete_final_test.log" +SERVER_LOG="response_sender_complete_final_test.server.log" +SERVER_ARGS="--model-repository=${MODELDIR}/response_sender/models --backend-directory=${BACKEND_DIR} --log-verbose=1" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=concurrency_test.report.xml response_sender_complete_final_test.py > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** response sender complete final test FAILED\n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + # # Test async response sender under decoupled / non-decoupled # diff --git a/qa/python_models/response_sender_complete_final/config.pbtxt b/qa/python_models/response_sender_complete_final/config.pbtxt new file mode 100644 index 0000000000..f08ed6da5b --- /dev/null +++ b/qa/python_models/response_sender_complete_final/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +max_batch_size: 8 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [{ kind: KIND_CPU }] +model_transaction_policy { decoupled: True } diff --git a/qa/python_models/response_sender_complete_final/model.py b/qa/python_models/response_sender_complete_final/model.py new file mode 100644 index 0000000000..e17f0b04f6 --- /dev/null +++ b/qa/python_models/response_sender_complete_final/model.py @@ -0,0 +1,63 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + # Expect exactly one request per execute() call. 
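+        # The accompanying response_sender_complete_final_test.py issues a single
+        # streaming request, so more than one request here indicates a test setup issue.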
+ if len(requests) != 1: + pb_utils.Logger.log_error(f"Unexpected request length: {len(requests)}") + raise Exception("Test FAILED") + + # Send a response with complete final flag, and then send another response and + # and assert an exception is raised, for all requests. + for request in requests: + in_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", in_tensor.as_numpy()) + response = pb_utils.InferenceResponse([out_tensor]) + response_sender = request.get_response_sender() + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + test_passed = False + try: + response_sender.send(response) + except Exception as e: + pb_utils.Logger.log_info(f"Raised exception: {e}") + if ( + str(e) + == "Unable to send response. Response sender has been closed." + ): + test_passed = True + finally: + if not test_passed: + pb_utils.Logger.log_error("Expected exception not raised") + raise Exception("Test FAILED") + pb_utils.Logger.log_info("Test Passed") + return None From f2841016079b0b647f10e85db6a4b9e7b2dda330 Mon Sep 17 00:00:00 2001 From: Harry Kim Date: Thu, 8 Aug 2024 07:15:30 -0700 Subject: [PATCH 07/44] Add vLLM x Triton user meetup announcement (#7509) --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 2200886a20..f9b1a483f3 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,17 @@ # Triton Inference Server +📣 **vLLM x Triton Meetup at Fort Mason on Sept 9th 4:00 - 9:00 pm** + +We are excited to announce that we will be hosting our Triton user meetup with the vLLM team at +[Fort Mason](https://maps.app.goo.gl/9Lr3fxRssrpQCGK58) on Sept 9th 4:00 - 9:00 pm. Join us for this +exclusive event where you will learn about the newest vLLM and Triton features, get a +glimpse into the roadmaps, and connect with fellow users, the NVIDIA Triton and vLLM teams. Seating is limited and registration confirmation +is required to attend - please register [here](https://lu.ma/87q3nvnh) to join +the meetup. 
+ +___ + [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) [!WARNING] From 53200091b84f08a5e4921f5073137784570283e9 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Sat, 10 Aug 2024 21:21:35 +0530 Subject: [PATCH 08/44] Fix benchmarking tests (#7461) --- qa/L0_perf_tensorrt_llm/test.sh | 65 +++++++++------------------------ qa/L0_perf_vllm/test.sh | 2 +- 2 files changed, 19 insertions(+), 48 deletions(-) diff --git a/qa/L0_perf_tensorrt_llm/test.sh b/qa/L0_perf_tensorrt_llm/test.sh index 35d360498d..e74b01e568 100755 --- a/qa/L0_perf_tensorrt_llm/test.sh +++ b/qa/L0_perf_tensorrt_llm/test.sh @@ -34,7 +34,7 @@ TRT_ROOT="/usr/local/tensorrt" MODEL_NAME="gpt2_tensorrt_llm" NAME="tensorrt_llm_benchmarking_test" MODEL_REPOSITORY="$(pwd)/triton_model_repo" -TENSORRTLLM_BACKEND_DIR="/opt/tritonserver/tensorrtllm_backend" +TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend" GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt" TOKENIZER_DIR="$GPT_DIR/gpt2" ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu" @@ -47,40 +47,27 @@ SERVER_TIMEOUT=${SERVER_TIMEOUT:=120} function clone_tensorrt_llm_backend_repo { rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR apt-get update && apt-get install git-lfs -y --no-install-recommends - git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} https://github.com/triton-inference-server/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR + git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive } # Update Open MPI to a version compatible with SLURM. function upgrade_openmpi { - cd /tmp/ local CURRENT_VERSION=$(mpirun --version 2>&1 | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}') if [ -n "$CURRENT_VERSION" ] && dpkg --compare-versions "$CURRENT_VERSION" lt "5.0.1"; then # Uninstall the current version of Open MPI - wget "https://download.open-mpi.org/release/open-mpi/v$(echo "${CURRENT_VERSION}" | awk -F. '{print $1"."$2}')/openmpi-${CURRENT_VERSION}.tar.gz" || { - echo "Failed to download Open MPI ${CURRENT_VERSION}" - exit 1 - } - rm -rf "openmpi-${CURRENT_VERSION}" && tar -xzf "openmpi-${CURRENT_VERSION}.tar.gz" && cd "openmpi-${CURRENT_VERSION}" || { - echo "Failed to extract Open MPI ${CURRENT_VERSION}" - exit 1 - } - unset PMIX_VERSION && ./configure --prefix=/opt/hpcx/ompi/ && make uninstall || { - echo "Failed to uninstall Open MPI ${CURRENT_VERSION}" - exit 1 - } - rm -rf /opt/hpcx/ompi/ /usr/local/mpi/ || { - echo "Failed to remove Open MPI ${CURRENT_VERSION} installation directories" + rm -r /opt/hpcx/ompi/ /usr/local/mpi && rm -rf /usr/lib/$(gcc -print-multiarch)/openmpi || { + echo "Failed to uninstall the existing Open MPI version $CURRENT_VERSION." exit 1 } - cd ../ && rm -r openmpi-${CURRENT_VERSION} else - echo "Installed Open MPI version is not less than 5.0.1. Skipping the upgrade." + echo "The installed Open MPI version ($CURRENT_VERSION) is 5.0.1 or higher. Skipping the upgrade." 
return fi # Install SLURM supported Open MPI version + cd /tmp/ wget "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.1.tar.gz" || { echo "Failed to download Open MPI 5.0.1" exit 1 @@ -108,18 +95,6 @@ function upgrade_openmpi { mpirun --version } -function install_tensorrt_llm { - # Install CMake - bash ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/docker/common/install_cmake.sh - export PATH="/usr/local/cmake/bin:${PATH}" - - TORCH_INSTALL_TYPE="pypi" && - (cd ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm && - bash docker/common/install_pytorch.sh $TORCH_INSTALL_TYPE && - python3 ./scripts/build_wheel.py --trt_root=/usr/local/tensorrt && - pip3 install ./build/tensorrt_llm*.whl) -} - function build_gpt2_base_model { # Download weights from HuggingFace Transformers cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2 @@ -131,24 +106,21 @@ function build_gpt2_base_model { cd ${GPT_DIR} # Convert weights from HF Tranformers to FT format - python3 hf_gpt_convert.py -p 1 -i gpt2 -o ./c-model/gpt2 --tensor-parallelism ${NUM_GPUS} --storage-type float16 + python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" cd ${BASE_DIR} } function build_gpt2_tensorrt_engine { # Build TensorRT engines cd ${GPT_DIR} - python3 build.py --model_dir="./c-model/gpt2/${NUM_GPUS}-gpu/" \ - --world_size="${NUM_GPUS}" \ - --dtype float16 \ - --use_inflight_batching \ - --use_gpt_attention_plugin float16 \ - --paged_kv_cache \ - --use_gemm_plugin float16 \ - --remove_input_padding \ - --hidden_act gelu \ - --parallel_build \ - --output_dir="${ENGINES_DIR}" + trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \ + --gpt_attention_plugin float16 \ + --remove_input_padding enable \ + --paged_kv_cache enable \ + --gemm_plugin float16 \ + --workers "${NUM_GPUS}" \ + --output_dir "${ENGINES_DIR}" + cd ${BASE_DIR} } @@ -172,18 +144,18 @@ function prepare_model_repository { replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" - replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${max_queue_size}' "0" 
"${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" } # Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on @@ -244,13 +216,12 @@ function kill_server { upgrade_openmpi clone_tensorrt_llm_backend_repo -install_tensorrt_llm build_gpt2_base_model build_gpt2_tensorrt_engine prepare_model_repository # Install perf_analyzer -pip3 install tritonclient nvidia-ml-py3 +pip3 install tritonclient ARCH="amd64" STATIC_BATCH=1 diff --git a/qa/L0_perf_vllm/test.sh b/qa/L0_perf_vllm/test.sh index 498f6f8e14..e1ce8cf2ed 100755 --- a/qa/L0_perf_vllm/test.sh +++ b/qa/L0_perf_vllm/test.sh @@ -41,7 +41,7 @@ SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR} export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:=0} EXPORT_FILE=profile-export-vllm-model.json -pip3 install tritonclient nvidia-ml-py3 +pip3 install tritonclient rm -rf $MODEL_REPO $EXPORT_FILE *.tjson *.json *.csv mkdir -p $MODEL_REPO/$MODEL_NAME/1 From e7c8e7b2e52cf927da29b9d857746592222be0ff Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Thu, 15 Aug 2024 20:35:51 -0700 Subject: [PATCH 09/44] feat: Add vLLM counter metrics access through Triton (#7493) Report vLLM counter metrics through Triton server --- build.py | 4 ++++ docs/user_guide/metrics.md | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/build.py b/build.py index 2c95cbded3..0487636b09 100755 --- a/build.py +++ b/build.py @@ -1806,6 +1806,10 @@ def backend_clone( os.path.join(build_dir, be, "src", "model.py"), backend_dir, ) + clone_script.cpdir( + os.path.join(build_dir, be, "src", "utils"), + backend_dir, + ) clone_script.comment() clone_script.comment(f"end '{be}' backend") diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md index 0a7f3cf1a3..b8fc0d8ee0 100644 --- a/docs/user_guide/metrics.md +++ b/docs/user_guide/metrics.md @@ -378,3 +378,9 @@ Further documentation can be found in the `TRITONSERVER_MetricFamily*` and The TRT-LLM backend uses the custom metrics API to track and expose specific metrics about LLMs, KV Cache, and Inflight Batching to Triton: https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#triton-metrics + +### vLLM Backend Metrics + +The vLLM backend uses the custom metrics API to track and expose specific metrics about +LLMs to Triton: +https://github.com/triton-inference-server/vllm_backend?tab=readme-ov-file#triton-metrics From 7a158a4075bbd3d757ce0d2a2afc419d509ed599 Mon Sep 17 00:00:00 2001 From: Kyle McGill <101670481+nv-kmcgill53@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:41:19 -0700 Subject: [PATCH 10/44] build: RHEL 8 Compatibility (#7519) Co-authored-by: Francesco Petrini --- CMakeLists.txt | 16 +-- build.py | 216 +++++++++++++++++++++++++++++--- qa/L0_infer/install_and_test.sh | 22 +++- src/CMakeLists.txt | 11 +- 4 files changed, 228 insertions(+), 37 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff578c9724..56cb346dc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,17 +125,13 @@ FetchContent_Declare( # Some libs are installed to ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib64 instead # of ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib on Centos -set (LIB_DIR "lib") -# /etc/os-release does not exist on Windows -if(EXISTS "/etc/os-release") - file(STRINGS /etc/os-release DISTRO REGEX "^NAME=") - string(REGEX REPLACE "NAME=\"(.*)\"" "\\1" DISTRO "${DISTRO}") - message(STATUS "Distro Name: ${DISTRO}") - if(DISTRO MATCHES "CentOS.*") +set(LIB_DIR "lib") +if(LINUX) + file(STRINGS 
"/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") set (LIB_DIR "lib64") - endif() -endif() - + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) set(TRITON_CORE_HEADERS_ONLY OFF) FetchContent_MakeAvailable(repo-third-party repo-core) diff --git a/build.py b/build.py index 0487636b09..2a9b2469fc 100755 --- a/build.py +++ b/build.py @@ -37,6 +37,7 @@ import sys from inspect import getsourcefile +import distro import requests # @@ -117,7 +118,17 @@ def fail_if(p, msg): def target_platform(): if FLAGS.target_platform is not None: return FLAGS.target_platform - return platform.system().lower() + platform_string = platform.system().lower() + if platform_string == "linux": + # Need to inspect the /etc/os-release file to get + # the distribution of linux + id_like_list = distro.like().split() + if "debian" in id_like_list: + return "linux" + else: + return "rhel" + else: + return platform_string def target_machine(): @@ -649,7 +660,8 @@ def onnxruntime_cmake_args(images, library_paths): ] # TRITON_ENABLE_GPU is already set for all backends in backend_cmake_args() - if FLAGS.enable_gpu: + # TODO: TPRD-334 TensorRT extension is not currently supported by our manylinux build + if FLAGS.enable_gpu and target_platform() != "rhel": cargs.append( cmake_backend_enable( "onnxruntime", "TRITON_ENABLE_ONNXRUNTIME_TENSORRT", True @@ -680,8 +692,11 @@ def onnxruntime_cmake_args(images, library_paths): ) ) - if (target_machine() != "aarch64") and ( - TRITON_VERSION_MAP[FLAGS.version][3] is not None + # TODO: TPRD-333 OpenVino extension is not currently supported by our manylinux build + if ( + (target_machine() != "aarch64") + and (target_platform() != "rhel") + and (TRITON_VERSION_MAP[FLAGS.version][3] is not None) ): cargs.append( cmake_backend_enable( @@ -697,7 +712,7 @@ def onnxruntime_cmake_args(images, library_paths): ) ) - if target_platform() == "igpu": + if (target_platform() == "igpu") or (target_platform() == "rhel"): cargs.append( cmake_backend_arg( "onnxruntime", @@ -833,8 +848,31 @@ def install_dcgm_libraries(dcgm_version, target_machine): ) return "" else: - if target_machine == "aarch64": - return """ + # RHEL has the same install instructions for both aarch64 and x86 + if target_platform() == "rhel": + if target_machine == "aarch64": + return """ +ENV DCGM_VERSION {} +# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\ + && dnf clean expire-cache \\ + && dnf install -y datacenter-gpu-manager-{} +""".format( + dcgm_version, dcgm_version + ) + else: + return """ +ENV DCGM_VERSION {} +# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\ + && dnf clean expire-cache \\ + && dnf install -y datacenter-gpu-manager-{} +""".format( + dcgm_version, dcgm_version + ) + else: + if target_machine == "aarch64": + return """ ENV DCGM_VERSION {} # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads RUN curl -o /tmp/cuda-keyring.deb \\ @@ -844,10 +882,10 @@ def install_dcgm_libraries(dcgm_version, target_machine): && apt-get update \\ && apt-get install -y datacenter-gpu-manager=1:{} """.format( - dcgm_version, dcgm_version - ) - else: - return """ + dcgm_version, dcgm_version + ) + else: + return """ ENV DCGM_VERSION {} # Install DCGM. 
Steps from https://developer.nvidia.com/dcgm#Downloads RUN curl -o /tmp/cuda-keyring.deb \\ @@ -857,8 +895,106 @@ def install_dcgm_libraries(dcgm_version, target_machine): && apt-get update \\ && apt-get install -y datacenter-gpu-manager=1:{} """.format( - dcgm_version, dcgm_version - ) + dcgm_version, dcgm_version + ) + + +def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap): + df = """ +ARG TRITON_VERSION={} +ARG TRITON_CONTAINER_VERSION={} +ARG BASE_IMAGE={} +""".format( + argmap["TRITON_VERSION"], + argmap["TRITON_CONTAINER_VERSION"], + argmap["BASE_IMAGE"], + ) + + df += """ +FROM ${BASE_IMAGE} + +ARG TRITON_VERSION +ARG TRITON_CONTAINER_VERSION +""" + df += """ +# Install docker docker buildx +RUN yum install -y ca-certificates curl gnupg yum-utils \\ + && yum-config-manager --add-repo https://download.docker.com/linux/rhel/docker-ce.repo \\ + && yum install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +# && yum install -y docker.io docker-buildx-plugin + +# libcurl4-openSSL-dev is needed for GCS +# python3-dev is needed by Torchvision +# python3-pip and libarchive-dev is needed by python backend +# libxml2-dev is needed for Azure Storage +# scons is needed for armnn_tflite backend build dep +RUN yum install -y \\ + ca-certificates \\ + autoconf \\ + automake \\ + git \\ + gperf \\ + re2-devel \\ + openssl-devel \\ + libtool \\ + libcurl-devel \\ + libb64-devel \\ + gperftools-devel \\ + patchelf \\ + python3.11-devel \\ + python3-pip \\ + python3-setuptools \\ + rapidjson-devel \\ + python3-scons \\ + pkg-config \\ + unzip \\ + wget \\ + zlib-devel \\ + libarchive-devel \\ + libxml2-devel \\ + numactl-devel \\ + wget + +RUN pip3 install --upgrade pip \\ + && pip3 install --upgrade \\ + wheel \\ + setuptools \\ + docker \\ + virtualenv + +# Install boost version >= 1.78 for boost::span +# Current libboost-dev apt packages are < 1.78, so install from tar.gz +RUN wget -O /tmp/boost.tar.gz \\ + https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz \\ + && (cd /tmp && tar xzf boost.tar.gz) \\ + && mv /tmp/boost_1_80_0/boost /usr/include/boost + +# Server build requires recent version of CMake (FetchContent required) +# Might not need this if the installed version of cmake is high enough for our build. +# RUN apt update -q=2 \\ +# && apt install -y gpg wget \\ +# && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \\ +# && . /etc/os-release \\ +# && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \\ +# && apt-get update -q=2 \\ +# && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* +""" + if FLAGS.enable_gpu: + df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine()) + df += """ +ENV TRITON_SERVER_VERSION ${TRITON_VERSION} +ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION} +""" + + df += """ +WORKDIR /workspace +RUN rm -fr * +COPY . . 
+ENTRYPOINT [] +""" + + with open(os.path.join(ddir, dockerfile_name), "w") as dfile: + dfile.write(df) def create_dockerfile_buildbase(ddir, dockerfile_name, argmap): @@ -1161,7 +1297,29 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach fi \\ && [ `id -u $TRITON_SERVER_USER` -eq 1000 ] \\ && [ `id -g $TRITON_SERVER_USER` -eq 1000 ] +""".format( + gpu_enabled=gpu_enabled + ) + # This + if target_platform() == "rhel": + df += """ +# Common dpeendencies. +RUN yum install -y \\ + git \\ + gperf \\ + re2-devel \\ + openssl-devel \\ + libtool \\ + libcurl-devel \\ + libb64-devel \\ + gperftools-devel \\ + patchelf \\ + wget \\ + numactl-devel +""" + else: + df += """ # Ensure apt-get won't prompt for selecting options ENV DEBIAN_FRONTEND=noninteractive @@ -1184,12 +1342,14 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach wget \\ {backend_dependencies} \\ && rm -rf /var/lib/apt/lists/* +""".format( + backend_dependencies=backend_dependencies + ) + df += """ # Set TCMALLOC_RELEASE_RATE for users setting LD_PRELOAD with tcmalloc ENV TCMALLOC_RELEASE_RATE 200 -""".format( - gpu_enabled=gpu_enabled, backend_dependencies=backend_dependencies - ) +""" if "fastertransformer" in backends: be = "fastertransformer" @@ -1433,9 +1593,14 @@ def create_build_dockerfiles( ) dockerfileargmap["GPU_BASE_IMAGE"] = gpu_base_image - create_dockerfile_buildbase( - FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap - ) + if target_platform() == "rhel": + create_dockerfile_buildbase_rhel( + FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap + ) + else: + create_dockerfile_buildbase( + FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap + ) if target_platform() == "windows": create_dockerfile_windows( @@ -1651,6 +1816,17 @@ def core_build( os.path.join(repo_install_dir, "lib", "tritonserver.lib"), os.path.join(install_dir, "bin"), ) + elif target_platform() == "rhel": + cmake_script.mkdir(os.path.join(install_dir, "bin")) + cmake_script.cp( + os.path.join(repo_install_dir, "bin", "tritonserver"), + os.path.join(install_dir, "bin"), + ) + cmake_script.mkdir(os.path.join(install_dir, "lib64")) + cmake_script.cp( + os.path.join(repo_install_dir, "lib64", "libtritonserver.so"), + os.path.join(install_dir, "lib64"), + ) else: cmake_script.mkdir(os.path.join(install_dir, "bin")) cmake_script.cp( @@ -2128,7 +2304,7 @@ def enable_all(): "--target-platform", required=False, default=None, - help='Target platform for build, can be "linux", "windows" or "igpu". If not specified, build targets the current platform.', + help='Target platform for build, can be "linux", "rhel", "windows" or "igpu". If not specified, build targets the current platform.', ) parser.add_argument( "--target-machine", diff --git a/qa/L0_infer/install_and_test.sh b/qa/L0_infer/install_and_test.sh index 28e5dad52e..4c136cf1dd 100755 --- a/qa/L0_infer/install_and_test.sh +++ b/qa/L0_infer/install_and_test.sh @@ -25,14 +25,24 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Determine the operating system to call the correct package manager. 
+ID_LIKE=$(grep -Po '(?<=ID_LIKE=).*' /etc/os-release | awk -F= '{print $1}' | tr -d '"' | awk '{print $1}') + # Note: This script is to be used with customized triton containers that need # dependencies to run L0_infer tests -apt-get update && \ - apt-get install -y --no-install-recommends \ - curl \ - jq \ - python3 \ - python3-pip +if [[ "$ID_LIKE" =~ "debian" ]]; then + apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + jq \ + python3 \ + python3-pip +else + yum install -y \ + jq \ + curl +fi + pip3 install --upgrade pip # install client libraries pip3 install tritonclient[all] diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 783275d8d7..cf43765dba 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -138,6 +138,15 @@ else() ) endif() +set(LIB_DIR "lib") +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set (LIB_DIR "lib64") + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) +set(TRITON_CORE_HEADERS_ONLY OFF) + set_target_properties( main PROPERTIES @@ -145,7 +154,7 @@ set_target_properties( SKIP_BUILD_RPATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE - INSTALL_RPATH "$\{ORIGIN\}/../lib" + INSTALL_RPATH "$\{ORIGIN\}/../${LIB_DIR}" ) target_link_libraries( From 66ccb9d8baf5d6d0d6a3fa901c98d781255e28eb Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 16:15:34 -0700 Subject: [PATCH 11/44] feat: Add GRPC error codes to GRPC streaming if enabled by user. (#7499) --- Dockerfile.QA | 6 + .../inference_protocols.md | 10 ++ .../lifecycle/lifecycle_test.py | 130 ++++++++++++++++++ qa/L0_backend_python/lifecycle/test.sh | 8 ++ qa/L0_decoupled/decoupled_test.py | 16 ++- qa/L0_decoupled/test.sh | 2 +- qa/L0_grpc_state_cleanup/cleanup_test.py | 42 ++++-- .../execute_grpc_error/config.pbtxt | 51 +++++++ qa/python_models/execute_grpc_error/model.py | 52 +++++++ src/grpc/grpc_utils.h | 41 +++++- src/grpc/infer_handler.h | 43 +++++- src/grpc/stream_infer_handler.cc | 64 ++++++++- 12 files changed, 444 insertions(+), 21 deletions(-) create mode 100644 qa/python_models/execute_grpc_error/config.pbtxt create mode 100644 qa/python_models/execute_grpc_error/model.py diff --git a/Dockerfile.QA b/Dockerfile.QA index 2c43f735a5..b381abfaaf 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -267,6 +267,12 @@ RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \ cp /workspace/tritonbuild/python/examples/decoupled/square_config.pbtxt \ qa/L0_decoupled/python_models/square_int32/. +RUN mkdir -p qa/L0_decoupled_grpc_error && \ + cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error + +RUN mkdir -p qa/L0_grpc_error_state_cleanup && \ + cp -r qa/L0_grpc_state_cleanup/. qa/L0_grpc_error_state_cleanup + RUN mkdir -p qa/L0_repoagent_checksum/models/identity_int32/1 && \ cp tritonbuild/identity/install/backends/identity/libtriton_identity.so \ qa/L0_repoagent_checksum/models/identity_int32/1/. diff --git a/docs/customization_guide/inference_protocols.md b/docs/customization_guide/inference_protocols.md index 592f26e7d1..a241f097da 100644 --- a/docs/customization_guide/inference_protocols.md +++ b/docs/customization_guide/inference_protocols.md @@ -115,6 +115,16 @@ These options can be used to configure the KeepAlive settings: For client-side documentation, see [Client-Side GRPC KeepAlive](https://github.com/triton-inference-server/client/blob/main/README.md#grpc-keepalive). 
+#### GRPC Status Codes + +Triton implements GRPC error handling for streaming requests when a specific flag is enabled through headers. Upon encountering an error, Triton returns the appropriate GRPC error code and subsequently closes the stream. + +* `triton_grpc_error` : The header value needs to be set to true while starting the stream. + +GRPC status codes can be used for better visibility and monitoring. For more details, see [gRPC Status Codes](https://grpc.io/docs/guides/status-codes/) + +For client-side documentation, see [Client-Side GRPC Status Codes](https://github.com/triton-inference-server/client/tree/main#GRPC-Status-Codes) + ### Limit Endpoint Access (BETA) Triton users may want to restrict access to protocols or APIs that are diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 883f6d20b6..d6eb2a8f53 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -35,6 +35,7 @@ sys.path.append("../../common") import queue +import threading import time import unittest from functools import partial @@ -241,6 +242,135 @@ def test_infer_pymodel_error(self): initial_metrics_value, ) + # Test grpc stream behavior when triton_grpc_error is set to true. + # Expected to close stream and return GRPC error when model returns error. + def test_triton_grpc_error_error_on(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 2 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + stream_end = False + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + stream_end = True + else: + # Stream is not killed + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + + # Test grpc stream behavior when triton_grpc_error is set to true in multiple open streams. + # Expected to close stream and return GRPC error when model returns error. + def test_triton_grpc_error_multithreaded(self): + thread1 = threading.Thread(target=self.test_triton_grpc_error_error_on) + thread2 = threading.Thread(target=self.test_triton_grpc_error_error_on) + # Start the threads + thread1.start() + thread2.start() + # Wait for both threads to finish + thread1.join() + thread2.join() + + # Test grpc stream behavior when triton_grpc_error is set to true and subsequent stream is cancelled. + # Expected cancellation is successful. 
+ def test_triton_grpc_error_cancel(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 1 + user_data = UserData() + triton_server_url = "localhost:8001" # Replace with your Triton server address + stream_end = False + triton_client = grpcclient.InferenceServerClient(triton_server_url) + + metadata = {"triton_grpc_error": "true"} + + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + stream_end = True + if i == 0: + triton_client.stop_stream(cancel_requests=True) + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + self.assertTrue( + True, + "This should always pass as cancellation should succeed without any exception", + ) + + # Test grpc stream behavior when triton_grpc_error is set to false + # and subsequent stream is NOT closed when error is reported from CORE + def test_triton_grpc_error_error_off(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 4 + response_counter = 0 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + triton_client.start_stream(callback=partial(callback, user_data)) + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + _ = user_data._completed_requests.get() + response_counter += 1 + # we expect response_counter == number_of_requests, + # which indicates that after the first reported grpc error stream did NOT close and mode != triton_grpc_error + self.assertEqual(response_counter, number_of_requests) + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/lifecycle/test.sh b/qa/L0_backend_python/lifecycle/test.sh index dba4581ddd..59b846f56b 100755 --- a/qa/L0_backend_python/lifecycle/test.sh +++ b/qa/L0_backend_python/lifecycle/test.sh @@ -52,6 +52,14 @@ cp ../../python_models/execute_error/config.pbtxt ./models/execute_error/ sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 12000000 }" >> config.pbtxt) +mkdir -p models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/model.py ./models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/config.pbtxt ./models/execute_grpc_error/ +(cd models/execute_grpc_error && \ + sed -i "s/^name:.*/name: \"execute_grpc_error\"/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 1200000 }" >> config.pbtxt) + mkdir -p models/execute_return_error/1/ cp ../../python_models/execute_return_error/model.py 
./models/execute_return_error/1/ cp ../../python_models/execute_return_error/config.pbtxt ./models/execute_return_error/ diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index 1f76f4845b..d7bc59f5c7 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -116,7 +116,13 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -175,7 +181,13 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh index 98ad134d8b..22c37dff49 100755 --- a/qa/L0_decoupled/test.sh +++ b/qa/L0_decoupled/test.sh @@ -176,4 +176,4 @@ else echo -e "\n***\n*** Test Failed\n***" fi -exit $RET +exit $RET \ No newline at end of file diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py index 431eeb1720..f7507747e9 100755 --- a/qa/L0_grpc_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_state_cleanup/cleanup_test.py @@ -161,9 +161,17 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -229,9 +237,17 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -608,9 +624,17 @@ def test_non_decoupled_streaming_multi_response(self): url="localhost:8001", verbose=True ) as client: # Establish stream - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16 - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in 
os.environ: + metadata = {"triton_grpc_error": "true"} + client.start_stream( + callback=partial(callback, user_data), + stream_timeout=16, + headers=metadata, + ) + else: + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16 + ) # Send a request client.async_stream_infer( model_name=self.repeat_non_decoupled_model_name, diff --git a/qa/python_models/execute_grpc_error/config.pbtxt b/qa/python_models/execute_grpc_error/config.pbtxt new file mode 100644 index 0000000000..70e247148a --- /dev/null +++ b/qa/python_models/execute_grpc_error/config.pbtxt @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/execute_grpc_error/model.py b/qa/python_models/execute_grpc_error/model.py new file mode 100644 index 0000000000..d5087a49ec --- /dev/null +++ b/qa/python_models/execute_grpc_error/model.py @@ -0,0 +1,52 @@ +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def __init__(self): + # Maintain total inference count, so as to return error on 2nd request, all of this to simulate model failure + self.inf_count = 1 + + def execute(self, requests): + """This function is called on inference request.""" + responses = [] + + # Generate the error for the second request + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") + out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy()) + if self.inf_count % 2: + # Every odd request is success + responses.append(pb_utils.InferenceResponse([out_tensor])) + else: + # Every even request is failure + error = pb_utils.TritonError("An error occurred during execution") + responses.append(pb_utils.InferenceResponse([out_tensor], error)) + self.inf_count += 1 + + return responses diff --git a/src/grpc/grpc_utils.h b/src/grpc/grpc_utils.h index 898e4acb4f..032dec3ad9 100644 --- a/src/grpc/grpc_utils.h +++ b/src/grpc/grpc_utils.h @@ -76,6 +76,46 @@ typedef enum { PARTIAL_COMPLETION } Steps; +typedef enum { + // No error from CORE seen yet + NONE, + // Error from CORE encountered, waiting to be picked up by completion queue to + // initiate cancellation + ERROR_ENCOUNTERED, + // Error from CORE encountered, stream closed + // This state is added to avoid double cancellation + ERROR_HANDLING_COMPLETE +} TritonGRPCErrorSteps; + +class gRPCErrorTracker { + public: + // True if set by user via header + // Can be accessed without a lock, as set only once in startstream + std::atomic triton_grpc_error_; + + // Indicates the state of triton_grpc_error, only relevant if special + // triton_grpc_error feature set to true by client + TritonGRPCErrorSteps grpc_stream_error_state_; + + // Constructor + gRPCErrorTracker() + : triton_grpc_error_(false), + grpc_stream_error_state_(TritonGRPCErrorSteps::NONE) + { + } + // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, + // indicating we have closed the stream and initiated the cancel flow + void MarkGRPCErrorHandlingComplete(); + + // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. 
+ bool CheckAndUpdateGRPCError(); + + // Marks error after it has been responded to + void MarkGRPCErrorEncountered(); + + // Checks if error already responded to in triton_grpc_error mode + bool GRPCErrorEncountered(); +}; // Debugging helper std::ostream& operator<<(std::ostream& out, const Steps& step); @@ -183,5 +223,4 @@ TRITONSERVER_Error* ParseClassificationParams( void ReadFile(const std::string& filename, std::string& data); - }}} // namespace triton::server::grpc diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 0e1091feb8..6382c96c3c 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -646,6 +646,7 @@ class InferHandlerState { { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); + gRPCErrorTracker_ = std::make_unique(); } void SetCompressionLevel(grpc_compression_level compression_level) @@ -666,9 +667,11 @@ class InferHandlerState { bool IsCancelled() { - return received_notification_ ? ctx_->IsCancelled() : false; + return received_notification_ + ? (ctx_->IsCancelled() || + gRPCErrorTracker_->CheckAndUpdateGRPCError()) + : false; } - // Increments the ongoing request counter void IncrementRequestCounter() { ongoing_requests_++; } @@ -710,6 +713,37 @@ class InferHandlerState { return false; } + // Extracts headers from GRPC request and updates state + void ExtractStateFromHeaders(InferHandlerStateType* state) + { + const auto& metadata = state->context_->ctx_->client_metadata(); + std::string triton_grpc_error_key = "triton_grpc_error"; + + auto it = metadata.find( + {triton_grpc_error_key.data(), triton_grpc_error_key.size()}); + + if (it != metadata.end()) { + if (it->second == "true") { + LOG_VERBOSE(2) + << "GRPC: triton_grpc_error mode detected in new grpc stream"; + state->context_->gRPCErrorTracker_->triton_grpc_error_ = true; + } + } + } + + void WriteGRPCErrorResponse(InferHandlerStateType* state) + { + std::lock_guard lock(state->context_->mu_); + // Check if Error not responded previously + // Avoid closing connection twice on multiple errors from core + if (!state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { + state->step_ = Steps::COMPLETE; + state->context_->responder_->Finish(state->status_, state); + // Mark error for this stream + state->context_->gRPCErrorTracker_->MarkGRPCErrorEncountered(); + } + } + const std::string DebugString(InferHandlerStateType* state) { std::string debug_string(""); @@ -793,6 +827,7 @@ class InferHandlerState { bool HandleCancellation( InferHandlerStateType* state, bool rpc_ok, const std::string& name) { + // Check to avoid early exit in case of triton_grpc_error if (!IsCancelled()) { LOG_ERROR << "[INTERNAL] HandleCancellation called even when the context was " @@ -816,7 +851,6 @@ class InferHandlerState { IssueRequestCancellation(); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; - // The state returns true because the CancelExecution // call above would have raised alarm objects on all // pending inflight states objects. This state will @@ -999,6 +1033,8 @@ class InferHandlerState { // Tracks whether the async notification has been delivered by // completion queue. 
bool received_notification_; + + std::unique_ptr gRPCErrorTracker_; }; // This constructor is used to build a wrapper state object @@ -1090,7 +1126,6 @@ class InferHandlerState { void MarkAsAsyncNotifyState() { async_notify_state_ = true; } bool IsAsyncNotifyState() { return async_notify_state_; } - // Needed in the response handle for classification outputs. TRITONSERVER_Server* tritonserver_; diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 585f88d536..6651eca813 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -189,7 +189,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) state->context_->responder_->Finish(status, state); return !finished; } - + state->context_->ExtractStateFromHeaders(state); } else if (state->step_ == Steps::READ) { TRITONSERVER_Error* err = nullptr; const inference::ModelInferRequest& request = state->request_; @@ -355,7 +355,6 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) GrpcStatusUtil::Create(&status, err); TRITONSERVER_ErrorDelete(err); response->set_error_message(status.error_message()); - response->mutable_infer_response()->Clear(); // repopulate the id so that client knows which request failed. response->mutable_infer_response()->set_id(request.id()); @@ -596,7 +595,13 @@ ModelStreamInferHandler::StreamInferResponseComplete( void* userp) { State* state = reinterpret_cast(userp); - + // Ignore Response from CORE in case GRPC Strict as we dont care about + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { + std::lock_guard lock(state->context_->mu_); + if (state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { + return; + } + } // Increment the callback index uint32_t response_index = state->cb_count_++; @@ -671,14 +676,27 @@ ModelStreamInferHandler::StreamInferResponseComplete( } else { LOG_ERROR << "expected the response allocator to have added the response"; } - if (err != nullptr) { failed = true; ::grpc::Status status; + // Converts CORE errors to GRPC error codes GrpcStatusUtil::Create(&status, err); response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { + state->status_ = status; + // Finish only once, if backend ignores cancellation + LOG_VERBOSE(1) << "GRPC streaming error detected with status: " + << status.error_code() << "Closing stream connection." + << std::endl; + state->context_->WriteGRPCErrorResponse(state); + TRITONSERVER_ErrorDelete(err); + LOG_TRITONSERVER_ERROR( + TRITONSERVER_InferenceResponseDelete(iresponse), + "deleting GRPC inference response"); + return; + } } TRITONSERVER_ErrorDelete(err); @@ -802,4 +820,42 @@ ModelStreamInferHandler::StreamInferResponseComplete( } } +// Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, +// indicating we have closed the stream and initiated the cancel flow +void +gRPCErrorTracker::MarkGRPCErrorHandlingComplete() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_HANDLING_COMPLETE; +} + +// Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. 
+bool +gRPCErrorTracker::CheckAndUpdateGRPCError() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::ERROR_ENCOUNTERED) { + // Change the state to ERROR_HANDLING_COMPLETE as we have called + // HandleCancellation + MarkGRPCErrorHandlingComplete(); + return true; + } + return false; +} + +// Marks error after it has been responded to +void +gRPCErrorTracker::MarkGRPCErrorEncountered() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_ENCOUNTERED; +} + +// Checks if error already responded to in triton_grpc_error mode +bool +gRPCErrorTracker::GRPCErrorEncountered() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::NONE) { + return false; + } + return true; +} + }}} // namespace triton::server::grpc From 62184db709e7bc6e43d3098c837a464b9f2249cb Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Fri, 16 Aug 2024 17:12:43 -0700 Subject: [PATCH 12/44] test: Add python backend tests for the new histogram metric (#7540) --- qa/python_models/custom_metrics/model.py | 144 ++++++++++++++++++++++- 1 file changed, 141 insertions(+), 3 deletions(-) diff --git a/qa/python_models/custom_metrics/model.py b/qa/python_models/custom_metrics/model.py index 31f105a1dd..7c78b46894 100644 --- a/qa/python_models/custom_metrics/model.py +++ b/qa/python_models/custom_metrics/model.py @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -74,6 +74,96 @@ def _metric_api_helper(self, metric, kind): self.assertEqual(metric.value(), value) logger.log_info("Set metric to : {}".format(metric.value())) + # Test observe value + observe = 0.05 + # Counter and gauge do not support observe + with self.assertRaises(pb_utils.TritonModelException): + metric.observe(observe) + + def _histogram_api_helper(self, metric, name, labels): + def histogram_str_builder(name, type, labels, value, le=None): + if type == "count" or type == "sum": + return f"{name}_{type}{{{labels}}} {value}" + elif type == "bucket": + return f'{name}_bucket{{{labels},le="{le}"}} {value}' + else: + raise + + # Adding logger to test if custom metrics and logging work together + # as they use the same message queue. 
+ logger = pb_utils.Logger + + # All values should be 0.0 before the test + metrics = self._get_metrics() + self.assertIn(histogram_str_builder(name, "count", labels, "0"), metrics) + self.assertIn(histogram_str_builder(name, "sum", labels, "0"), metrics) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="0.1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="2.5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="10"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="+Inf"), metrics + ) + + # Histogram does not support value + with self.assertRaises(pb_utils.TritonModelException): + metric.value() + + # Test increment value + increment = 2023.0 + # Histogram does not support increment + with self.assertRaises(pb_utils.TritonModelException): + metric.increment(increment) + + # Test set value + value = 999.9 + # Histogram does not support set + with self.assertRaises(pb_utils.TritonModelException): + metric.set(value) + + # Test observe value + data = [0.05, 1.5, 6.0] + for datum in data: + metric.observe(datum) + logger.log_info("Observe histogram metric with value : {}".format(datum)) + + metrics = self._get_metrics() + self.assertIn( + histogram_str_builder(name, "count", labels, str(len(data))), metrics + ) + self.assertIn( + histogram_str_builder(name, "sum", labels, str(sum(data))), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "1", le="0.1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "1", le="1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "2", le="2.5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "2", le="5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "3", le="10"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "3", le="+Inf"), metrics + ) + def _dup_metric_helper(self, labels={}): # Adding logger to test if custom metrics and logging work together # as they use the same message queue. 
@@ -128,14 +218,62 @@ def test_gauge_e2e(self): description="test metric gauge kind end to end", kind=pb_utils.MetricFamily.GAUGE, ) - labels = {"example1": "counter_label1", "example2": "counter_label2"} + labels = {"example1": "gauge_label1", "example2": "gauge_label2"} metric = metric_family.Metric(labels=labels) self._metric_api_helper(metric, "gauge") - pattern = 'test_gauge_e2e{example1="counter_label1",example2="counter_label2"}' + pattern = 'test_gauge_e2e{example1="gauge_label1",example2="gauge_label2"}' metrics = self._get_metrics() self.assertIn(pattern, metrics) + def test_histogram_e2e(self): + name = "test_histogram_e2e" + metric_family = pb_utils.MetricFamily( + name=name, + description="test metric histogram kind end to end", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + + labels = {"example1": "histogram_label1", "example2": "histogram_label2"} + buckets = [0.1, 1.0, 2.5, 5.0, 10.0] + metric = metric_family.Metric(labels=labels, buckets=buckets) + + labels_str = 'example1="histogram_label1",example2="histogram_label2"' + self._histogram_api_helper(metric, name, labels_str) + + metrics = self._get_metrics() + count_pattern = f"{name}_count{{{labels_str}}}" + sum_pattern = f"{name}_sum{{{labels_str}}}" + bucket_pattern = f"{name}_bucket{{{labels_str}" + self.assertEqual(metrics.count(count_pattern), 1) + self.assertEqual(metrics.count(sum_pattern), 1) + self.assertEqual(metrics.count(bucket_pattern), len(buckets) + 1) + + def test_histogram_args(self): + name = "test_histogram_args" + metric_family = pb_utils.MetricFamily( + name=name, + description="test metric histogram args", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + + # Test "None" value buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}) + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=None) + + # Test non-ascending order buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=[2.5, 0.1, 1.0, 10.0, 5.0]) + + # Test duplicate value buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=[1, 1, 2, 5, 5]) + + # Test empty list bucket + metric_family.Metric(labels={}, buckets=[]) + def test_dup_metric_family_diff_kind(self): # Test that a duplicate metric family can't be added with a conflicting type/kind metric_family1 = pb_utils.MetricFamily( From 5e397715635ca57f762e9df667e5192a0ed7d6f0 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:22:39 -0700 Subject: [PATCH 13/44] test: Load new model version should not reload loaded existing model version(s) (#7527) --- qa/L0_lifecycle/lifecycle_test.py | 127 ++++++++++++++++++++++++++++++ qa/L0_lifecycle/test.sh | 35 ++++++++ 2 files changed, 162 insertions(+) diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py index a2bfc067bc..49fe684ff1 100755 --- a/qa/L0_lifecycle/lifecycle_test.py +++ b/qa/L0_lifecycle/lifecycle_test.py @@ -3493,6 +3493,133 @@ def test_delete_custom_config(self): except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) + def test_load_new_model_version(self): + model_name = "identity_fp32" + client = self._get_client(use_grpc=True) + + # version 1 and 2 are already loaded + # version 3 is in the model directory but not loaded + # version 4 does not exist anywhere + self.assertTrue(client.is_model_ready(model_name, "1")) + 
self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertFalse(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 0) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 0) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 1) + + # update version 2 model file + Path(os.path.join("models", model_name, "2", "model.py")).touch() + # add version 4 model file + src_path = os.path.join("models", model_name, "3") + dst_path = os.path.join("models", model_name, "4") + shutil.copytree(src_path, dst_path) + # update model config to load version 1 to 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [1, 2] } }", + "version_policy: { specific: { versions: [1, 2, 3, 4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # version 1 is unmodified so it should not be reloaded + # version 2 is modified so it should be reloaded + # version 3 model file existed but not loaded so it should be loaded + # version 4 is a new version so it should be loaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertTrue(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 1) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 2) + + # simulate a dependency change to all versions + Path(os.path.join("models", model_name, "dummy_dependency.py")).touch() + # reload the model + client.load_model(model_name) + + # all 4 versions should be reloaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertTrue(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 3) + + # update model config to only load version 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [1, 2, 3, 4] } }", + "version_policy: { specific: { versions: [4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the 
model + client.load_model(model_name) + + # only version 4 should be available and no reloads should happen + self.assertFalse(client.is_model_ready(model_name, "1")) + self.assertFalse(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 4) + + # update model config to load version 1 and 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [4] } }", + "version_policy: { specific: { versions: [1, 4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # version 1 should be loaded and version 4 should not be reloaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertFalse(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 5) + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_lifecycle/test.sh b/qa/L0_lifecycle/test.sh index 9236fdabfb..4efd244c76 100755 --- a/qa/L0_lifecycle/test.sh +++ b/qa/L0_lifecycle/test.sh @@ -2196,6 +2196,41 @@ set -e kill $SERVER_PID wait $SERVER_PID +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_load_new_model_version +rm -rf models +mkdir models +cp -r ../python_models/identity_fp32 models/ && (cd models/identity_fp32 && \ + echo "version_policy: { specific: { versions: [1, 2] } }" >> config.pbtxt && \ + echo " def initialize(self, args):" >> model.py && \ + echo " pb_utils.Logger.log_info(f'[PB model] Loading version {args[\"model_version\"]}')" >> model.py && \ + mkdir 1 && cp model.py 1 && \ + mkdir 2 && cp model.py 2 && \ + mkdir 3 && mv model.py 3) + +export PYTHONDONTWRITEBYTECODE="True" +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --load-model=*" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python $LC_TEST LifeCycleTest.test_load_new_model_version >>$CLIENT_LOG 2>&1 +if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID +unset PYTHONDONTWRITEBYTECODE if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" From 44fa2078859ef2e0b116b26b7d27184b836c8c33 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Tue, 20 Aug 2024 12:49:24 -0700 Subject: [PATCH 14/44] Intermittent `L0_decoupled_grpc_error` crash fixed. (#7552) --- src/grpc/infer_handler.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 6382c96c3c..51307d4ae0 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -667,6 +667,7 @@ class InferHandlerState { bool IsCancelled() { + std::lock_guard lock(mu_); return received_notification_ ? (ctx_->IsCancelled() || gRPCErrorTracker_->CheckAndUpdateGRPCError()) From fb60c0ea1fcb81ae4906531aa1b722111260f4d4 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Thu, 22 Aug 2024 10:13:08 -0700 Subject: [PATCH 15/44] ci: Raise Documentation Generation Errors (#7559) * ci: Raise Documentation Generation Errors --- docs/generate_docs.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/docs/generate_docs.py b/docs/generate_docs.py index 6982294d21..0e4079e40b 100755 --- a/docs/generate_docs.py +++ b/docs/generate_docs.py @@ -123,9 +123,7 @@ def run_command(command): stderr=subprocess.PIPE, ) except subprocess.CalledProcessError as e: - log_message(f"Error executing command: {e.cmd}") - log_message(e.output) - log_message(e.stderr) + raise (e) def clone_from_github(repo, tag, org): @@ -137,7 +135,6 @@ def clone_from_github(repo, tag, org): """ # Construct the full GitHub repository URL repo_url = f"https://github.com/{org}/{repo}.git" - print(repo_url) # Construct the git clone command if tag: clone_command = [ @@ -155,7 +152,7 @@ def clone_from_github(repo, tag, org): subprocess.run(clone_command, check=True) log_message(f"Successfully cloned {repo}") except subprocess.CalledProcessError as e: - log_message(f"Failed to clone {repo}. Error: {e}") + raise (e) def parse_repo_tag(repo_tags): @@ -189,8 +186,8 @@ def get_git_repo_name(file_path): .decode() .strip() ) - except subprocess.CalledProcessError: - return None + except subprocess.CalledProcessError as e: + raise (e) # Extract repository name from the remote URL. 
if remote_url.endswith(".git"): From 187a4a3f8fc8d6b577bebe716f7a8028ea3005ac Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Fri, 23 Aug 2024 10:51:26 -0700 Subject: [PATCH 16/44] docs: Add tensorrtllm_backend into doc generation (#7563) --- docs/generate_docs.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/generate_docs.py b/docs/generate_docs.py index 0e4079e40b..acf0afc981 100755 --- a/docs/generate_docs.py +++ b/docs/generate_docs.py @@ -393,6 +393,12 @@ def main(): if "python_backend" in repo_tags: clone_from_github("python_backend", repo_tags["python_backend"], github_org) + # Usage generate_docs.py --repo-tag=tensorrtllm_backend:main + if "tensorrtllm_backend" in repo_tags: + clone_from_github( + "tensorrtllm_backend", repo_tags["tensorrtllm_backend"], github_org + ) + # Usage generate_docs.py --backend-tag=custom_backend:main # Custom backend can be anything currently empty if "custom_backend" in backend_tags: @@ -409,6 +415,10 @@ def main(): run_command("rm -rf python_backend") if "custom_backend" in backend_tags: run_command("rm -rf custom_backend") + if "tensorrtllm_backend" in repo_tags: + run_command("rm -rf tensorrtllm_backend") + if "perf_analyzer" in repo_tags: + run_command("rm -rf perf_analyzer") # Return to previous working directory server/. os.chdir(server_abspath) From 6a697632aa5c40791ad2e040a9921ffc80c766bb Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Tue, 27 Aug 2024 09:34:47 -0700 Subject: [PATCH 17/44] build: RHEL8 EA2 Backends (#7568) * build: RHEL8 EA2 Backends --- build.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/build.py b/build.py index 2a9b2469fc..6901c2e201 100755 --- a/build.py +++ b/build.py @@ -116,7 +116,8 @@ def fail_if(p, msg): def target_platform(): - if FLAGS.target_platform is not None: + # When called by compose.py, FLAGS will be None + if FLAGS and FLAGS.target_platform is not None: return FLAGS.target_platform platform_string = platform.system().lower() if platform_string == "linux": @@ -132,7 +133,8 @@ def target_platform(): def target_machine(): - if FLAGS.target_machine is not None: + # When called by compose.py, FLAGS will be None + if FLAGS and FLAGS.target_machine is not None: return FLAGS.target_machine return platform.machine().lower() @@ -639,13 +641,16 @@ def pytorch_cmake_args(images): cmake_backend_arg("pytorch", "TRITON_PYTORCH_DOCKER_IMAGE", None, image), ] - if FLAGS.enable_gpu: + # TODO: TPRD-372 TorchTRT extension is not currently supported by our manylinux build + # TODO: TPRD-373 NVTX extension is not currently supported by our manylinux build + if target_platform() != "rhel": + if FLAGS.enable_gpu: + cargs.append( + cmake_backend_enable("pytorch", "TRITON_PYTORCH_ENABLE_TORCHTRT", True) + ) cargs.append( - cmake_backend_enable("pytorch", "TRITON_PYTORCH_ENABLE_TORCHTRT", True) + cmake_backend_enable("pytorch", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx) ) - cargs.append( - cmake_backend_enable("pytorch", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx) - ) return cargs @@ -1301,7 +1306,6 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach gpu_enabled=gpu_enabled ) - # This if target_platform() == "rhel": df += """ # Common dpeendencies. 
From 89641b693e2ea4762577209f468da38a0c88fa5e Mon Sep 17 00:00:00 2001 From: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com> Date: Tue, 27 Aug 2024 12:16:08 -0700 Subject: [PATCH 18/44] Release: Update NGC versions post-24.08 release (#7565) Co-authored-by: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Co-authored-by: Francesco Petrini --- Dockerfile.sdk | 2 +- Dockerfile.win10.min | 12 +++++------ README.md | 8 ++++---- TRITON_VERSION | 2 +- build.py | 12 +++++++---- deploy/aws/values.yaml | 2 +- deploy/fleetcommand/Chart.yaml | 2 +- deploy/fleetcommand/values.yaml | 6 +++--- deploy/gcp/values.yaml | 2 +- .../perf-analyzer-script/triton_client.yaml | 2 +- .../server-deployer/build_and_push.sh | 6 +++--- .../server-deployer/chart/triton/Chart.yaml | 4 ++-- .../server-deployer/chart/triton/values.yaml | 6 +++--- .../server-deployer/data-test/schema.yaml | 2 +- .../server-deployer/schema.yaml | 4 ++-- .../gke-marketplace-app/trt-engine/README.md | 6 +++--- deploy/k8s-onprem/values.yaml | 2 +- deploy/oci/values.yaml | 2 +- docs/customization_guide/build.md | 6 +++--- docs/customization_guide/compose.md | 18 ++++++++--------- docs/customization_guide/test.md | 2 +- docs/generate_docs.py | 4 ++-- docs/user_guide/custom_operations.md | 6 +++--- docs/user_guide/performance_tuning.md | 4 ++-- qa/L0_backend_python/test.sh | 6 +++--- qa/L0_batcher/test.sh | 4 ++-- qa/L0_grpc/test.sh | 2 +- qa/L0_http/test.sh | 2 +- qa/L0_infer/test.sh | 2 +- qa/L0_sequence_batcher/test.sh | 2 +- qa/L0_trt_plugin/test.sh | 4 ++-- qa/common/gen_jetson_trt_models | 2 +- qa/common/gen_qa_custom_ops | 2 +- qa/common/gen_qa_model_repository | 2 +- qa/common/util.sh | 20 +++++++++++++------ 35 files changed, 91 insertions(+), 79 deletions(-) diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 7897c2a215..c7a68fc6af 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -29,7 +29,7 @@ # # Base image on the minimum Triton container -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-py3-min +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.08-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min index 0a554fbcf4..29d2c2a43a 100644 --- a/Dockerfile.win10.min +++ b/Dockerfile.win10.min @@ -37,7 +37,7 @@ RUN choco install unzip -y # # Installing TensorRT # -ARG TENSORRT_VERSION=10.2.0.19 +ARG TENSORRT_VERSION=10.3.0.26 ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.5.zip" ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/zip/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5.zip # COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP} @@ -51,7 +51,7 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" # # Installing cuDNN # -ARG CUDNN_VERSION=9.2.1.18 +ARG CUDNN_VERSION=9.3.0.75 ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.2.1.18_cuda12-archive.zip ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} @@ -101,14 +101,14 @@ LABEL CMAKE_VERSION=${CMAKE_VERSION} # # Installing Visual Studio BuildTools: VS17 2022 # -ARG BUILDTOOLS_VERSION=17.9.34622.214 +ARG BUILDTOOLS_VERSION=17.10.35201.131 # Download collect.exe in case of an install failure. ADD https://aka.ms/vscollect.exe "C:\tmp\collect.exe" # Use the latest release channel. For more control, specify the location of an internal layout. # Download the Build Tools bootstrapper. 
# ARG BUILD_TOOLS_SOURCE=https://aka.ms/vs/17/release/vs_buildtools.exe -ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/5e7b923b-7d89-4e14-95b8-a84ab168e243/96b21d216c7954aaf606c6d7ba59a3de991884a8a86c578c767ba349c23188a9/vs_BuildTools.exe +ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/28626b4b-f88f-4b55-a0cf-f3eaa2c643fb/e6c43d4dfb36338d954cdb3ad9010ab2a479e712088f4f6b016eadcc721bab28/vs_BuildTools.exe ADD ${BUILD_TOOLS_SOURCE} vs_buildtools.exe # Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools workload, including recommended. ARG VS_INSTALL_PATH_WP="C:\BuildTools" @@ -175,7 +175,7 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" -ARG CUDNN_VERSION=9.2.1.18 +ARG CUDNN_VERSION=9.3.0.75 ENV CUDNN_VERSION ${CUDNN_VERSION} COPY --from=dependency_base /cudnn /cudnn RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." @@ -183,7 +183,7 @@ RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\." LABEL CUDNN_VERSION="${CUDNN_VERSION}" -ARG TENSORRT_VERSION=10.2.0.19 +ARG TENSORRT_VERSION=10.3.0.26 ENV TRT_VERSION ${TENSORRT_VERSION} COPY --from=dependency_base /TensorRT /TensorRT RUN setx PATH "c:\TensorRT\lib;%PATH%" diff --git a/README.md b/README.md index f9b1a483f3..da80cc3a2b 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ ___ ##### LATEST RELEASE You are currently on the `main` branch which tracks under-development progress towards the next release. -The current release is version [2.48.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.07 container release on NVIDIA GPU Cloud (NGC). +The current release is version [2.49.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.08 container release on NVIDIA GPU Cloud (NGC). Triton Inference Server is an open source inference serving software that streamlines AI inferencing. 
Triton enables teams to deploy any AI model from @@ -103,16 +103,16 @@ Inference Server with the ```bash # Step 1: Create the example model repository -git clone -b r24.07 https://github.com/triton-inference-server/server.git +git clone -b r24.08 https://github.com/triton-inference-server/server.git cd server/docs/examples ./fetch_models.sh # Step 2: Launch triton from the NGC Triton container -docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.07-py3 tritonserver --model-repository=/models +docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.08-py3 tritonserver --model-repository=/models # Step 3: Sending an Inference Request # In a separate console, launch the image_client example from the NGC Triton SDK container -docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.07-py3-sdk +docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.08-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg # Inference should return the following diff --git a/TRITON_VERSION b/TRITON_VERSION index 37433781ef..5db7ab5ba3 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.49.0dev \ No newline at end of file +2.50.0dev \ No newline at end of file diff --git a/build.py b/build.py index 6901c2e201..3195c50cbb 100755 --- a/build.py +++ b/build.py @@ -70,9 +70,9 @@ # incorrectly load the other version of the openvino libraries. # TRITON_VERSION_MAP = { - "2.49.0dev": ( - "24.08dev", # triton container - "24.07", # upstream container + "2.50.0dev": ( + "24.09dev", # triton container + "24.08", # upstream container "1.18.1", # ORT "2024.0.0", # ORT OpenVINO "2024.0.0", # Standalone OpenVINO @@ -216,6 +216,8 @@ def header(self, desc=None): self.comment("Exit script immediately if any command fails") if target_platform() == "windows": + self._file.write("$UseStructuredOutput = $false\n") + self.blankln() self._file.write("function ExitWithCode($exitcode) {\n") self._file.write(" $host.SetShouldExit($exitcode)\n") self._file.write(" exit $exitcode\n") @@ -660,7 +662,9 @@ def onnxruntime_cmake_args(images, library_paths): "onnxruntime", "TRITON_BUILD_ONNXRUNTIME_VERSION", None, - TRITON_VERSION_MAP[FLAGS.version][2], + os.getenv("TRITON_BUILD_ONNXRUNTIME_VERSION") + if os.getenv("TRITON_BUILD_ONNXRUNTIME_VERSION") + else TRITON_VERSION_MAP[FLAGS.version][2], ) ] diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml index 98151829c7..67ecba6c53 100644 --- a/deploy/aws/values.yaml +++ b/deploy/aws/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.08-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml index 340e19fb50..68aaf8f405 100644 --- a/deploy/fleetcommand/Chart.yaml +++ b/deploy/fleetcommand/Chart.yaml @@ -26,7 +26,7 @@ apiVersion: v1 # appVersion is the Triton version; update when changing release -appVersion: "2.48.0" +appVersion: "2.49.0" description: Triton Inference Server (Fleet Command) name: triton-inference-server # version is the Chart version; update when changing anything in the chart diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml index 7a556ef7df..f3036d5bee 100644 --- a/deploy/fleetcommand/values.yaml +++ b/deploy/fleetcommand/values.yaml @@ -27,7 
+27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.08-py3 pullPolicy: IfNotPresent numGpus: 1 serverCommand: tritonserver @@ -47,13 +47,13 @@ image: # # To set model control mode, uncomment and configure below # TODO: Fix the following url, it is invalid - # See https://github.com/triton-inference-server/server/blob/r24.07/docs/model_management.md + # See https://github.com/triton-inference-server/server/blob/r24.08/docs/model_management.md # for more details #- --model-control-mode=explicit|poll|none # # Additional server args # - # see https://github.com/triton-inference-server/server/blob/r24.07/README.md + # see https://github.com/triton-inference-server/server/blob/r24.08/README.md # for more details service: diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml index 937acc6b80..c25bcf58ce 100644 --- a/deploy/gcp/values.yaml +++ b/deploy/gcp/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.08-py3 pullPolicy: IfNotPresent modelRepositoryPath: gs://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml index 21e5a34077..4b896a1ac7 100644 --- a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml +++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml @@ -33,7 +33,7 @@ metadata: namespace: default spec: containers: - - image: nvcr.io/nvidia/tritonserver:24.07-py3-sdk + - image: nvcr.io/nvidia/tritonserver:24.08-py3-sdk imagePullPolicy: Always name: nv-triton-client securityContext: diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh index e4fe8fe04f..cc5fa998b4 100755 --- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh +++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh @@ -27,9 +27,9 @@ export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/') export APP_NAME=tritonserver -export MAJOR_VERSION=2.48 -export MINOR_VERSION=2.48.0 -export NGC_VERSION=24.07-py3 +export MAJOR_VERSION=2.49 +export MINOR_VERSION=2.49.0 +export NGC_VERSION=24.08-py3 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml index e2b00ad12b..41e2e8cdb9 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
apiVersion: v1 -appVersion: "2.48" +appVersion: "2.49" description: Triton Inference Server name: triton-inference-server -version: 2.48.0 +version: 2.49.0 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml index 3d460f8aa0..7f8a96608f 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml @@ -31,14 +31,14 @@ maxReplicaCount: 3 tritonProtocol: HTTP # HPA GPU utilization autoscaling target HPATargetAverageValue: 85 -modelRepositoryPath: gs://triton_sample_models/24.07 -publishedVersion: '2.48.0' +modelRepositoryPath: gs://triton_sample_models/24.08 +publishedVersion: '2.49.0' gcpMarketplace: true image: registry: gcr.io repository: nvidia-ngc-public/tritonserver - tag: 24.07-py3 + tag: 24.08-py3 pullPolicy: IfNotPresent # modify the model repository here to match your GCP storage bucket numGpus: 1 diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml index 0ecf429a44..356b3cce0f 100644 --- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.48.0' + publishedVersion: '2.49.0' publishedVersionMetadata: releaseNote: >- Initial release. diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml index c82f73e47f..076f62e05b 100644 --- a/deploy/gke-marketplace-app/server-deployer/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.48.0' + publishedVersion: '2.49.0' publishedVersionMetadata: releaseNote: >- Initial release. @@ -89,7 +89,7 @@ properties: modelRepositoryPath: type: string title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. - default: gs://triton_sample_models/24.07 + default: gs://triton_sample_models/24.08 image.ldPreloadPath: type: string title: Leave this empty by default. Triton allows users to create custom layers for backend such as TensorRT plugin or Tensorflow custom ops, the compiled shared library must be provided via LD_PRELOAD environment variable. 
diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md index 22343d966d..aa8fa2a399 100644 --- a/deploy/gke-marketplace-app/trt-engine/README.md +++ b/deploy/gke-marketplace-app/trt-engine/README.md @@ -33,7 +33,7 @@ ``` docker run --gpus all -it --network host \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ - -v ~:/scripts nvcr.io/nvidia/tensorrt:24.07-py3 + -v ~:/scripts nvcr.io/nvidia/tensorrt:24.08-py3 pip install onnx six torch tf2onnx tensorflow @@ -57,7 +57,7 @@ mkdir -p engines python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh -gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.07/bert/1/model.plan +gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.08/bert/1/model.plan ``` -For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.07/` should be updated accordingly with the correct version. +For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.08/` should be updated accordingly with the correct version. diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml index 9366a0710c..f0f28b68e1 100644 --- a/deploy/k8s-onprem/values.yaml +++ b/deploy/k8s-onprem/values.yaml @@ -29,7 +29,7 @@ tags: loadBalancing: true image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.08-py3 pullPolicy: IfNotPresent modelRepositoryServer: < Replace with the IP Address of your file server > modelRepositoryPath: /srv/models diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml index 3a85e7901b..bf83490db4 100644 --- a/deploy/oci/values.yaml +++ b/deploy/oci/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.08-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository numGpus: 1 diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index 0c1cc08a41..43160f43bb 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -173,7 +173,7 @@ $ ./build.py ... --repo-tag=common: --repo-tag=core:` will default to the branch name. For example, if you are building on the -r24.07 branch, `` will default to r24.07. If you are +r24.08 branch, `` will default to r24.08. If you are building on any other branch (including the *main* branch) then `` will default to "main". Therefore, you typically do not need to provide `` at all (nor the preceding @@ -334,8 +334,8 @@ python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild If you are building on *main* branch then `` will default to "main". If you are building on a release branch then `` will default to the branch name. For example, if you -are building on the r24.07 branch, `` will default to -r24.07. Therefore, you typically do not need to provide `` will default to +r24.08. Therefore, you typically do not need to provide `` at all (nor the preceding colon). 
You can use a different `` for a component to instead use the corresponding branch/tag in the build. For example, if you have a branch called diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md index ca3aafdbd0..563061c317 100644 --- a/docs/customization_guide/compose.md +++ b/docs/customization_guide/compose.md @@ -46,8 +46,8 @@ The `compose.py` script can be found in the Simply clone the repository and run `compose.py` to create a custom container. Note: Created container version will depend on the branch that was cloned. For example branch - [r24.07](https://github.com/triton-inference-server/server/tree/r24.07) -should be used to create a image based on the NGC 24.07 Triton release. + [r24.08](https://github.com/triton-inference-server/server/tree/r24.08) +should be used to create a image based on the NGC 24.08 Triton release. `compose.py` provides `--backend`, `--repoagent` options that allow you to specify which backends and repository agents to include in the custom image. @@ -79,20 +79,20 @@ For example, running ``` python3 compose.py --backend pytorch --repoagent checksum ``` -on branch [r24.07](https://github.com/triton-inference-server/server/tree/r24.07) pulls: -- `min` container `nvcr.io/nvidia/tritonserver:24.07-py3-min` -- `full` container `nvcr.io/nvidia/tritonserver:24.07-py3` +on branch [r24.08](https://github.com/triton-inference-server/server/tree/r24.08) pulls: +- `min` container `nvcr.io/nvidia/tritonserver:24.08-py3-min` +- `full` container `nvcr.io/nvidia/tritonserver:24.08-py3` Alternatively, users can specify the version of Triton container to pull from any branch by either: 1. Adding flag `--container-version ` to branch ``` -python3 compose.py --backend pytorch --repoagent checksum --container-version 24.07 +python3 compose.py --backend pytorch --repoagent checksum --container-version 24.08 ``` 2. Specifying `--image min, --image full,`. The user is responsible for specifying compatible `min` and `full` containers. ``` -python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.07-py3-min --image full,nvcr.io/nvidia/tritonserver:24.07-py3 +python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.08-py3-min --image full,nvcr.io/nvidia/tritonserver:24.08-py3 ``` Method 1 and 2 will result in the same composed container. Furthermore, `--image` flag overrides the `--container-version` flag when both are specified. @@ -103,8 +103,8 @@ Note: 2. vLLM and TensorRT-LLM backends are currently not supported backends for `compose.py`. If you want to build additional backends on top of these backends, it would be better to [build it yourself](#build-it-yourself) by using -`nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3` or -`nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` as a `min` container. +`nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3` or +`nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3` as a `min` container. ### CPU-only container composition diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md index e066d31493..898267e34f 100644 --- a/docs/customization_guide/test.md +++ b/docs/customization_guide/test.md @@ -49,7 +49,7 @@ $ ./gen_qa_custom_ops ``` This will create multiple model repositories in /tmp/\/qa_* -(for example /tmp/24.07/qa_model_repository). The TensorRT models +(for example /tmp/24.08/qa_model_repository). 
The TensorRT models will be created for the GPU on the system that CUDA considers device 0 (zero). If you have multiple GPUs on your system see the documentation in the scripts for how to target a specific GPU. diff --git a/docs/generate_docs.py b/docs/generate_docs.py index acf0afc981..3cb9de4bc6 100755 --- a/docs/generate_docs.py +++ b/docs/generate_docs.py @@ -43,11 +43,11 @@ """ TODO: Needs to handle cross-branch linkage. -For example, server/docs/user_guide/architecture.md on branch 24.07 links to +For example, server/docs/user_guide/architecture.md on branch 24.08 links to server/docs/user_guide/model_analyzer.md on main branch. In this case, the hyperlink of model_analyzer.md should be a URL instead of relative path. -Another example can be server/docs/user_guide/model_analyzer.md on branch 24.07 +Another example can be server/docs/user_guide/model_analyzer.md on branch 24.08 links to a file in server repo with relative path. Currently all URLs are hardcoded to main branch. We need to make sure that the URL actually points to the correct branch. We also need to handle cases like deprecated or removed files from diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md index 136edd180f..6fa3cee3dc 100644 --- a/docs/user_guide/custom_operations.md +++ b/docs/user_guide/custom_operations.md @@ -64,7 +64,7 @@ simple way to ensure you are using the correct version of TensorRT is to use the [NGC TensorRT container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the TensorRT +the 24.08 version of Triton, use the 24.08 version of the TensorRT container. ## TensorFlow @@ -123,7 +123,7 @@ simple way to ensure you are using the correct version of TensorFlow is to use the [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the TensorFlow +the 24.08 version of Triton, use the 24.08 version of the TensorFlow container. ## PyTorch @@ -167,7 +167,7 @@ simple way to ensure you are using the correct version of PyTorch is to use the [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the PyTorch +the 24.08 version of Triton, use the 24.08 version of the PyTorch container. ## ONNX diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index 446534da99..70e76cd5ef 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -235,7 +235,7 @@ with a `tritonserver` binary. 
```bash # Start server container -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.07-py3 +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.08-py3 # Start serving your models tritonserver --model-repository=/mnt/models @@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u ```bash # Start the SDK container interactively -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.07-py3-sdk +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.08-py3-sdk # Benchmark model being served from step 3 perf_analyzer -m densenet_onnx --concurrency-range 1:4 diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh index f6d4b7b445..324ee5ba1f 100755 --- a/qa/L0_backend_python/test.sh +++ b/qa/L0_backend_python/test.sh @@ -39,18 +39,18 @@ fi # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. export TEST_WINDOWS=0 -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then export DATADIR=${DATADIR:="/c/data/inferenceserver/${REPO_VERSION}"} export TRITON_DIR=${TRITON_DIR:=c:/tritonserver} # This will run in WSL, but Triton will run in windows, so environment # variables meant for loaded models must be exported using WSLENV. # The /w flag indicates the value should only be included when invoking # Win32 from WSL. - export WSLENV=TRITON_DIR/w + export WSLENV=TRITON_DIR export SERVER=${SERVER:=c:/tritonserver/bin/tritonserver.exe} export BACKEND_DIR=${BACKEND_DIR:=c:/tritonserver/backends} export MODELDIR=${MODELDIR:=c:/} - TEST_WINDOWS=1 + export TEST_WINDOWS=1 else export DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} export TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} diff --git a/qa/L0_batcher/test.sh b/qa/L0_batcher/test.sh index 827751eb40..7043aab2a5 100755 --- a/qa/L0_batcher/test.sh +++ b/qa/L0_batcher/test.sh @@ -79,7 +79,7 @@ TF_VERSION=${TF_VERSION:=2} # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} @@ -601,7 +601,7 @@ done TEST_CASE=test_multi_batch_preserve_ordering # Skip test for Windows. Trace file concats at 8192 chars on Windows. -if [[ "$(< /proc/sys/kernel/osrelease)" != *microsoft* ]]; then +if [[ ! -v WSL_DISTRO_NAME ]] || [[ ! -v MSYSTEM ]]; then rm -fr ./custom_models && mkdir ./custom_models && \ cp -r ../custom_models/custom_zero_1_float32 ./custom_models/. && \ mkdir -p ./custom_models/custom_zero_1_float32/1 diff --git a/qa/L0_grpc/test.sh b/qa/L0_grpc/test.sh index 50cf5a6f91..93d22e75be 100755 --- a/qa/L0_grpc/test.sh +++ b/qa/L0_grpc/test.sh @@ -48,7 +48,7 @@ NGINX_CONF="./nginx.conf" # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. 
-if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then SDKDIR=${SDKDIR:=C:/sdk} MODELDIR=${MODELDIR:=C:/models} CLIENT_PLUGIN_MODELDIR=${MODELDIR:=C:/client_plugin_models} diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh index 321c398995..81ae4c254c 100755 --- a/qa/L0_http/test.sh +++ b/qa/L0_http/test.sh @@ -49,7 +49,7 @@ NGINX_CONF="./nginx.conf" # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then SDKDIR=${SDKDIR:=C:/sdk} MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} diff --git a/qa/L0_infer/test.sh b/qa/L0_infer/test.sh index dba4d7dbcc..36f63053e3 100755 --- a/qa/L0_infer/test.sh +++ b/qa/L0_infer/test.sh @@ -87,7 +87,7 @@ DEFAULT_SHM_SIZE_BYTES=$((1024*1024*$DEFAULT_SHM_SIZE_MB)) # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh index d91b433966..23ee387b55 100755 --- a/qa/L0_sequence_batcher/test.sh +++ b/qa/L0_sequence_batcher/test.sh @@ -93,7 +93,7 @@ TF_VERSION=${TF_VERSION:=2} # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. WINDOWS=0 -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} diff --git a/qa/L0_trt_plugin/test.sh b/qa/L0_trt_plugin/test.sh index 075dd54eab..a9d04331f0 100755 --- a/qa/L0_trt_plugin/test.sh +++ b/qa/L0_trt_plugin/test.sh @@ -47,7 +47,7 @@ PLUGIN_TEST=trt_plugin_test.py # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} MODELDIR=${MODELDIR:=C:/models} CUSTOMPLUGIN=${CUSTOMPLUGIN:=$MODELDIR/HardmaxPlugin.dll} @@ -135,7 +135,7 @@ SERVER_LD_PRELOAD=$CUSTOMPLUGIN SERVER_ARGS=$SERVER_ARGS_BASE SERVER_LOG="./inference_server_$LOG_IDX.log" -if [[ "$(< /proc/sys/kernel/osrelease)" != *microsoft* ]]; then +if [[ ! -v WSL_DISTRO_NAME ]] || [[ ! 
-v MSYSTEM ]]; then run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 99a6175a08..173d8c1efc 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -34,7 +34,7 @@ # Make all generated files accessible outside of container umask 0000 # Set the version of the models -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.08} # Set the CUDA device to use CUDA_DEVICE=${RUNNER_ID:=0} # Set TensorRT image diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops index 4ae0f006b3..c72ea49ca1 100755 --- a/qa/common/gen_qa_custom_ops +++ b/qa/common/gen_qa_custom_ops @@ -37,7 +37,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.08} NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION} TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$NVIDIA_UPSTREAM_VERSION-tf2-py3} PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3} diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index cab497aa86..be9383217b 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -48,7 +48,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.08} # ONNX. Use ONNX_OPSET 0 to use the default for ONNX version ONNX_VERSION=1.13.0 diff --git a/qa/common/util.sh b/qa/common/util.sh index 3297dd2914..3874916573 100755 --- a/qa/common/util.sh +++ b/qa/common/util.sh @@ -257,7 +257,7 @@ function run_server_nowait () { return fi - if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then + if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then # LD_PRELOAD not yet supported on windows if [ -z "$SERVER_LD_PRELOAD" ]; then echo "=== Running $SERVER $SERVER_ARGS" @@ -329,7 +329,7 @@ function kill_server () { # causes the entire WSL shell to just exit. So instead we must use # taskkill.exe which can only forcefully kill tritonserver which # means that it does not gracefully exit. - if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then + if [[ -v WSL_DISTRO_NAME ]]; then # Disable -x as it makes output below hard to read oldstate="$(set +o)"; [[ -o errexit ]] && oldstate="$oldstate; set -e" set +x @@ -353,6 +353,8 @@ function kill_server () { fi set +vx; eval "$oldstate" + elif [[ -v MSYSTEM ]] ; then + taskkill //F //IM tritonserver.exe else # Non-windows... kill $SERVER_PID @@ -512,17 +514,23 @@ remove_array_outliers() { function setup_virtualenv() { # Create and activate virtual environment - virtualenv --system-site-packages venv - source venv/bin/activate - pip install pytest + if [[ -v MSYSTEM ]]; then + pip3 install pytest + else + virtualenv --system-site-packages venv + source venv/bin/activate + pip install pytest + fi if [[ ${TEST_WINDOWS} == 1 ]]; then - pip3 install "numpy<2" tritonclient[all] + pip3 install "numpy<2" tritonclient[all] fi } function deactivate_virtualenv() { # Deactivate virtual environment and clean up + if [[ ! 
-v MSYSTEM ]]; then deactivate rm -fr venv + fi } From 96144e0a0c51a09122229c885ac84b2756ba0248 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Tue, 27 Aug 2024 12:28:47 -0700 Subject: [PATCH 19/44] docs: Add python backend to windows build command (#7572) --- docs/customization_guide/build.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index 43160f43bb..f0f3bd99e2 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -328,7 +328,7 @@ and so you must enable them explicitly. The following build.py invocation builds all features and backends available on windows. ```bash -python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild --no-container-pull --image=base,win10-py3-min --enable-logging --enable-stats --enable-tracing --enable-gpu --endpoint=grpc --endpoint=http --repo-tag=common: --repo-tag=core: --repo-tag=backend: --repo-tag=thirdparty: --backend=ensemble --backend=tensorrt: --backend=onnxruntime: --backend=openvino: +python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild --no-container-pull --image=base,win10-py3-min --enable-logging --enable-stats --enable-tracing --enable-gpu --endpoint=grpc --endpoint=http --repo-tag=common: --repo-tag=core: --repo-tag=backend: --repo-tag=thirdparty: --backend=ensemble --backend=tensorrt: --backend=onnxruntime: --backend=openvino: --backend=python: ``` If you are building on *main* branch then `` will From 4ef45da2e468130ba774000416a285082d5d88f4 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Tue, 27 Aug 2024 12:30:23 -0700 Subject: [PATCH 20/44] docs: Triton TRT-LLM user guide (#7529) --- docs/getting_started/trtllm_user_guide.md | 118 ++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 docs/getting_started/trtllm_user_guide.md diff --git a/docs/getting_started/trtllm_user_guide.md b/docs/getting_started/trtllm_user_guide.md new file mode 100644 index 0000000000..7f128e98c7 --- /dev/null +++ b/docs/getting_started/trtllm_user_guide.md @@ -0,0 +1,118 @@ + + +# TensorRT-LLM User Guide + +## What is TensorRT-LLM + +[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) +(TRT-LLM) is an open-source library designed to accelerate and optimize the +inference performance of large language models (LLMs) on NVIDIA GPUs. TRT-LLM +offers users an easy-to-use Python API to build TensorRT engines for LLMs, +incorporating state-of-the-art optimizations to ensure efficient inference on +NVIDIA GPUs. + +## How to run TRT-LLM models with Triton Server via TensorRT-LLM backend + +The +[TensorRT-LLM Backend](https://github.com/triton-inference-server/tensorrtllm_backend) +lets you serve TensorRT-LLM models with Triton Inference Server. Check out the +[Getting Started](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#getting-started) +section in the TensorRT-LLM Backend repo to learn how to utlize the +[NGC Triton TRT-LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) +to prepare engines for your LLM models and serve them with Triton. + +## How to use your custom TRT-LLM model + +All the supported models can be found in the +[examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) folder in +the TRT-LLM repo. Follow the examples to convert your models to TensorRT +engines. 
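+
+As a rough sketch only (the model name, paths, and flags below are illustrative
+placeholders; the exact conversion script and build options depend on the model
+and the TRT-LLM version, so follow the README in the corresponding example
+folder), the convert-and-build flow generally looks like this:
+
+```bash
+# Convert a Hugging Face checkpoint into the TRT-LLM checkpoint format.
+# Each model's example folder ships its own convert_checkpoint.py.
+python3 examples/llama/convert_checkpoint.py \
+    --model_dir ./llama-hf-model \
+    --output_dir ./trtllm_checkpoint \
+    --dtype float16
+
+# Build the TensorRT engine(s) from the converted checkpoint.
+trtllm-build \
+    --checkpoint_dir ./trtllm_checkpoint \
+    --output_dir ./engines \
+    --gemm_plugin float16
+```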
+
+
+After the engine is built, [prepare the model repository](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#prepare-the-model-repository)
+for Triton, and
+[modify the model configuration](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#modify-the-model-configuration).
+
+Only the *mandatory parameters* need to be set in the model config file. Feel free
+to modify the optional parameters as needed. To learn more about the
+parameters, model inputs, and outputs, see the
+[model config documentation](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/model_config.md).
+
+## Advanced Configuration Options and Deployment Strategies
+
+Explore advanced configuration options and deployment strategies to optimize
+and run Triton with your TRT-LLM models effectively:
+
+- [Model Deployment](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#model-deployment): Techniques for efficiently deploying and managing your models in various environments.
+- [Multi-Instance GPU (MIG) Support](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#mig-support): Run Triton and TRT-LLM models with MIG to optimize GPU resource management.
+- [Scheduling](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#scheduling): Configure scheduling policies to control how requests are managed and executed.
+- [Key-Value Cache](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#key-value-cache): Utilize KV cache and KV cache reuse to optimize memory usage and improve performance.
+- [Decoding](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#decoding): Advanced methods for generating text, including top-k, top-p, top-k top-p, beam search, Medusa, and speculative decoding.
+- [Chunked Context](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#chunked-context): Splitting the context into several chunks and batching them during the generation phase to increase overall throughput.
+- [Quantization](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#quantization): Apply quantization techniques to reduce model size and enhance inference speed.
+- [LoRA (Low-Rank Adaptation)](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#lora): Use LoRA for efficient model fine-tuning and adaptation.
+
+## Tutorials
+
+Make sure to check out the
+[tutorials](https://github.com/triton-inference-server/tutorials) repo to see
+more guides on serving popular LLM models with Triton Server and TensorRT-LLM,
+as well as deploying them on Kubernetes.
+
+## Benchmark
+
+[GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf)
+is a command line tool for measuring the throughput and latency of LLMs served
+by Triton Inference Server. Check out the
+[Quick Start](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf#quick-start)
+to learn how to use GenAI-Perf to benchmark your LLM models.
+
+## Performance Best Practices
+
+Check out the
+[Performance Best Practices guide](https://nvidia.github.io/TensorRT-LLM/performance/perf-best-practices.html)
+to learn how to optimize your TensorRT-LLM models for better performance.
+ +## Metrics + +Triton Server provides +[metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) +indicating GPU and request statistics. +See the +[Triton Metrics](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#triton-metrics) +section in the TensorRT-LLM Backend repo to learn how to query the Triton +metrics endpoint to obtain TRT-LLM statistics. + +## Ask questions or report issues + +Can't find what you're looking for, or have a question or issue? Feel free to +ask questions or report issues in the GitHub issues page: + +- [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/issues) +- [TensorRT-LLM Backend](https://github.com/triton-inference-server/tensorrtllm_backend/issues) +- [Triton Inference Server](https://github.com/triton-inference-server/server/issues) From 9ec820b0f0b8b063589ecfd140cdb8c5ea45a035 Mon Sep 17 00:00:00 2001 From: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:07:58 -0700 Subject: [PATCH 21/44] Build: Updating to allow passing DOCKER_GPU_ARGS at model generation (#7566) --- qa/common/gen_jetson_trt_models | 4 +--- qa/common/gen_qa_custom_ops | 2 +- qa/common/gen_qa_model_repository | 9 ++++++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 173d8c1efc..892b8dd383 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -48,9 +48,7 @@ HOST_MODEL_DIR=${HOST_MODEL_DIR:="${HOST_BUILD_DIR}/${TRITON_VERSION}"} HOST_SOURCE_DIR=$HOST_BUILD_DIR/gen_srcdir # Set CI specific parameters -DOCKER_GPU_ARGS="${DOCKER_GPU_ARGS:="--gpus device=$CUDA_DEVICE"}" -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) - +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} # Set model output directories diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops index c72ea49ca1..8864da69f5 100755 --- a/qa/common/gen_qa_custom_ops +++ b/qa/common/gen_qa_custom_ops @@ -44,7 +44,7 @@ PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-p CUDA_DEVICE=${NV_GPU:=0} -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) || DOCKER_GPU_ARGS="--gpus device=$CUDA_DEVICE" +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} ### HOST_BUILD_DIR=${HOST_BUILD_DIR:=/tmp} diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index be9383217b..900b8fdc03 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -63,7 +63,8 @@ TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$TRITON_VERSION-t TENSORRT_IMAGE=${TENSORRT_IMAGE:=nvcr.io/nvidia/tensorrt:$TRITON_VERSION-py3} CUDA_DEVICE=${NV_GPU:=0} -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) || DOCKER_GPU_ARGS="--gpus device=$CUDA_DEVICE" +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} +MODEL_TYPE=${MODEL_TYPE:-""} ### HOST_BUILD_DIR=${HOST_BUILD_DIR:=/tmp} @@ -360,8 +361,10 @@ python3 $SRCDIR/gen_qa_implicit_models.py --libtorch --variable --models_dir=$VA chmod -R 777 $VARIMPLICITSEQDESTDIR python3 $SRCDIR/gen_qa_dyna_sequence_models.py --libtorch --models_dir=$DYNASEQDESTDIR chmod -R 777 $DYNASEQDESTDIR 
-python3 $SRCDIR/gen_qa_torchtrt_models.py --models_dir=$TORCHTRTDESTDIR -chmod -R 777 $TORCHTRTDESTDIR +if [ -z "$MODEL_TYPE" ] || [ "$MODEL_TYPE" != "igpu" ]; then + python3 $SRCDIR/gen_qa_torchtrt_models.py --models_dir=$TORCHTRTDESTDIR + chmod -R 777 $TORCHTRTDESTDIR +fi python3 $SRCDIR/gen_qa_ragged_models.py --libtorch --models_dir=$RAGGEDDESTDIR chmod -R 777 $RAGGEDDESTDIR # Export torchvision image models to ONNX From f6021f7c48f58f7e299027e7479fb39cabfba36d Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Fri, 30 Aug 2024 18:23:03 -0500 Subject: [PATCH 22/44] feat: Python Deployment of Triton Inference Server (#7501) Co-authored-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Co-authored-by: Ryan McCormick Co-authored-by: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> --- Dockerfile.QA | 4 + docs/customization_guide/tritonfrontend.md | 113 ++++++++ qa/L0_python_api/test.sh | 11 +- qa/L0_python_api/test_kserve.py | 252 ++++++++++++++++++ .../delayed_identity/1/model.py | 51 ++++ .../delayed_identity/config.pbtxt | 52 ++++ .../test_model_repository/identity/1/model.py | 49 ++++ .../identity/config.pbtxt | 44 +++ qa/L0_python_api/testing_utils.py | 95 +++++++ src/CMakeLists.txt | 6 +- src/common.h | 60 +++++ src/grpc/grpc_server.cc | 95 +++++++ src/grpc/grpc_server.h | 18 ++ src/grpc/infer_handler.cc | 20 +- src/http_server.cc | 38 ++- src/http_server.h | 8 + src/python/CMakeLists.txt | 78 ++++++ src/python/build_wheel.py | 135 ++++++++++ src/python/examples/example.py | 85 ++++++ .../1/model.savedmodel/saved_model.pb | Bin 0 -> 531 bytes .../identity/config.pbtxt | 44 +++ src/python/setup.py | 112 ++++++++ src/python/tritonfrontend/CMakeLists.txt | 181 +++++++++++++ src/python/tritonfrontend/__init__.py | 33 +++ src/python/tritonfrontend/__init__.pyi | 1 + src/python/tritonfrontend/_api/__init__.py | 25 ++ .../tritonfrontend/_api/_error_mapping.py | 48 ++++ src/python/tritonfrontend/_api/_kservegrpc.py | 137 ++++++++++ .../tritonfrontend/_api/_kservegrpc.pyi | 74 +++++ src/python/tritonfrontend/_api/_kservehttp.py | 97 +++++++ .../tritonfrontend/_api/_kservehttp.pyi | 49 ++++ src/python/tritonfrontend/_c/__init__.py | 27 ++ src/python/tritonfrontend/_c/__init__.pyi | 27 ++ src/python/tritonfrontend/_c/tritonfrontend.h | 139 ++++++++++ .../_c/tritonfrontend_bindings.pyi | 44 +++ .../_c/tritonfrontend_pybind.cc | 76 ++++++ src/python/tritonfrontend/py.typed | 0 37 files changed, 2315 insertions(+), 13 deletions(-) create mode 100644 docs/customization_guide/tritonfrontend.md create mode 100644 qa/L0_python_api/test_kserve.py create mode 100644 qa/L0_python_api/test_model_repository/delayed_identity/1/model.py create mode 100644 qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt create mode 100644 qa/L0_python_api/test_model_repository/identity/1/model.py create mode 100644 qa/L0_python_api/test_model_repository/identity/config.pbtxt create mode 100644 qa/L0_python_api/testing_utils.py create mode 100644 src/python/CMakeLists.txt create mode 100755 src/python/build_wheel.py create mode 100644 src/python/examples/example.py create mode 100755 src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb create mode 100644 src/python/examples/example_model_repository/identity/config.pbtxt create mode 100755 src/python/setup.py create mode 100644 src/python/tritonfrontend/CMakeLists.txt create mode 100644 src/python/tritonfrontend/__init__.py create mode 100644 
src/python/tritonfrontend/__init__.pyi create mode 100644 src/python/tritonfrontend/_api/__init__.py create mode 100644 src/python/tritonfrontend/_api/_error_mapping.py create mode 100644 src/python/tritonfrontend/_api/_kservegrpc.py create mode 100644 src/python/tritonfrontend/_api/_kservegrpc.pyi create mode 100644 src/python/tritonfrontend/_api/_kservehttp.py create mode 100644 src/python/tritonfrontend/_api/_kservehttp.pyi create mode 100644 src/python/tritonfrontend/_c/__init__.py create mode 100644 src/python/tritonfrontend/_c/__init__.pyi create mode 100644 src/python/tritonfrontend/_c/tritonfrontend.h create mode 100644 src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi create mode 100644 src/python/tritonfrontend/_c/tritonfrontend_pybind.cc create mode 100644 src/python/tritonfrontend/py.typed diff --git a/Dockerfile.QA b/Dockerfile.QA index b381abfaaf..68ab519b41 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -390,6 +390,10 @@ RUN rm -fr qa/L0_copyrights qa/L0_build_variants && \ RUN find qa/pkgs/ -maxdepth 1 -type f -name \ "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade {}[all] +# Install Triton Frontend Python API +RUN find qa/pkgs/ -type f -name \ + "tritonfrontend-*.whl" | xargs -I {} pip3 install --upgrade {}[all] + ENV LD_LIBRARY_PATH /opt/tritonserver/qa/clients:${LD_LIBRARY_PATH} # DLIS-3631: Needed to run Perf Analyzer CI tests correctly diff --git a/docs/customization_guide/tritonfrontend.md b/docs/customization_guide/tritonfrontend.md new file mode 100644 index 0000000000..caaac9308d --- /dev/null +++ b/docs/customization_guide/tritonfrontend.md @@ -0,0 +1,113 @@ +### Triton Server (tritonfrontend) Bindings + +The `tritonfrontend` python package is a set of bindings to Triton's existing frontends implemented in C++. Currently, `tritonfrontend` supports starting up `KServeHttp` and `KServeGrpc` frontends. These bindings used in-combination with Triton's Python In-Process API ([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library) extend the ability to use Triton's full feature set with a couple of lines of Python. + +Let us walk through a simple example: +1. First we need to load the desired models and start the server with `tritonserver`. +```python +import tritonserver + +# Constructing path to Model Repository +model_path = f"server/src/python/examples/example_model_repository" + +server_options = tritonserver.Options( + server_id="ExampleServer", + model_repository=model_path, + log_error=True, + log_warn=True, + log_info=True, +) +server = tritonserver.Server(server_options).start(wait_until_ready=True) +``` +Note: `model_path` may need to be edited depending on your setup. + + +2. Now, to start up the respective services with `tritonfrontend` +```python +from tritonfrontend import KServeHttp, KServeGrpc +http_options = KServeHttp.Options(thread_count=5) +http_service = KServeHttp.Server(server, http_options) +http_service.start() + +# Default options (if none provided) +grpc_service = KServeGrpc.Server(server) +grpc_service.start() +``` + +3. Finally, with running services, we can use `tritonclient` or simple `curl` commands to send requests and receive responses from the frontends. 
+ +```python +import tritonclient.http as httpclient +import numpy as np # Use version numpy < 2 +model_name = "identity" # output == input +url = "localhost:8000" + +# Create a Triton client +client = httpclient.InferenceServerClient(url=url) + +# Prepare input data +input_data = np.array([["Roger Roger"]], dtype=object) + +# Create input and output objects +inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + +# Set the data for the input tensor +inputs[0].set_data_from_numpy(input_data) + +results = client.infer(model_name, inputs=inputs) + +# Get the output data +output_data = results.as_numpy("OUTPUT0") + +# Print results +print("[INFERENCE RESULTS]") +print("Output data:", output_data) + +# Stop respective services and server. +http_service.stop() +grpc_service.stop() +server.stop() +``` + +--- + +Additionally, `tritonfrontend` provides context manager support as well. So steps 2-3, could also be achieved through: +```python +from tritonfrontend import KServeHttp +import tritonclient.http as httpclient +import numpy as np # Use version numpy < 2 + +with KServeHttp.Server(server) as http_service: + # The identity model returns an exact duplicate of the input data as output + model_name = "identity" + url = "localhost:8000" + # Create a Triton client + with httpclient.InferenceServerClient(url=url) as client: + # Prepare input data + input_data = np.array(["Roger Roger"], dtype=object) + # Create input and output objects + inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + # Perform inference + results = client.infer(model_name, inputs=inputs) + # Get the output data + output_data = results.as_numpy("OUTPUT0") + # Print results + print("[INFERENCE RESULTS]") + print("Output data:", output_data) + +server.stop() +``` +With this workflow, you can avoid having to stop each service after client requests have terminated. + + +## Known Issues +- The following features are not currently supported when launching the Triton frontend services through the python bindings: + - [Tracing](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/trace.md) + - [Shared Memory](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_shared_memory.md) + - [Metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) + - [Restricted Protocols](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#limit-endpoint-access-beta) + - VertexAI + - Sagemaker +- After a running server has been stopped, if the client sends an inference request, a Segmentation Fault will occur. \ No newline at end of file diff --git a/qa/L0_python_api/test.sh b/qa/L0_python_api/test.sh index 6dc7206fe3..0d87d16771 100755 --- a/qa/L0_python_api/test.sh +++ b/qa/L0_python_api/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -49,6 +49,15 @@ if [ $? -ne 0 ]; then RET=1 fi + +FRONTEND_TEST_LOG="./python_kserve.log" +python -m pytest --junitxml=test_kserve.xml test_kserve.py > $FRONTEND_TEST_LOG 2>&1 +if [ $? 
-ne 0 ]; then + cat $FRONTEND_TEST_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + set -e if [ $RET -eq 0 ]; then diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py new file mode 100644 index 0000000000..ab77783d0c --- /dev/null +++ b/qa/L0_python_api/test_kserve.py @@ -0,0 +1,252 @@ +import time +from functools import partial + +import numpy as np +import pytest +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +import tritonserver +from testing_utils import ( + send_and_test_inference_identity, + setup_client, + setup_server, + setup_service, + teardown_client, + teardown_server, + teardown_service, +) +from tritonclient.utils import InferenceServerException +from tritonfrontend import KServeGrpc, KServeHttp + + +class TestHttpOptions: + def test_correct_http_parameters(self): + KServeHttp.Options( + address="0.0.0.1", port=8080, reuse_port=True, thread_count=16 + ) + + def test_wrong_http_parameters(self): + # Out of range + with pytest.raises(Exception): + KServeHttp.Options(port=-15) + with pytest.raises(Exception): + KServeHttp.Options(thread_count=-5) + + # Wrong data type + with pytest.raises(Exception): + KServeHttp.Options(header_forward_pattern=10) + + +class TestGrpcOptions: + def test_correct_grpc_parameters(self): + KServeGrpc.Options( + infer_compression_level=KServeGrpc.Grpc_compression_level.HIGH, + reuse_port=True, + infer_allocation_pool_size=12, + http2_max_pings_without_data=10, + ) + + def test_wrong_grpc_parameters(self): + # Out of Range + with pytest.raises(Exception): + KServeGrpc.Options(port=-5) + with pytest.raises(Exception): + KServeGrpc.Options(keepalive_timeout_ms=-20_000) + + # Wrong data type + with pytest.raises(Exception): + KServeGrpc.Options(infer_allocation_pool_size="big pool") + with pytest.raises(Exception): + KServeGrpc.Options(server_key=10) + + +HTTP_ARGS = (KServeHttp, httpclient, "localhost:8000") # Default HTTP args +GRPC_ARGS = (KServeGrpc, grpcclient, "localhost:8001") # Default GRPC args + + +class TestKServe: + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) + def test_server_ready(self, frontend, client_type, url): + server = setup_server() + service = setup_service(server, frontend) + client = setup_client(client_type, url=url) + + assert client.is_server_ready() + + teardown_client(client) + teardown_service(service) + teardown_server(server) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_service_double_start(self, frontend): + server = setup_server() + # setup_service() performs service.start() + service = setup_service(server, frontend) + + with pytest.raises( + tritonserver.AlreadyExistsError, match="server is already running." + ): + service.start() + + teardown_server(server) + teardown_service(service) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_invalid_options(self, frontend): + server = setup_server() + # Current flow is KServeHttp.Options or KServeGrpc.Options have to be + # provided to ensure type and range validation occurs. + with pytest.raises( + tritonserver.InvalidArgumentError, + match="Incorrect type for options. 
options argument must be of type", + ): + frontend.Server(server, {"port": 8001}) + + teardown_server(server) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_server_service_order(self, frontend): + server = setup_server() + service = setup_service(server, frontend) + + teardown_server(server) + teardown_service(service) + + @pytest.mark.parametrize("frontend, client_type", [HTTP_ARGS[:2], GRPC_ARGS[:2]]) + def test_service_custom_port(self, frontend, client_type): + server = setup_server() + options = frontend.Options(port=8005) + service = setup_service(server, frontend, options) + client = setup_client(client_type, url="localhost:8005") + + # Confirms that service starts at port 8005 + client.is_server_ready() + + teardown_client(client) + teardown_service(service) + teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) + def test_inference(self, frontend, client_type, url): + server = setup_server() + service = setup_service(server, frontend) + + # TODO: use common/test_infer + assert send_and_test_inference_identity(client_type, url=url) + + teardown_service(service) + teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) + def test_http_req_during_shutdown(self, frontend, client_type, url): + server = setup_server() + http_service = setup_service(server, frontend) + http_client = httpclient.InferenceServerClient(url="localhost:8000") + model_name = "delayed_identity" + delay = 2 # seconds + input_data0 = np.array([[delay]], dtype=np.float32) + + input0 = httpclient.InferInput("INPUT0", input_data0.shape, "FP32") + input0.set_data_from_numpy(input_data0) + + inputs = [input0] + outputs = [httpclient.InferRequestedOutput("OUTPUT0")] + + async_request = http_client.async_infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + # http_service.stop() does not use graceful shutdown + teardown_service(http_service) + + # So, inference request will fail as http endpoints have been stopped. + with pytest.raises( + InferenceServerException, match="failed to obtain inference response" + ): + async_request.get_result(block=True, timeout=delay) + + # http_client.close() calls join() to terminate pool of greenlets + # However, due to an unsuccessful get_result(), async_request is still + # an active thread. Hence, join stalls until greenlet timeouts. + # Does not throw an exception, but displays error in logs. + teardown_client(http_client) + + # delayed_identity will still be an active model + # Hence, server.stop() causes InternalError: Timeout. + with pytest.raises( + tritonserver.InternalError, + match="Exit timeout expired. 
Exiting immediately.", + ): + teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) + def test_grpc_req_during_shutdown(self, frontend, client_type, url): + server = setup_server() + grpc_service = setup_service(server, frontend) + grpc_client = grpcclient.InferenceServerClient(url=url) + user_data = [] + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + model_name = "delayed_identity" + delay = 2 # seconds + + input_data0 = np.array([[delay]], dtype=np.float32) + input0 = client_type.InferInput("INPUT0", input_data0.shape, "FP32") + input0.set_data_from_numpy(input_data0) + + inputs = [input0] + outputs = [client_type.InferRequestedOutput("OUTPUT0")] + + grpc_client.async_infer( + model_name=model_name, + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + teardown_service(grpc_service) + + time_out = delay + 1 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + + assert ( + len(user_data) == 1 + and isinstance(user_data[0], InferenceServerException) + and "[StatusCode.UNAVAILABLE] failed to connect to all addresses" + in str(user_data[0]) + ) + + teardown_client(grpc_client) + teardown_server(server) + + # KNOWN ISSUE: CAUSES SEGFAULT + # Created [DLIS-7231] to address at future date + # Once the server has been stopped, the underlying TRITONSERVER_Server instance + # is deleted. However, the frontend does not know the server instance + # is no longer valid. + # def test_inference_after_server_stop(self): + # server = setup_server() + # http_service = setup_service(server, KServeHttp) + # http_client = setup_client(httpclient, url="localhost:8000") + + # teardown_server(server) # Server has been stopped + + # model_name = "identity" + # input_data = np.array([["testing"]], dtype=object) + # # Create input and output objects + # inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + # outputs = [httpclient.InferRequestedOutput("OUTPUT0")] + + # # Set the data for the input tensor + # inputs[0].set_data_from_numpy(input_data) + + # results = http_client.infer(model_name, inputs=inputs, outputs=outputs) + + # teardown_client(http_client) + # teardown_service(http_service) diff --git a/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py b/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py new file mode 100644 index 0000000000..b6095cec8f --- /dev/null +++ b/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py @@ -0,0 +1,51 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + Mock Model that uses the input data to determine how long to wait + before returning identity data + """ + assert len(requests) == 1 + delay = 0 + request = requests[0] + responses = [] + + delay_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + delay_as_numpy = delay_tensor.as_numpy() + delay = float(delay_as_numpy[0][0]) + + out_tensor = pb_utils.Tensor("OUTPUT0", delay_as_numpy) + responses.append(pb_utils.InferenceResponse([out_tensor])) + + time.sleep(delay) + return responses diff --git a/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt b/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt new file mode 100644 index 0000000000..9ac8f1aaff --- /dev/null +++ b/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "delayed_identity" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] \ No newline at end of file diff --git a/qa/L0_python_api/test_model_repository/identity/1/model.py b/qa/L0_python_api/test_model_repository/identity/1/model.py new file mode 100644 index 0000000000..629b6469c9 --- /dev/null +++ b/qa/L0_python_api/test_model_repository/identity/1/model.py @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model loops through different dtypes to make sure that + serialize_byte_tensor works correctly in the Python backend. + """ + + def initialize(self, args): + self._index = 0 + self._dtypes = [np.bytes_, np.object_] + + def execute(self, requests): + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", in_0.as_numpy().astype(self._dtypes[self._index]) + ) + self._index += 1 + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + return responses diff --git a/qa/L0_python_api/test_model_repository/identity/config.pbtxt b/qa/L0_python_api/test_model_repository/identity/config.pbtxt new file mode 100644 index 0000000000..3f22e14468 --- /dev/null +++ b/qa/L0_python_api/test_model_repository/identity/config.pbtxt @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity" +backend: "python" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] \ No newline at end of file diff --git a/qa/L0_python_api/testing_utils.py b/qa/L0_python_api/testing_utils.py new file mode 100644 index 0000000000..8c63fea89b --- /dev/null +++ b/qa/L0_python_api/testing_utils.py @@ -0,0 +1,95 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
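
For orientation: the helper functions defined in testing_utils.py below are composed by the pytest suites added earlier in this patch. A minimal sketch of that flow, assuming the default KServeHttp port 8000, a hypothetical test name, and the module imported as testing_utils, could look like this:

    import tritonclient.http as httpclient
    from tritonfrontend import KServeHttp

    import testing_utils as utils


    def test_http_identity_round_trip():
        # Hypothetical illustration of the intended setup/teardown ordering.
        server = utils.setup_server()
        http_service = utils.setup_service(server, KServeHttp)
        try:
            assert utils.send_and_test_inference_identity(
                httpclient, url="localhost:8000"
            )
        finally:
            utils.teardown_service(http_service)
            utils.teardown_server(server)

The actual tests additionally parametrize the frontend, client type, and URL; this sketch only shows the happy path.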
+ +import os +from typing import Union + +import numpy as np +import tritonserver +from tritonfrontend import KServeGrpc, KServeHttp + + +def setup_server(model_repository="test_model_repository") -> tritonserver.Server: + module_directory = os.path.split(os.path.abspath(__file__))[0] + model_path = os.path.abspath(os.path.join(module_directory, model_repository)) + + # Starting Server Instance + server_options = tritonserver.Options( + server_id="TestServer", + model_repository=model_path, + log_error=True, + log_warn=True, + log_info=True, + ) + + return tritonserver.Server(server_options).start(wait_until_ready=True) + + +def teardown_server(server: tritonserver.Server) -> None: + server.stop() + + +def setup_service( + server: tritonserver.Server, + frontend: Union[KServeHttp, KServeGrpc], + options=None, +) -> Union[KServeHttp, KServeGrpc]: + service = frontend.Server(server=server, options=options) + service.start() + return service + + +def teardown_service(service: Union[KServeHttp, KServeGrpc]) -> None: + service.stop() + + +def setup_client(frontend_client, url: str): + return frontend_client.InferenceServerClient(url=url) + + +def teardown_client(client) -> None: + client.close() + + +# Sends an inference to test_model_repository/identity model and verifies input == output. +def send_and_test_inference_identity(frontend_client, url: str) -> bool: + model_name = "identity" + client = setup_client(frontend_client, url) + input_data = np.array(["testing"], dtype=object) + + # Create input and output objects + inputs = [frontend_client.InferInput("INPUT0", input_data.shape, "BYTES")] + outputs = [frontend_client.InferRequestedOutput("OUTPUT0")] + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + + # Perform inference request + results = client.infer(model_name=model_name, inputs=inputs, outputs=outputs) + + output_data = results.as_numpy("OUTPUT0") # Gather output data + + teardown_client(client) + return input_data[0] == output_data[0].decode() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cf43765dba..2e0380470a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -782,7 +782,11 @@ if (NOT WIN32) endif() # TRITON_ENABLE_GPU endif() # NOT WIN32 +# tritonfrontend python package +add_subdirectory(python) + # Currently unit tests do not build for windows... if ( NOT WIN32) add_subdirectory(test test) endif() # NOT WIN32 + diff --git a/src/common.h b/src/common.h index aa160f394f..011546d637 100644 --- a/src/common.h +++ b/src/common.h @@ -27,7 +27,11 @@ #include #include +#include #include +#include +#include +#include #include #include "triton/core/tritonserver.h" @@ -184,4 +188,60 @@ Join(const T& container, const std::string& delim) return ss.str(); } + +// Used by Python Bindings to accept arguments to initialize Frontends. 
+// Known pybind11 issue: bool has to come before int for std::variant
+using VariantType = std::variant<bool, int, std::string>;
+using UnorderedMapType = std::unordered_map<std::string, VariantType>;
+
+
+template <typename T>
+TRITONSERVER_Error*
+GetValue(const UnorderedMapType& options, const std::string& key, T* arg)
+{
+  auto curr = options.find(key);
+  bool is_present = (curr != options.end());
+  std::string msg;
+
+  if (!is_present) {
+    msg = "Key: " + key + " not found in options provided.";
+    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, msg.c_str());
+  }
+
+  bool correct_type = std::holds_alternative<T>(curr->second);
+  if (!correct_type) {
+    std::string expected;
+    std::string found;
+    VariantType value = *arg;
+    if (std::holds_alternative<int>(value)) {
+      expected = "int";
+    } else if (std::holds_alternative<bool>(value)) {
+      expected = "bool";
+    } else if (std::holds_alternative<std::string>(value)) {
+      expected = "string";
+    }
+
+    switch (curr->second.index()) {
+      case 0:
+        found = "bool";
+        break;
+      case 1:
+        found = "int";
+        break;
+      case 2:
+        found = "string";
+        break;
+    }
+
+    msg = "Key: " + key + " found, but incorrect type. Expected " + expected +
+          " Found: " + found;
+
+    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, msg.c_str());
+  }
+
+  *arg = std::get<T>(curr->second);
+  return nullptr;
+}
+
+
 }}  // namespace triton::server
diff --git a/src/grpc/grpc_server.cc b/src/grpc/grpc_server.cc
index c0a92ebd33..74ec443ae6 100644
--- a/src/grpc/grpc_server.cc
+++ b/src/grpc/grpc_server.cc
@@ -2435,6 +2435,101 @@ Server::Create(
   return nullptr;  // success
 }
 
+TRITONSERVER_Error*
+Server::Create(
+    std::shared_ptr<TRITONSERVER_Server>& server, UnorderedMapType& options,
+    triton::server::TraceManager* trace_manager,
+    const std::shared_ptr<SharedMemoryManager>& shm_manager,
+    const RestrictedFeatures& restricted_features,
+    std::unique_ptr<Server>* service)
+{
+  Options grpc_options;
+
+  RETURN_IF_ERR(GetOptions(grpc_options, options));
+
+  return Create(server, trace_manager, shm_manager, grpc_options, service);
+}
+
+TRITONSERVER_Error*
+Server::GetOptions(Options& options, UnorderedMapType& options_map)
+{
+  SocketOptions socket_selection;
+  SslOptions ssl_selection;
+  KeepAliveOptions keep_alive_selection;
+
+  RETURN_IF_ERR(GetSocketOptions(options.socket_, options_map));
+  RETURN_IF_ERR(GetSslOptions(options.ssl_, options_map));
+  RETURN_IF_ERR(GetKeepAliveOptions(options.keep_alive_, options_map));
+
+  int infer_compression_level_key;
+
+  RETURN_IF_ERR(GetValue(
+      options_map, "infer_compression_level", &infer_compression_level_key));
+
+  options.infer_compression_level_ =
+      static_cast<grpc_compression_level>(infer_compression_level_key);
+
+  RETURN_IF_ERR(GetValue(
+      options_map, "infer_allocation_pool_size",
+      &options.infer_allocation_pool_size_));
+  RETURN_IF_ERR(GetValue(
+      options_map, "forward_header_pattern", &options.forward_header_pattern_));
+
+  return nullptr;
+}
+
+TRITONSERVER_Error*
+Server::GetSocketOptions(SocketOptions& options, UnorderedMapType& options_map)
+{
+  RETURN_IF_ERR(GetValue(options_map, "address", &options.address_));
+  RETURN_IF_ERR(GetValue(options_map, "port", &options.port_));
+  RETURN_IF_ERR(GetValue(options_map, "reuse_port", &options.reuse_port_));
+
+  return nullptr;
+}
+
+TRITONSERVER_Error*
+Server::GetSslOptions(SslOptions& options, UnorderedMapType& options_map)
+{
+  RETURN_IF_ERR(GetValue(options_map, "use_ssl", &options.use_ssl_));
+  RETURN_IF_ERR(GetValue(options_map, "server_cert", &options.server_cert_));
+  RETURN_IF_ERR(GetValue(options_map, "server_key", &options.server_key_));
+  RETURN_IF_ERR(GetValue(options_map, "root_cert", &options.root_cert_));
+  
RETURN_IF_ERR( + GetValue(options_map, "use_mutual_auth", &options.use_mutual_auth_)); + + return nullptr; +} + +TRITONSERVER_Error* +Server::GetKeepAliveOptions( + KeepAliveOptions& options, UnorderedMapType& options_map) +{ + RETURN_IF_ERR( + GetValue(options_map, "keepalive_time_ms", &options.keepalive_time_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "keepalive_timeout_ms", &options.keepalive_timeout_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "keepalive_permit_without_calls", + &options.keepalive_permit_without_calls_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_max_pings_without_data", + &options.http2_max_pings_without_data_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_min_recv_ping_interval_without_data_ms", + &options.http2_min_recv_ping_interval_without_data_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_max_ping_strikes", &options.http2_max_ping_strikes_)); + RETURN_IF_ERR(GetValue( + options_map, "max_connection_age_ms", &options.max_connection_age_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "max_connection_age_grace_ms", + &options.max_connection_age_grace_ms_)); + + return nullptr; +} + + TRITONSERVER_Error* Server::Start() { diff --git a/src/grpc/grpc_server.h b/src/grpc/grpc_server.h index 8a38cdd4fe..89d8dc7388 100644 --- a/src/grpc/grpc_server.h +++ b/src/grpc/grpc_server.h @@ -29,6 +29,7 @@ #include +#include "../common.h" #include "../restricted_features.h" #include "../shared_memory_manager.h" #include "../tracer.h" @@ -100,6 +101,13 @@ class Server { const std::shared_ptr& shm_manager, const Options& server_options, std::unique_ptr* server); + static TRITONSERVER_Error* Create( + std::shared_ptr& server, UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service); + ~Server(); TRITONSERVER_Error* Start(); @@ -112,6 +120,16 @@ class Server { const std::shared_ptr& shm_manager, const Options& server_options); + static TRITONSERVER_Error* GetSocketOptions( + SocketOptions& options, UnorderedMapType& options_map); + static TRITONSERVER_Error* GetSslOptions( + SslOptions& options, UnorderedMapType& options_map); + static TRITONSERVER_Error* GetKeepAliveOptions( + KeepAliveOptions& options, UnorderedMapType& options_map); + + static TRITONSERVER_Error* GetOptions( + Options& options, UnorderedMapType& options_map); + std::shared_ptr tritonserver_; TraceManager* trace_manager_; std::shared_ptr shm_manager_; diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index 35659f4900..916230381b 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -948,12 +948,14 @@ ModelInferHandler::Execute(InferHandler::State* state) if (err == nullptr) { TRITONSERVER_InferenceTrace* triton_trace = nullptr; #ifdef TRITON_ENABLE_TRACING - GrpcServerCarrier carrier(state->context_->ctx_.get()); - auto start_options = - trace_manager_->GetTraceStartOptions(carrier, request.model_name()); - state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); - if (state->trace_ != nullptr) { - triton_trace = state->trace_->trace_; + if (trace_manager_) { + GrpcServerCarrier carrier(state->context_->ctx_.get()); + auto start_options = + trace_manager_->GetTraceStartOptions(carrier, request.model_name()); + state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); + if (state->trace_ != nullptr) { + triton_trace = state->trace_->trace_; + } } #endif // TRITON_ENABLE_TRACING @@ -982,8 +984,10 @@ 
ModelInferHandler::Execute(InferHandler::State* state) inference::ModelInferResponse error_response; #ifdef TRITON_ENABLE_TRACING - state->trace_timestamps_.emplace_back( - std::make_pair("GRPC_SEND_START", TraceManager::CaptureTimestamp())); + if (trace_manager_) { + state->trace_timestamps_.emplace_back( + std::make_pair("GRPC_SEND_START", TraceManager::CaptureTimestamp())); + } #endif // TRITON_ENABLE_TRACING state->step_ = COMPLETE; diff --git a/src/http_server.cc b/src/http_server.cc index 68b22ae649..cfd1da88ae 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -1181,6 +1181,7 @@ HTTPAPIServer::HTTPAPIServer( HTTPAPIServer::~HTTPAPIServer() { + LOG_VERBOSE(1) << "~HTTPAPIServer()"; if (server_metadata_err_ != nullptr) { TRITONSERVER_ErrorDelete(server_metadata_err_); } @@ -3586,10 +3587,12 @@ HTTPAPIServer::HandleInfer( RETURN_AND_RESPOND_IF_ERR( req, CheckTransactionPolicy(req, model_name, requested_model_version)); - // If tracing is enabled see if this request should be traced. TRITONSERVER_InferenceTrace* triton_trace = nullptr; - std::shared_ptr trace = - StartTrace(req, model_name, &triton_trace); + std::shared_ptr trace; + if (trace_manager_) { + // If tracing is enabled see if this request should be traced. + trace = StartTrace(req, model_name, &triton_trace); + } // Decompress request body if it is compressed in supported type evbuffer* decompressed_buffer = nullptr; @@ -4696,6 +4699,35 @@ HTTPAPIServer::Create( return nullptr; } + +TRITONSERVER_Error* +HTTPAPIServer::Create( + std::shared_ptr& server, + const UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service) +{ + int port; + bool reuse_port; + std::string address; + std::string header_forward_pattern; + int thread_count; + + RETURN_IF_ERR(GetValue(options, "port", &port)); + RETURN_IF_ERR(GetValue(options, "reuse_port", &reuse_port)); + RETURN_IF_ERR(GetValue(options, "address", &address)); + RETURN_IF_ERR( + GetValue(options, "header_forward_pattern", &header_forward_pattern)); + RETURN_IF_ERR(GetValue(options, "thread_count", &thread_count)); + + return Create( + server, trace_manager, shm_manager, port, reuse_port, address, + header_forward_pattern, thread_count, restricted_features, service); +} + + bool HTTPAPIServer::RespondIfRestricted( evhtp_request_t* req, const Restriction& restriction) diff --git a/src/http_server.h b/src/http_server.h index 077324cba3..3ad3d60cc4 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -196,6 +196,14 @@ class HTTPAPIServer : public HTTPServer { const RestrictedFeatures& restricted_apis, std::unique_ptr* http_server); + static TRITONSERVER_Error* Create( + std::shared_ptr& server, + const UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service); + virtual ~HTTPAPIServer(); // diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt new file mode 100644 index 0000000000..f447f7eab2 --- /dev/null +++ b/src/python/CMakeLists.txt @@ -0,0 +1,78 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required(VERSION 3.18) + +message("tritonfrontend python package build skipped when relevant frontends are disabled.") +message("In order to build tritonfrontend, the following flags are needed: -DTRITON_ENABLE_HTTP=ON -DTRITON_ENABLE_GRPC=ON") + +# [DLIS-7232] tritonfrontend package expects all supported packages to be +# built, without any check/verification for respective frontend enable flags. +# Support for partial builds(ex: HTTP but not gRPC) will be addressed later. +if(NOT (${TRITON_ENABLE_HTTP} AND ${TRITON_ENABLE_GRPC})) + return() +endif() + +add_subdirectory(tritonfrontend) + +file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION ${TRITON_VERSION}) +configure_file(../../LICENSE LICENSE.txt COPYONLY) +configure_file(setup.py setup.py @ONLY) + +set(WHEEL_DEPENDS + ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION + ${CMAKE_CURRENT_BINARY_DIR}/LICENSE.txt + ${CMAKE_CURRENT_BINARY_DIR}/setup.py + ${CMAKE_CURRENT_BINARY_DIR}/tritonfrontend + py-bindings +) + +set(wheel_stamp_file "stamp.whl") + +add_custom_command( + OUTPUT "${wheel_stamp_file}" + COMMAND python3 + ARGS + "${CMAKE_CURRENT_SOURCE_DIR}/build_wheel.py" + --dest-dir "${CMAKE_CURRENT_BINARY_DIR}/generic" + --binding-path $ + DEPENDS ${WHEEL_DEPENDS} +) + +add_custom_target( + frontend-server-wheel ALL + DEPENDS + "${wheel_stamp_file}" +) + + +# Wheel +set(WHEEL_OUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/generic/wheel/dist/") +install( + DIRECTORY + ${WHEEL_OUT_DIR} + DESTINATION "${CMAKE_INSTALL_PREFIX}/python" +) \ No newline at end of file diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py new file mode 100755 index 0000000000..875dd32a70 --- /dev/null +++ b/src/python/build_wheel.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import os +import pathlib +import re +import shutil +import subprocess +import sys +from distutils.dir_util import copy_tree +from tempfile import mkstemp + + +def fail_if(p, msg): + if p: + print("error: {}".format(msg), file=sys.stderr) + sys.exit(1) + + +def mkdir(path): + pathlib.Path(path).mkdir(parents=True, exist_ok=True) + + +def touch(path): + pathlib.Path(path).touch() + + +def cpdir(src, dest): + copy_tree(src, dest, preserve_symlinks=1) + + +def sed(pattern, replace, source, dest=None): + name = None + if dest: + name = dest + if dest is None: + fd, name = mkstemp() + + with open(source, "r") as fin, open(name, "w") as fout: + for line in fin: + out = re.sub(pattern, replace, line) + fout.write(out) + + if not dest: + shutil.copyfile(name, source) + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--dest-dir", type=str, required=True, help="Destination directory." + ) + parser.add_argument( + "--binding-path", + type=str, + required=True, + help="Path to Triton Frontend Python binding.", + ) + + FLAGS = parser.parse_args() + + FLAGS.triton_version = None + with open("TRITON_VERSION", "r") as vfile: + FLAGS.triton_version = vfile.readline().strip() + + FLAGS.whl_dir = os.path.join(FLAGS.dest_dir, "wheel") + + print("=== Building in: {}".format(os.getcwd())) + print("=== Using builddir: {}".format(FLAGS.whl_dir)) + print("Adding package files") + mkdir(os.path.join(FLAGS.whl_dir, "tritonfrontend")) + shutil.copy( + "tritonfrontend/__init__.py", os.path.join(FLAGS.whl_dir, "tritonfrontend") + ) + # Type checking marker file indicating support for type checkers. 
+ # https://peps.python.org/pep-0561/ + shutil.copy( + "tritonfrontend/py.typed", os.path.join(FLAGS.whl_dir, "tritonfrontend") + ) + cpdir("tritonfrontend/_c", os.path.join(FLAGS.whl_dir, "tritonfrontend", "_c")) + cpdir("tritonfrontend/_api", os.path.join(FLAGS.whl_dir, "tritonfrontend", "_api")) + PYBIND_LIB = os.path.basename(FLAGS.binding_path) + shutil.copyfile( + FLAGS.binding_path, + os.path.join(FLAGS.whl_dir, "tritonfrontend", "_c", PYBIND_LIB), + ) + + shutil.copyfile("LICENSE.txt", os.path.join(FLAGS.whl_dir, "LICENSE.txt")) + shutil.copyfile("setup.py", os.path.join(FLAGS.whl_dir, "setup.py")) + + os.chdir(FLAGS.whl_dir) + print("=== Building wheel") + args = ["python3", "setup.py", "bdist_wheel"] + + wenv = os.environ.copy() + wenv["VERSION"] = FLAGS.triton_version + wenv["TRITON_PYBIND"] = PYBIND_LIB + p = subprocess.Popen(args, env=wenv) + p.wait() + fail_if(p.returncode != 0, "setup.py failed") + + cpdir("dist", FLAGS.dest_dir) + + print(f"=== Output wheel file is in: {FLAGS.dest_dir}") + touch(os.path.join(FLAGS.dest_dir, "stamp.whl")) + + +if __name__ == "__main__": + main() diff --git a/src/python/examples/example.py b/src/python/examples/example.py new file mode 100644 index 0000000000..a1fba6e9d1 --- /dev/null +++ b/src/python/examples/example.py @@ -0,0 +1,85 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
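
The example script that follows demonstrates the KServeHttp frontend against the bundled example model repository. A gRPC counterpart is not part of this patch, but under the same assumptions (the same "identity" example model repository, tritonclient.grpc installed, the script run from the examples directory, and an arbitrarily chosen port 8101) a sketch of the equivalent flow could be:

    import numpy as np
    import tritonclient.grpc as grpcclient
    import tritonserver
    from tritonfrontend import KServeGrpc

    server = tritonserver.Server(
        tritonserver.Options(model_repository="./example_model_repository")
    ).start(wait_until_ready=True)

    # KServeGrpc.Server is a context manager: __enter__ starts the frontend,
    # __exit__ stops it.
    with KServeGrpc.Server(server, KServeGrpc.Options(port=8101)):
        client = grpcclient.InferenceServerClient(url="localhost:8101")
        input_data = np.array([["Roger Roger"]], dtype=object)
        inputs = [grpcclient.InferInput("INPUT0", input_data.shape, "BYTES")]
        inputs[0].set_data_from_numpy(input_data)
        result = client.infer("identity", inputs=inputs)
        print(result.as_numpy("OUTPUT0"))

    server.stop()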
+ +import pathlib + +import numpy as np +import tritonclient.http as httpclient +import tritonserver +from tritonfrontend import KServeHttp + + +def main(): + # Constructing path to Model Repository + model_path = f"{pathlib.Path(__file__).parent.resolve()}/example_model_repository" + # Selecting Server Options + server_options = tritonserver.Options( + server_id="ExampleServer", + model_repository=model_path, + log_error=True, + log_info=True, + log_warn=True, + ) + + # Creating server instance + server = tritonserver.Server(server_options).start(wait_until_ready=True) + + # Selecting Options for KServeHttp Frontend + http_options = KServeHttp.Options(port=8005) + + # or http_service = KServeHttp.Server(server, http_options) & http_service.stop() + with KServeHttp.Server(server, http_options) as http_service: + # The identity model returns an exact duplicate of the input data as output + model_name = "identity" + url = "localhost:8005" + + # Create a Triton client + client = httpclient.InferenceServerClient(url=url) + + # Prepare input data + input_data = np.array([["Roger Roger"]], dtype=object) + + # Create input and output objects + inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + + results = client.infer(model_name, inputs=inputs) + + # Get the output data + output_data = results.as_numpy("OUTPUT0") + + print("--------------------- INFERENCE RESULTS ---------------------") + print("Input data:", input_data) + print("Output data:", output_data) + print("-------------------------------------------------------------") + + server.stop() + + +if __name__ == "__main__": + main() diff --git a/src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb b/src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb new file mode 100755 index 0000000000000000000000000000000000000000..63f78fecb4d2e7a8fe857b668055ae7d7372c891 GIT binary patch literal 531 zcmb7>J5R$f5XW)Js~17hi2+%NArcaJNYesKR;Dh{A!UuC#-$OF6D1B+)x9I^d^Wxd zkSDMd;pKd~*YAI45PVVKm0$rj7?Y3-8Ob#QsF6quX%UZ_pe2Q@9jgz-Lq)Dr4!2-V z<6zF0NgVQw4!SfoEaTPz)So)tItDdg1}VqdP{r+?5KGN@_RN_!vPU=e)mSgYlSozq z-wA9$o7_!jo`XNx|_?>a|1-JuSzzez8l a-$4IWMXBT>PqIavtcDH9FwJ-v&E+Rf?!_4Z literal 0 HcmV?d00001 diff --git a/src/python/examples/example_model_repository/identity/config.pbtxt b/src/python/examples/example_model_repository/identity/config.pbtxt new file mode 100644 index 0000000000..ae83e47556 --- /dev/null +++ b/src/python/examples/example_model_repository/identity/config.pbtxt @@ -0,0 +1,44 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity" +platform: "tensorflow_savedmodel" +max_batch_size: 8 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ -1 ] + } +] diff --git a/src/python/setup.py b/src/python/setup.py new file mode 100755 index 0000000000..ee1e7c0ec4 --- /dev/null +++ b/src/python/setup.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import sys + +from setuptools import find_packages, setup + +if "--plat-name" in sys.argv: + PLATFORM_FLAG = sys.argv[sys.argv.index("--plat-name") + 1] +else: + PLATFORM_FLAG = "any" + +if "VERSION" not in os.environ: + raise Exception("envvar VERSION must be specified") + +VERSION = os.environ["VERSION"] + +try: + from wheel.bdist_wheel import bdist_wheel as _bdist_wheel + + class bdist_wheel(_bdist_wheel): + def finalize_options(self): + _bdist_wheel.finalize_options(self) + self.root_is_pure = False + + def get_tag(self): + pyver, abi, plat = "py3", "none", PLATFORM_FLAG + return pyver, abi, plat + +except ImportError: + bdist_wheel = None + +this_directory = os.path.abspath(os.path.dirname(__file__)) + +data_files = [ + ("", ["LICENSE.txt"]), +] + +# Type checking marker file indicating support for type checkers. +# https://peps.python.org/pep-0561/ +# Type hints for c extension generated by mypy +platform_package_data = [ + os.environ["TRITON_PYBIND"], + "py.typed", + "_c/__init__.pyi", + "_c/triton_bindings.pyi", +] + +gpu_extras = ["cupy-cuda12x"] +test_extras = ["pytest"] +all_extras = gpu_extras + test_extras + +setup( + name="tritonfrontend", + version=VERSION, + author="NVIDIA Inc.", + author_email="sw-dl-triton@nvidia.com", + description="Triton Inference Server In-Process Python API", + license="BSD", + url="https://developer.nvidia.com/nvidia-triton-inference-server", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: Information Technology", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Image Recognition", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Topic :: Utilities", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Environment :: Console", + "Natural Language :: English", + "Operating System :: OS Independent", + ], + packages=find_packages(), + package_data={ + "": platform_package_data, + }, + zip_safe=False, + cmdclass={"bdist_wheel": bdist_wheel}, + data_files=data_files, + install_requires=["tritonserver", "pydantic"], + extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras}, +) diff --git a/src/python/tritonfrontend/CMakeLists.txt b/src/python/tritonfrontend/CMakeLists.txt new file mode 100644 index 0000000000..e22be30602 --- /dev/null +++ b/src/python/tritonfrontend/CMakeLists.txt @@ -0,0 +1,181 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required(VERSION 3.18) + +# ================= Ensures Package is Structured Properly ================== +# Top level module entry point and typed marker +file(COPY __init__.py DESTINATION .) +file(COPY py.typed DESTINATION .) +# Copy the '__init__.py' for the '_c' module +file(COPY _c/__init__.py DESTINATION ./_c/.) +file(COPY _c/__init__.pyi DESTINATION ./_c/.) +file(COPY _c/tritonfrontend_bindings.pyi DESTINATION ./_c/.) +# Find and copy _api modules +file(GLOB PYTHON_MODULE_FILES ./_api/*.py) +file(COPY ${PYTHON_MODULE_FILES} DESTINATION ./_api/.) +# ================================= END ===================================== + + +# =================== Downloading and Installing pybind11 =================== +include(FetchContent) + +FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11.git + GIT_TAG v2.13.1 + GIT_SHALLOW ON +) + +FetchContent_MakeAvailable(pybind11) +# ================================= END ===================================== + +# ================== Collect the Dependencies =============================== +set( + PYTHON_FRONTEND_BINDING_DEPS + ../../shared_memory_manager.h + ../../shared_memory_manager.cc + ../../data_compressor.h + ../../common.h + ../../common.cc + ../../restricted_features.h + ../../tracer.h + $<$:../../tracer.cc> + ../../classification.cc +) + +set(PY_BINDING_DEPENDENCY_LIBS + triton-common-json + triton-common-logging + triton-core-serverapi + triton-core-serverstub + ) + +# Conditional Linking Based on Flags +if(${TRITON_ENABLE_HTTP}) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + http-endpoint-library + ) +endif() + +if(${TRITON_ENABLE_GRPC}) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + grpc-endpoint-library + ) +endif() + +if(${TRITON_ENABLE_GPU}) + find_package(CUDAToolkit REQUIRED) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + CUDA::cudart + ) +endif() + +if(${TRITON_ENABLE_TRACING}) + message("TRACING/STATS IS CURRENTLY NOT SUPPORTED.") + find_package(absl CONFIG REQUIRED) + find_package(CURL CONFIG REQUIRED) + find_package(nlohmann_json CONFIG REQUIRED) + find_package(opentelemetry-cpp CONFIG REQUIRED) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + tracing-library + ) +endif() + +# ===================== End of Collection =================================== + + +# ================== Create Python Frontend Bindings ======================== +set( + PYTHON_FRONTEND_BINDING_SRCS + _c/tritonfrontend.h + _c/tritonfrontend_pybind.cc +) + +pybind11_add_module( + py-bindings + MODULE + ${PYTHON_FRONTEND_BINDING_DEPS} + ${PYTHON_FRONTEND_BINDING_SRCS} +) + +target_include_directories(py-bindings PRIVATE ${CMAKE_SOURCE_DIR}/src) + +target_link_libraries( + py-bindings + PRIVATE + ${PY_BINDING_DEPENDENCY_LIBS} +) + +if(${TRITON_ENABLE_HTTP}) + 
target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_HTTP=1 + ) +endif() + +if(${TRITON_ENABLE_GRPC}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_GRPC=1 + ) +endif() + +if(${TRITON_ENABLE_GPU}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_GPU=1 + PRIVATE TRITON_MIN_COMPUTE_CAPABILITY=${TRITON_MIN_COMPUTE_CAPABILITY} + ) +endif() + +if(${TRITON_ENABLE_TRACING}) + target_include_directories( + py-bindings + PRIVATE ${OPENTELEMETRY_CPP_INCLUDE_DIRS} + ) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_TRACING=1 + ) +endif() + +if(${TRITON_ENABLE_STATS}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_STATS=1 + ) +endif() + + +set_property(TARGET py-bindings PROPERTY OUTPUT_NAME tritonfrontend_bindings) + +set_target_properties( + py-bindings + PROPERTIES + BUILD_RPATH "$ORIGIN:/opt/tritonserver/lib" +) +# ===================== End of Python Bindings ============================== diff --git a/src/python/tritonfrontend/__init__.py b/src/python/tritonfrontend/__init__.py new file mode 100644 index 0000000000..48eaf64e8b --- /dev/null +++ b/src/python/tritonfrontend/__init__.py @@ -0,0 +1,33 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# triton/server/src/python/tritonfrontend/__init__.py + +import builtins +from importlib.metadata import PackageNotFoundError, version + +from tritonfrontend._api._kservegrpc import KServeGrpc +from tritonfrontend._api._kservehttp import KServeHttp diff --git a/src/python/tritonfrontend/__init__.pyi b/src/python/tritonfrontend/__init__.pyi new file mode 100644 index 0000000000..17847e4038 --- /dev/null +++ b/src/python/tritonfrontend/__init__.pyi @@ -0,0 +1 @@ +# Need to automate stubgen process as a part of build: https://github.com/triton-inference-server/server/pull/7501#discussion_r1720135228 diff --git a/src/python/tritonfrontend/_api/__init__.py b/src/python/tritonfrontend/_api/__init__.py new file mode 100644 index 0000000000..dc1c939c66 --- /dev/null +++ b/src/python/tritonfrontend/_api/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/python/tritonfrontend/_api/_error_mapping.py b/src/python/tritonfrontend/_api/_error_mapping.py new file mode 100644 index 0000000000..39a1e9aeb1 --- /dev/null +++ b/src/python/tritonfrontend/_api/_error_mapping.py @@ -0,0 +1,48 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import tritonserver +from tritonfrontend._c.tritonfrontend_bindings import ( + AlreadyExistsError, + InternalError, + InvalidArgumentError, + NotFoundError, + TritonError, + UnavailableError, + UnknownError, + UnsupportedError, +) + +ERROR_MAPPING = { + TritonError: tritonserver.TritonError, + NotFoundError: tritonserver.NotFoundError, + UnknownError: tritonserver.UnknownError, + InternalError: tritonserver.InternalError, + InvalidArgumentError: tritonserver.InvalidArgumentError, + UnavailableError: tritonserver.UnavailableError, + AlreadyExistsError: tritonserver.AlreadyExistsError, + UnsupportedError: tritonserver.UnsupportedError, +} diff --git a/src/python/tritonfrontend/_api/_kservegrpc.py b/src/python/tritonfrontend/_api/_kservegrpc.py new file mode 100644 index 0000000000..5471613340 --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservegrpc.py @@ -0,0 +1,137 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
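
A note on how the options defined below reach the C++ frontend: KServeGrpc.Options is a pydantic dataclass that is flattened into a plain dict of bool/int/str values via __dict__ and handed to the pybind11 bindings, which read each key back with the GetValue helper added to src/common.h. A rough, illustrative snippet (not part of the patch; port 9001 is arbitrary):

    from tritonfrontend import KServeGrpc

    opts = KServeGrpc.Options(port=9001)
    options_dict = opts.__dict__
    # e.g. {"address": "0.0.0.0", "port": 9001, "reuse_port": False, ...}

    # Every value is a bool, int, or str, matching the C++ side's
    # std::variant ordering (bool, int, string); a missing key or a
    # mismatched type is reported as TRITONSERVER_ERROR_INVALID_ARG.
    assert all(isinstance(v, (bool, int, str)) for v in options_dict.values())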
+ +import sys +from enum import IntEnum +from typing import Union + +import tritonserver +from pydantic import Field +from pydantic.dataclasses import dataclass +from tritonfrontend._api._error_mapping import ERROR_MAPPING +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError, + TritonError, + TritonFrontendGrpc, +) + + +# Enum (mirroring C++ format) +class Grpc_compression_level(IntEnum): + NONE = 0 + LOW = 1 + MED = 2 + HIGH = 3 + COUNT = 4 + + +class KServeGrpc: + Grpc_compression_level = ( + Grpc_compression_level # Include the enum as a class attribute + ) + + # triton::server::grpc::Options + @dataclass + class Options: + # triton::server::grpc::SocketOptions + address: str = "0.0.0.0" + port: int = Field(8001, ge=0, le=65535) + reuse_port: bool = False + # triton::server::grpc::SslOptions + use_ssl: bool = False + server_cert: str = "" + server_key: str = "" + root_cert: str = "" + use_mutual_auth: bool = False + # triton::server::grpc::KeepAliveOptions + keepalive_time_ms: int = Field(7_200_000, ge=0) + keepalive_timeout_ms: int = Field(20_000, ge=0) + keepalive_permit_without_calls: bool = False + http2_max_pings_without_data: int = Field(2, ge=0) + http2_min_recv_ping_interval_without_data_ms: int = Field(300_000, ge=0) + http2_max_ping_strikes: int = Field(2, ge=0) + max_connection_age_ms: int = Field(0, ge=0) + max_connection_age_grace_ms: int = Field(0, ge=0) + + # triton::server::grpc::Options + + infer_compression_level: Union[ + int, Grpc_compression_level + ] = Grpc_compression_level.NONE + infer_allocation_pool_size: int = Field(8, ge=0) + forward_header_pattern: str = "" + # DLIS-7215: Add restricted protocol support + # restricted_protocols: str = "" + + def __post_init__(self): + if isinstance(self.infer_compression_level, Grpc_compression_level): + self.infer_compression_level = self.infer_compression_level.value + + class Server: + def __init__(self, server: tritonserver, options: "KServeGrpc.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer + + # If no options provided, default options are selected + if options is None: + options = KServeGrpc.Options() + + if not isinstance(options, KServeGrpc.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeGrpc.Options" + ) + + # Converts dataclass instance -> python dictionary -> unordered_map> + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + + self.triton_frontend = TritonFrontendGrpc(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... 
from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None + + def __enter__(self): + self.triton_frontend.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None + + def start(self): + try: + self.triton_frontend.start() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None + + def stop(self): + try: + self.triton_frontend.stop() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None diff --git a/src/python/tritonfrontend/_api/_kservegrpc.pyi b/src/python/tritonfrontend/_api/_kservegrpc.pyi new file mode 100644 index 0000000000..c81d3d6afc --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservegrpc.pyi @@ -0,0 +1,74 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from enum import IntEnum + +import tritonserver +from _typeshed import Incomplete +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError as InvalidArgumentError, +) +from tritonfrontend._c.tritonfrontend_bindings import ( + TritonFrontendGrpc as TritonFrontendGrpc, +) + +class Grpc_compression_level(IntEnum): + NONE = 0 + LOW = 1 + MED = 2 + HIGH = 3 + COUNT = 4 + +class KServeGrpc: + Grpc_compression_level = Grpc_compression_level + class Options: + address: str + port: int + reuse_port: bool + use_ssl: bool + server_cert: str + server_key: str + root_cert: str + use_mutual_auth: bool + keepalive_time_ms: int + keepalive_timeout_ms: int + keepalive_permit_without_calls: bool + http2_max_pings_without_data: int + http2_min_recv_ping_interval_without_data_ms: int + http2_max_ping_strikes: int + max_connection_age_ms: int + max_connection_age_grace_ms: int + infer_compression_level: int | Grpc_compression_level + infer_allocation_pool_size: int + forward_header_pattern: str + def __post_init__(self) -> None: ... 
+ class Server: + triton_frontend: Incomplete + def __init__(self, server: tritonserver, options: KServeGrpc.Options = None) -> None: ... + def __enter__(self): ... + def __exit__(self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: types.TracebackType | None) -> None: ... + def start(self): ... + def stop(self): ... diff --git a/src/python/tritonfrontend/_api/_kservehttp.py b/src/python/tritonfrontend/_api/_kservehttp.py new file mode 100644 index 0000000000..6a2524986a --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservehttp.py @@ -0,0 +1,97 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import sys +from typing import Union + +import tritonserver +from pydantic import Field +from pydantic.dataclasses import dataclass +from tritonfrontend._api._error_mapping import ERROR_MAPPING +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError, + TritonError, + TritonFrontendHttp, +) + + +class KServeHttp: + @dataclass + class Options: + address: str = "0.0.0.0" + port: int = Field(8000, ge=0, le=65535) + reuse_port: bool = False + thread_count: int = Field(8, ge=0) + header_forward_pattern: str = "" + # DLIS-7215: Add restricted protocol support + # restricted_protocols: list + + class Server: + def __init__(self, server: tritonserver, options: "KServeHttp.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer + + # If no options provided, default options are selected + if options is None: + options = KServeHttp.Options() + + if not isinstance(options, KServeHttp.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeHttp.Options" + ) + + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + # Converts dataclass instance -> python dictionary -> unordered_map> + + self.triton_frontend = TritonFrontendHttp(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... 
from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None + + def __enter__(self): + self.triton_frontend.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None + + def start(self): + try: + self.triton_frontend.start() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None + + def stop(self): + try: + self.triton_frontend.stop() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None diff --git a/src/python/tritonfrontend/_api/_kservehttp.pyi b/src/python/tritonfrontend/_api/_kservehttp.pyi new file mode 100644 index 0000000000..60f3997f39 --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservehttp.pyi @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import tritonserver +from _typeshed import Incomplete +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError as InvalidArgumentError, +) +from tritonfrontend._c.tritonfrontend_bindings import ( + TritonFrontendHttp as TritonFrontendHttp, +) + +class KServeHttp: + class Options: + address: str + port: int + reuse_port: bool + thread_count: int + header_forward_pattern: str + class Server: + triton_frontend: Incomplete + def __init__(self, server: tritonserver, options: KServeHttp.Options = None) -> None: ... + def __enter__(self): ... + def __exit__(self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: types.TracebackType | None) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... diff --git a/src/python/tritonfrontend/_c/__init__.py b/src/python/tritonfrontend/_c/__init__.py new file mode 100644 index 0000000000..3e892ede64 --- /dev/null +++ b/src/python/tritonfrontend/_c/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from .tritonfrontend_bindings import * diff --git a/src/python/tritonfrontend/_c/__init__.pyi b/src/python/tritonfrontend/_c/__init__.pyi new file mode 100644 index 0000000000..99eaf9dace --- /dev/null +++ b/src/python/tritonfrontend/_c/__init__.pyi @@ -0,0 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from tritonfrontend._c.tritonfrontend_bindings import * diff --git a/src/python/tritonfrontend/_c/tritonfrontend.h b/src/python/tritonfrontend/_c/tritonfrontend.h new file mode 100644 index 0000000000..172147f566 --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend.h @@ -0,0 +1,139 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include // For shared_ptr +#include +#include + +#include "../../../common.h" +#include "../../../restricted_features.h" +#include "../../../shared_memory_manager.h" +#include "../../../tracer.h" +#include "triton/common/logging.h" +#include "triton/core/tritonserver.h" + + +struct TRITONSERVER_Server {}; + +namespace triton { namespace server { namespace python { + +// base exception for all Triton error code +struct TritonError : public std::runtime_error { + explicit TritonError(const std::string& what) : std::runtime_error(what) {} +}; + +// triton::core::python exceptions map 1:1 to TRITONSERVER_Error_Code. 
+struct UnknownError : public TritonError { + explicit UnknownError(const std::string& what) : TritonError(what) {} +}; +struct InternalError : public TritonError { + explicit InternalError(const std::string& what) : TritonError(what) {} +}; +struct NotFoundError : public TritonError { + explicit NotFoundError(const std::string& what) : TritonError(what) {} +}; +struct InvalidArgumentError : public TritonError { + explicit InvalidArgumentError(const std::string& what) : TritonError(what) {} +}; +struct UnavailableError : public TritonError { + explicit UnavailableError(const std::string& what) : TritonError(what) {} +}; +struct UnsupportedError : public TritonError { + explicit UnsupportedError(const std::string& what) : TritonError(what) {} +}; +struct AlreadyExistsError : public TritonError { + explicit AlreadyExistsError(const std::string& what) : TritonError(what) {} +}; + +void +ThrowIfError(TRITONSERVER_Error* err) +{ + if (err == nullptr) { + return; + } + std::shared_ptr managed_err( + err, TRITONSERVER_ErrorDelete); + std::string msg = TRITONSERVER_ErrorMessage(err); + switch (TRITONSERVER_ErrorCode(err)) { + case TRITONSERVER_ERROR_INTERNAL: + throw InternalError(std::move(msg)); + case TRITONSERVER_ERROR_NOT_FOUND: + throw NotFoundError(std::move(msg)); + case TRITONSERVER_ERROR_INVALID_ARG: + throw InvalidArgumentError(std::move(msg)); + case TRITONSERVER_ERROR_UNAVAILABLE: + throw UnavailableError(std::move(msg)); + case TRITONSERVER_ERROR_UNSUPPORTED: + throw UnsupportedError(std::move(msg)); + case TRITONSERVER_ERROR_ALREADY_EXISTS: + throw AlreadyExistsError(std::move(msg)); + default: + throw UnknownError(std::move(msg)); + } +} + + +template +class TritonFrontend { + private: + std::shared_ptr server_; + std::unique_ptr service; + triton::server::RestrictedFeatures restricted_features; + // TODO: [DLIS-7194] Add support for TraceManager & SharedMemoryManager + // triton::server::TraceManager trace_manager_; + // triton::server::SharedMemoryManager shm_manager_; + + public: + TritonFrontend(uintptr_t server_mem_addr, UnorderedMapType data) + { + TRITONSERVER_Server* server_ptr = + reinterpret_cast(server_mem_addr); + + server_.reset(server_ptr, EmptyDeleter); + + ThrowIfError(FrontendServer::Create( + server_, data, nullptr /* TraceManager */, + nullptr /* SharedMemoryManager */, restricted_features, &service)); + }; + + // TODO: [DLIS-7194] Add support for TraceManager & SharedMemoryManager + // TritonFrontend( + // uintptr_t server_mem_addr, UnorderedMapType data, + // TraceManager trace_manager, SharedMemoryManager shm_manager) + + void StartService() { ThrowIfError(service->Start()); }; + void StopService() { ThrowIfError(service->Stop()); }; + + // The frontend does not own the TRITONSERVER_Server* object. + // Hence, deleting the underlying server instance, + // will cause a double-free when the core bindings attempt to + // delete the TRITONSERVER_Server instance. + static void EmptyDeleter(TRITONSERVER_Server* obj){}; +}; + +}}} // namespace triton::server::python diff --git a/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi b/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi new file mode 100644 index 0000000000..535693a5cb --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi @@ -0,0 +1,44 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from tritonfrontend import AlreadyExistsError as AlreadyExistsError +from tritonfrontend import InternalError as InternalError +from tritonfrontend import InvalidArgumentError as InvalidArgumentError +from tritonfrontend import NotFoundError as NotFoundError +from tritonfrontend import TritonError as TritonError +from tritonfrontend import UnavailableError as UnavailableError +from tritonfrontend import UnknownError as UnknownError +from tritonfrontend import UnsupportedError as UnsupportedError + +class TritonFrontendGrpc: + def __init__(self, arg0: int, arg1: dict[str, bool | int | str]) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... + +class TritonFrontendHttp: + def __init__(self, arg0: int, arg1: dict[str, bool | int | str]) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... diff --git a/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc b/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc new file mode 100644 index 0000000000..86a0ac1c41 --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include + +#include "../../../grpc/grpc_server.h" +#include "../../../http_server.h" +#include "triton/core/tritonserver.h" +#include "tritonfrontend.h" + + +namespace py = pybind11; + +namespace triton { namespace server { namespace python { + + +PYBIND11_MODULE(tritonfrontend_bindings, m) +{ + m.doc() = "Python bindings for Triton Inference Server Frontend Endpoints"; + + auto tfe = py::register_exception(m, "TritonError"); + py::register_exception(m, "UnknownError", tfe.ptr()); + py::register_exception(m, "InternalError", tfe.ptr()); + py::register_exception(m, "NotFoundError", tfe.ptr()); + py::register_exception( + m, "InvalidArgumentError", tfe.ptr()); + py::register_exception(m, "UnavailableError", tfe.ptr()); + py::register_exception(m, "UnsupportedError", tfe.ptr()); + py::register_exception( + m, "AlreadyExistsError", tfe.ptr()); + + + py::class_>(m, "TritonFrontendHttp") + .def(py::init()) + .def("start", &TritonFrontend::StartService) + .def("stop", &TritonFrontend::StopService); + + py::class_>( + m, "TritonFrontendGrpc") + .def(py::init()) + .def( + "start", &TritonFrontend< + triton::server::grpc::Server, + triton::server::grpc::Server>::StartService) + .def( + "stop", &TritonFrontend< + triton::server::grpc::Server, + triton::server::grpc::Server>::StopService); +} + +}}} // namespace triton::server::python diff --git a/src/python/tritonfrontend/py.typed b/src/python/tritonfrontend/py.typed new file mode 100644 index 0000000000..e69de29bb2 From a6fff975a214ff00221790dd0a5521fb05ce3ac9 Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:53:56 -0500 Subject: [PATCH 23/44] fix: Adding copyright info (#7591) --- docs/customization_guide/tritonfrontend.md | 27 ++++++++++++++++++++++ qa/L0_python_api/test_kserve.py | 26 +++++++++++++++++++++ src/python/tritonfrontend/__init__.pyi | 26 +++++++++++++++++++++ 3 files changed, 79 insertions(+) diff --git a/docs/customization_guide/tritonfrontend.md b/docs/customization_guide/tritonfrontend.md index caaac9308d..0ec4b32749 100644 --- a/docs/customization_guide/tritonfrontend.md +++ b/docs/customization_guide/tritonfrontend.md @@ -1,3 +1,30 @@ + ### Triton Server (tritonfrontend) Bindings The `tritonfrontend` python package is a set of bindings to Triton's existing frontends implemented in C++. Currently, `tritonfrontend` supports starting up `KServeHttp` and `KServeGrpc` frontends. 
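For illustration, a minimal sketch of how these frontends can be driven from Python is shown below. This example is not part of the patch: the `tritonserver.Server` constructor arguments, the chained `start(wait_until_ready=True)` call, and `/path/to/models` are assumptions based on the in-process API, and the option values simply mirror the fields defined in `_kservehttp.py` / `_kservegrpc.py` earlier in this series.

```python
import tritonserver
from tritonfrontend import KServeGrpc, KServeHttp

# Start the in-process Triton core. The keyword arguments here are an
# assumption based on the tritonserver in-process API; "/path/to/models"
# is a placeholder for a real model repository.
server = tritonserver.Server(model_repository="/path/to/models").start(
    wait_until_ready=True
)

# Options mirror the dataclasses shown in _kservehttp.py / _kservegrpc.py.
http_options = KServeHttp.Options(port=8000, thread_count=8)
grpc_options = KServeGrpc.Options(port=8001)

# The Server classes are context managers: start() on __enter__, stop() on __exit__.
with KServeHttp.Server(server, http_options), KServeGrpc.Server(server, grpc_options):
    ...  # issue requests with tritonclient over HTTP or GRPC

server.stop()
```

Because `KServeHttp.Server` and `KServeGrpc.Server` implement `__enter__`/`__exit__`, the `with` block starts both endpoints and stops them on exit.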
These bindings used in-combination with Triton's Python In-Process API ([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library) extend the ability to use Triton's full feature set with a couple of lines of Python. diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py index ab77783d0c..703d86ca43 100644 --- a/qa/L0_python_api/test_kserve.py +++ b/qa/L0_python_api/test_kserve.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import time from functools import partial diff --git a/src/python/tritonfrontend/__init__.pyi b/src/python/tritonfrontend/__init__.pyi index 17847e4038..0afb0cb886 100644 --- a/src/python/tritonfrontend/__init__.pyi +++ b/src/python/tritonfrontend/__init__.pyi @@ -1 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + # Need to automate stubgen process as a part of build: https://github.com/triton-inference-server/server/pull/7501#discussion_r1720135228 From ca8ae28bbf1e1de0d7b1d1cb71f930fccdac5a84 Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:54:43 -0700 Subject: [PATCH 24/44] test: Refactor core input size checks (#7592) --- qa/L0_input_validation/input_validation_test.py | 4 ++-- qa/L0_input_validation/test.sh | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/qa/L0_input_validation/input_validation_test.py b/qa/L0_input_validation/input_validation_test.py index 33360b7a08..8e7f58bb0c 100755 --- a/qa/L0_input_validation/input_validation_test.py +++ b/qa/L0_input_validation/input_validation_test.py @@ -195,7 +195,7 @@ def get_input_array(input_size, np_dtype): triton_client.infer(model_name=model_name, inputs=inputs) err_str = str(e.exception) self.assertIn( - f"expected {input_size} string elements for inference input 'INPUT1', got {input_size-2}", + f"expected {input_size} string elements for inference input 'INPUT1' for model '{model_name}', got {input_size-2}", err_str, ) @@ -208,7 +208,7 @@ def get_input_array(input_size, np_dtype): triton_client.infer(model_name=model_name, inputs=inputs) err_str = str(e.exception) self.assertIn( - f"expected {input_size} string elements for inference input 'INPUT1', got {input_size+2}", + f"unexpected number of string elements {input_size+1} for inference input 'INPUT1' for model '{model_name}', expecting {input_size}", err_str, ) diff --git a/qa/L0_input_validation/test.sh b/qa/L0_input_validation/test.sh index fc70abd969..22e0560959 100755 --- a/qa/L0_input_validation/test.sh +++ b/qa/L0_input_validation/test.sh @@ -68,7 +68,9 @@ set +e python3 -m pytest --junitxml="input_validation.report.xml" $TEST_PY::InputValTest >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** input_validation_test.py FAILED. \n***" + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** input_validation_test.py::InputValTest FAILED. \n***" RET=1 fi set -e @@ -138,7 +140,9 @@ set +e python3 -m pytest --junitxml="input_shape_validation.report.xml" $TEST_PY::InputShapeTest >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** input_validation_test.py FAILED. \n***" + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** input_validation_test.py::InputShapeTest FAILED. \n***" RET=1 fi set -e @@ -147,10 +151,13 @@ kill $SERVER_PID wait $SERVER_PID # input_byte_size_test +cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/{savedmodel_zero_1_float32,savedmodel_zero_1_object} ./models + set +e -LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >>$TEST_LOG 2>&1 +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >> $TEST_LOG 2>&1 if [ $? 
-ne 0 ]; then - echo -e "\n***\n*** Query Unit Test Failed\n***" + cat $TEST_LOG + echo -e "\n***\n*** input_byte_size_test FAILED\n***" RET=1 fi set -e @@ -158,8 +165,6 @@ set -e if [ $RET -eq 0 ]; then echo -e "\n***\n*** Input Validation Test Passed\n***" else - cat $CLIENT_LOG - cat $SERVER_LOG echo -e "\n***\n*** Input Validation Test FAILED\n***" fi From be557b6ffc8d180b86ddbd0e1ddad615dd913df2 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Fri, 6 Sep 2024 17:13:23 -0700 Subject: [PATCH 25/44] Don't Build `tritonfrontend` for Windows. (#7599) Don't Build `tritonfrontend` for Windows. --- src/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2e0380470a..9488fc6233 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -782,8 +782,11 @@ if (NOT WIN32) endif() # TRITON_ENABLE_GPU endif() # NOT WIN32 -# tritonfrontend python package -add_subdirectory(python) +# DLIS-7292: Extend tritonfrontend to build for Windows +if (NOT WIN32) + # tritonfrontend python package + add_subdirectory(python) +endif (NOT WIN32) # Currently unit tests do not build for windows... if ( NOT WIN32) From edd0ac1b02f415a658758410903900fc5017e4f8 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Wed, 11 Sep 2024 09:08:29 +0530 Subject: [PATCH 26/44] fix: Add reference count tracking for shared memory regions (#7567) Co-authored-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com> --- .../cuda_shared_memory_test.py | 312 ++++++++++++++---- qa/L0_cuda_shared_memory/test.sh | 41 +++ qa/L0_shared_memory/shared_memory_test.py | 290 ++++++++++++---- qa/L0_shared_memory/test.sh | 40 +++ qa/L0_trt_shape_tensors/test.sh | 2 +- .../execute_delayed_model/config.pbtxt | 55 +++ .../execute_delayed_model/model.py | 72 ++++ src/grpc/infer_handler.cc | 54 +-- src/grpc/infer_handler.h | 32 +- src/grpc/stream_infer_handler.cc | 36 +- src/http_server.cc | 13 +- src/http_server.h | 15 + src/shared_memory_manager.cc | 47 ++- src/shared_memory_manager.h | 99 +++--- 14 files changed, 886 insertions(+), 222 deletions(-) create mode 100644 qa/python_models/execute_delayed_model/config.pbtxt create mode 100644 qa/python_models/execute_delayed_model/model.py diff --git a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py index 07f9c05a88..51137e8934 100755 --- a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py +++ b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py @@ -31,18 +31,20 @@ sys.path.append("../common") import os +import time import unittest +from functools import partial import infer_util as iu import numpy as np import test_util as tu import tritonclient.grpc as grpcclient import tritonclient.http as httpclient -import tritonshmutils.cuda_shared_memory as cshm +import tritonclient.utils.cuda_shared_memory as cshm from tritonclient.utils import * -class CudaSharedMemoryTest(tu.TestResultCollector): +class CudaSharedMemoryTestBase(tu.TestResultCollector): DEFAULT_SHM_BYTE_SIZE = 64 def setUp(self): @@ -61,76 +63,6 @@ def _setup_client(self): self.url, verbose=True ) - def test_invalid_create_shm(self): - # Raises error since tried to create invalid cuda shared memory region - try: - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - except Exception as ex: - self.assertEqual(str(ex), "unable to create cuda shared memory handle") - - def test_valid_create_set_register(self): - # Create a valid 
cuda shared memory region, fill data in it and register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - cshm.set_shared_memory_region( - shm_op0_handle, [np.array([1, 2], dtype=np.float32)] - ) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 1) - else: - self.assertEqual(len(shm_status.regions), 1) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_unregister_before_register(self): - # Create a valid cuda shared memory region and unregister before register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.unregister_cuda_shared_memory("dummy_data") - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 0) - else: - self.assertEqual(len(shm_status.regions), 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_unregister_after_register(self): - # Create a valid cuda shared memory region and unregister after register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - self.triton_client.unregister_cuda_shared_memory("dummy_data") - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 0) - else: - self.assertEqual(len(shm_status.regions), 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_reregister_after_register(self): - # Create a valid cuda shared memory region and unregister after register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - try: - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - except Exception as ex: - self.assertIn( - "shared memory region 'dummy_data' already in manager", str(ex) - ) - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 1) - else: - self.assertEqual(len(shm_status.regions), 1) - cshm.destroy_shared_memory_region(shm_op0_handle) - def _configure_server( self, create_byte_size=DEFAULT_SHM_BYTE_SIZE, @@ -205,6 +137,78 @@ def _cleanup_server(self, shm_handles): for shm_handle in shm_handles: cshm.destroy_shared_memory_region(shm_handle) + +class CudaSharedMemoryTest(CudaSharedMemoryTestBase): + def test_invalid_create_shm(self): + # Raises error since tried to create invalid cuda shared memory region + try: + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + except Exception as ex: + self.assertEqual(str(ex), "unable to create cuda shared memory handle") + + def test_valid_create_set_register(self): + # Create a valid cuda shared memory region, fill data in it and register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + cshm.set_shared_memory_region( + shm_op0_handle, [np.array([1, 2], dtype=np.float32)] + ) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + shm_status = self.triton_client.get_cuda_shared_memory_status() + if 
self.protocol == "http": + self.assertEqual(len(shm_status), 1) + else: + self.assertEqual(len(shm_status.regions), 1) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_before_register(self): + # Create a valid cuda shared memory region and unregister before register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.unregister_cuda_shared_memory("dummy_data") + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 0) + else: + self.assertEqual(len(shm_status.regions), 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_after_register(self): + # Create a valid cuda shared memory region and unregister after register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + self.triton_client.unregister_cuda_shared_memory("dummy_data") + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 0) + else: + self.assertEqual(len(shm_status.regions), 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_reregister_after_register(self): + # Create a valid cuda shared memory region and unregister after register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + try: + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + except Exception as ex: + self.assertIn( + "shared memory region 'dummy_data' already in manager", str(ex) + ) + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 1) + else: + self.assertEqual(len(shm_status.regions), 1) + cshm.destroy_shared_memory_region(shm_op0_handle) + def test_unregister_after_inference(self): # Unregister after inference error_msg = [] @@ -396,5 +400,169 @@ def test_infer_byte_size_out_of_bound(self): self._cleanup_server(shm_handles) +class TestCudaSharedMemoryUnregister(CudaSharedMemoryTestBase): + def _test_unregister_shm_fail(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory() + self.assertIn( + "Failed to unregister the following cuda shared memory regions: input0_data ,input1_data ,output0_data ,output1_data", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("input0_data") + self.assertIn( + "Cannot unregister shared memory region 'input0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("input1_data") + self.assertIn( + "Cannot unregister shared memory region 'input1_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("output0_data") + self.assertIn( + "Cannot unregister shared memory region 'output0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + 
second_client.unregister_cuda_shared_memory("output1_data") + self.assertIn( + "Cannot unregister shared memory region 'output1_data', it is currently in use.", + str(ex.exception), + ) + + def _test_shm_not_found(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("input0_data") + self.assertIn( + "Unable to find cuda shared memory region: 'input0_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("input1_data") + self.assertIn( + "Unable to find cuda shared memory region: 'input1_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("output0_data") + self.assertIn( + "Unable to find cuda shared memory region: 'output0_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("output1_data") + self.assertIn( + "Unable to find cuda shared memory region: 'output1_data'", + str(ex.exception), + ) + + def test_unregister_shm_during_inference_http(self): + try: + self.triton_client.unregister_cuda_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + httpclient.InferInput("INPUT0", [1, 16], "INT32"), + httpclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0", binary_data=True), + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + async_request = self.triton_client.async_infer( + model_name="simple", inputs=inputs, outputs=outputs + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Blocking call + async_request.get_result() + + # Try unregister shm regions after inference + self.triton_client.unregister_cuda_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + def test_unregister_shm_during_inference_grpc(self): + try: + self.triton_client.unregister_cuda_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + grpcclient.InferRequestedOutput("OUTPUT0"), + grpcclient.InferRequestedOutput("OUTPUT1"), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + user_data = [] + + self.triton_client.async_infer( + model_name="simple", + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Wait until the results are available in 
user_data + time_out = 20 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + time.sleep(2) + + # Try unregister shm regions after inference + self.triton_client.unregister_cuda_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_cuda_shared_memory/test.sh b/qa/L0_cuda_shared_memory/test.sh index 02857b2153..b7126a9295 100755 --- a/qa/L0_cuda_shared_memory/test.sh +++ b/qa/L0_cuda_shared_memory/test.sh @@ -84,6 +84,47 @@ for i in \ done done +mkdir -p python_models/simple/1/ +cp ../python_models/execute_delayed_model/model.py ./python_models/simple/1/ +cp ../python_models/execute_delayed_model/config.pbtxt ./python_models/simple/ +sed -i 's/KIND_CPU/KIND_GPU/g' ./python_models/simple/config.pbtxt + +for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/python_models --log-verbose=1 ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./unregister_shm.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + CLIENT_LOG="./unregister_shm.$client_type.client.log" + set +e + python3 $SHM_TEST TestCudaSharedMemoryUnregister.test_unregister_shm_during_inference_$client_type >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + kill $SERVER_PID + wait $SERVER_PID + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Server shut down non-gracefully\n***" + RET=1 + fi + set -e + done + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else diff --git a/qa/L0_shared_memory/shared_memory_test.py b/qa/L0_shared_memory/shared_memory_test.py index c38ecb4814..871fca9b2a 100755 --- a/qa/L0_shared_memory/shared_memory_test.py +++ b/qa/L0_shared_memory/shared_memory_test.py @@ -31,7 +31,9 @@ sys.path.append("../common") import os +import time import unittest +from functools import partial import infer_util as iu import numpy as np @@ -43,7 +45,7 @@ from tritonclient import utils -class SharedMemoryTest(tu.TestResultCollector): +class SystemSharedMemoryTestBase(tu.TestResultCollector): DEFAULT_SHM_BYTE_SIZE = 64 def setUp(self): @@ -62,6 +64,68 @@ def _setup_client(self): self.url, verbose=True ) + def _configure_server( + self, + create_byte_size=DEFAULT_SHM_BYTE_SIZE, + register_byte_size=DEFAULT_SHM_BYTE_SIZE, + register_offset=0, + ): + """Creates and registers shared memory regions for testing. + + Parameters + ---------- + create_byte_size: int + Size of each system shared memory region to create. + NOTE: This should be sufficiently large to hold the inputs/outputs + stored in shared memory. + + register_byte_size: int + Size of each system shared memory region to register with server. + NOTE: The (offset + register_byte_size) should be less than or equal + to the create_byte_size. Otherwise an exception will be raised for + an invalid set of registration args. + + register_offset: int + Offset into the shared memory object to start the registered region. 
+ + """ + shm_ip0_handle = shm.create_shared_memory_region( + "input0_data", "/input0_data", create_byte_size + ) + shm_ip1_handle = shm.create_shared_memory_region( + "input1_data", "/input1_data", create_byte_size + ) + shm_op0_handle = shm.create_shared_memory_region( + "output0_data", "/output0_data", create_byte_size + ) + shm_op1_handle = shm.create_shared_memory_region( + "output1_data", "/output1_data", create_byte_size + ) + # Implicit assumption that input and output byte_sizes are 64 bytes for now + input0_data = np.arange(start=0, stop=16, dtype=np.int32) + input1_data = np.ones(shape=16, dtype=np.int32) + shm.set_shared_memory_region(shm_ip0_handle, [input0_data]) + shm.set_shared_memory_region(shm_ip1_handle, [input1_data]) + self.triton_client.register_system_shared_memory( + "input0_data", "/input0_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "input1_data", "/input1_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "output0_data", "/output0_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "output1_data", "/output1_data", register_byte_size, offset=register_offset + ) + return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] + + def _cleanup_server(self, shm_handles): + for shm_handle in shm_handles: + shm.destroy_shared_memory_region(shm_handle) + + +class SharedMemoryTest(SystemSharedMemoryTestBase): def test_invalid_create_shm(self): # Raises error since tried to create invalid system shared memory region try: @@ -128,66 +192,6 @@ def test_reregister_after_register(self): self.assertTrue(len(shm_status.regions) == 1) shm.destroy_shared_memory_region(shm_op0_handle) - def _configure_server( - self, - create_byte_size=DEFAULT_SHM_BYTE_SIZE, - register_byte_size=DEFAULT_SHM_BYTE_SIZE, - register_offset=0, - ): - """Creates and registers shared memory regions for testing. - - Parameters - ---------- - create_byte_size: int - Size of each system shared memory region to create. - NOTE: This should be sufficiently large to hold the inputs/outputs - stored in shared memory. - - register_byte_size: int - Size of each system shared memory region to register with server. - NOTE: The (offset + register_byte_size) should be less than or equal - to the create_byte_size. Otherwise an exception will be raised for - an invalid set of registration args. - - register_offset: int - Offset into the shared memory object to start the registered region. 
- - """ - shm_ip0_handle = shm.create_shared_memory_region( - "input0_data", "/input0_data", create_byte_size - ) - shm_ip1_handle = shm.create_shared_memory_region( - "input1_data", "/input1_data", create_byte_size - ) - shm_op0_handle = shm.create_shared_memory_region( - "output0_data", "/output0_data", create_byte_size - ) - shm_op1_handle = shm.create_shared_memory_region( - "output1_data", "/output1_data", create_byte_size - ) - # Implicit assumption that input and output byte_sizes are 64 bytes for now - input0_data = np.arange(start=0, stop=16, dtype=np.int32) - input1_data = np.ones(shape=16, dtype=np.int32) - shm.set_shared_memory_region(shm_ip0_handle, [input0_data]) - shm.set_shared_memory_region(shm_ip1_handle, [input1_data]) - self.triton_client.register_system_shared_memory( - "input0_data", "/input0_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "input1_data", "/input1_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "output0_data", "/output0_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "output1_data", "/output1_data", register_byte_size, offset=register_offset - ) - return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] - - def _cleanup_server(self, shm_handles): - for shm_handle in shm_handles: - shm.destroy_shared_memory_region(shm_handle) - def test_unregister_after_inference(self): # Unregister after inference error_msg = [] @@ -443,5 +447,169 @@ def test_python_client_leak(self): ) +class TestSharedMemoryUnregister(SystemSharedMemoryTestBase): + def _test_unregister_shm_fail(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory() + self.assertIn( + "Failed to unregister the following system shared memory regions: input0_data ,input1_data ,output0_data ,output1_data", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("input0_data") + self.assertIn( + "Cannot unregister shared memory region 'input0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("input1_data") + self.assertIn( + "Cannot unregister shared memory region 'input1_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("output0_data") + self.assertIn( + "Cannot unregister shared memory region 'output0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("output1_data") + self.assertIn( + "Cannot unregister shared memory region 'output1_data', it is currently in use.", + str(ex.exception), + ) + + def _test_shm_not_found(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("input0_data") + self.assertIn( + "Unable to find system shared memory region: 'input0_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + 
second_client.get_system_shared_memory_status("input1_data") + self.assertIn( + "Unable to find system shared memory region: 'input1_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("output0_data") + self.assertIn( + "Unable to find system shared memory region: 'output0_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("output1_data") + self.assertIn( + "Unable to find system shared memory region: 'output1_data'", + str(ex.exception), + ) + + def test_unregister_shm_during_inference_http(self): + try: + self.triton_client.unregister_system_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + httpclient.InferInput("INPUT0", [1, 16], "INT32"), + httpclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0", binary_data=True), + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + async_request = self.triton_client.async_infer( + model_name="simple", inputs=inputs, outputs=outputs + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Blocking call + async_request.get_result() + + # Try unregister shm regions after inference + self.triton_client.unregister_system_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + def test_unregister_shm_during_inference_grpc(self): + try: + self.triton_client.unregister_system_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + grpcclient.InferRequestedOutput("OUTPUT0"), + grpcclient.InferRequestedOutput("OUTPUT1"), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + user_data = [] + + self.triton_client.async_infer( + model_name="simple", + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Wait until the results are available in user_data + time_out = 20 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + time.sleep(2) + + # Try unregister shm regions after inference + self.triton_client.unregister_system_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_shared_memory/test.sh b/qa/L0_shared_memory/test.sh index ba6a2fa8f2..e711de9cff 100755 --- a/qa/L0_shared_memory/test.sh +++ b/qa/L0_shared_memory/test.sh @@ -95,6 +95,46 @@ for i in \ 
done done +mkdir -p python_models/simple/1/ +cp ../python_models/execute_delayed_model/model.py ./python_models/simple/1/ +cp ../python_models/execute_delayed_model/config.pbtxt ./python_models/simple/ + +for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/python_models --log-verbose=1 ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./unregister_shm.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + CLIENT_LOG="./unregister_shm.$client_type.client.log" + set +e + python3 $SHM_TEST TestSharedMemoryUnregister.test_unregister_shm_during_inference_$client_type >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + kill $SERVER_PID + wait $SERVER_PID + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Server shut down non-gracefully\n***" + RET=1 + fi + set -e + done + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else diff --git a/qa/L0_trt_shape_tensors/test.sh b/qa/L0_trt_shape_tensors/test.sh index f08ed339b0..548ebb55af 100755 --- a/qa/L0_trt_shape_tensors/test.sh +++ b/qa/L0_trt_shape_tensors/test.sh @@ -45,7 +45,7 @@ CLIENT_LOG="./client.log" SHAPE_TENSOR_TEST=trt_shape_tensor_test.py SERVER=/opt/tritonserver/bin/tritonserver -SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" SERVER_LOG="./inference_server.log" source ../common/util.sh diff --git a/qa/python_models/execute_delayed_model/config.pbtxt b/qa/python_models/execute_delayed_model/config.pbtxt new file mode 100644 index 0000000000..0a4ee59d3e --- /dev/null +++ b/qa/python_models/execute_delayed_model/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "simple" +backend: "python" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] + +instance_group [ { kind: KIND_CPU }] diff --git a/qa/python_models/execute_delayed_model/model.py b/qa/python_models/execute_delayed_model/model.py new file mode 100644 index 0000000000..055b321a93 --- /dev/null +++ b/qa/python_models/execute_delayed_model/model.py @@ -0,0 +1,72 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
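+# Test helper model: execute() below sleeps for 15 seconds before returning
+# OUTPUT0 = INPUT0 + INPUT1 and OUTPUT1 = INPUT0 - INPUT1, giving the
+# shared-memory unregister tests a window in which a request is still in
+# flight and the regions it uses should not yet be unregisterable.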
+ +import json +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + responses = [] + + time.sleep(15) + + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1] + ) + responses.append(inference_response) + + return responses + + def finalize(self): + print("Cleaning up...") diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index 916230381b..c4ba9338cb 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -158,18 +158,6 @@ InferResponseFree( return nullptr; // Success } -TRITONSERVER_Error* InferGRPCToInputHelper( - const std::string& input_name, const std::string& model_name, - const TRITONSERVER_DataType tensor_dt, const TRITONSERVER_DataType input_dt, - const size_t binary_data_byte_size); - -TRITONSERVER_Error* InferGRPCToInput( - const std::shared_ptr& tritonserver, - const std::shared_ptr& shm_manager, - const inference::ModelInferRequest& request, - std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request); - TRITONSERVER_Error* InferGRPCToInputHelper( const std::string& input_name, const std::string& model_name, @@ -391,7 +379,9 @@ InferGRPCToInput( const std::shared_ptr& shm_manager, const inference::ModelInferRequest& request, std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request) + TRITONSERVER_InferenceRequest* inference_request, + std::vector>* + shm_regions_info) { // Verify that the batch-byte-size of each input matches the size of // the provided tensor data (provided raw or from shared memory) @@ -432,9 +422,14 @@ InferGRPCToInput( .c_str()); } void* tmp; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager->GetMemoryInfo( - region_name, offset, byte_size, &tmp, &memory_type, &memory_type_id)); + region_name, offset, byte_size, &tmp, &memory_type, &memory_type_id, + &shm_info)); base = tmp; + shm_regions_info->emplace_back(shm_info); + if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU RETURN_IF_ERR(shm_manager->GetCUDAHandle( @@ -911,18 +906,32 @@ ModelInferHandler::Execute(InferHandler::State* state) // tensors are present in the request. std::list serialized_data; + // Maintain shared pointers(read-only reference) to the shared memory block's + // information for the shared memory regions used by the request. These + // pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. 
This vector must be cleared in the + // `InferResponseComplete` callback (after inference) to decrease the count + // and permit unregistration. The vector will be included in + // `response_release_payload` for the callback. + std::vector> + shm_regions_info; + if (err == nullptr) { err = InferGRPCToInput( - tritonserver_, shm_manager_, request, &serialized_data, irequest); + tritonserver_, shm_manager_, request, &serialized_data, irequest, + &shm_regions_info); } if (err == nullptr) { err = InferAllocatorPayload( tritonserver_, shm_manager_, request, std::move(serialized_data), - response_queue, &state->alloc_payload_); + response_queue, &state->alloc_payload_, &shm_regions_info); } auto request_release_payload = std::make_unique(state->inference_request_); + auto response_release_payload = std::make_unique( + state, std::move(shm_regions_info)); + if (err == nullptr) { err = TRITONSERVER_InferenceRequestSetReleaseCallback( irequest, InferRequestComplete, @@ -932,7 +941,8 @@ ModelInferHandler::Execute(InferHandler::State* state) err = TRITONSERVER_InferenceRequestSetResponseCallback( irequest, allocator_, &state->alloc_payload_ /* response_allocator_userp */, - InferResponseComplete, reinterpret_cast(state)); + InferResponseComplete, + response_release_payload.get() /* response_userp */); } // Get request ID for logging in case of error. const char* request_id = ""; @@ -970,8 +980,9 @@ ModelInferHandler::Execute(InferHandler::State* state) // to handle gRPC stream cancellation. if (err == nullptr) { state->context_->InsertInflightState(state); - // The payload will be cleaned in request release callback. + // The payload will be cleaned in release callback. request_release_payload.release(); + response_release_payload.release(); } else { // If error go immediately to COMPLETE. LOG_VERBOSE(1) << "[request id: " << request_id << "] " @@ -1000,7 +1011,9 @@ ModelInferHandler::InferResponseComplete( TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags, void* userp) { - State* state = reinterpret_cast(userp); + ResponseReleasePayload* response_release_payload( + static_cast(userp)); + auto state = response_release_payload->state_; // There are multiple handlers registered in the gRPC service // Hence, we would need to properly synchronize this thread @@ -1042,6 +1055,7 @@ ModelInferHandler::InferResponseComplete( // in the next cycle. 
state->context_->PutTaskBackToQueue(state); + delete response_release_payload; return; } @@ -1104,6 +1118,8 @@ ModelInferHandler::InferResponseComplete( if (response_created) { delete response; } + + delete response_release_payload; } }}} // namespace triton::server::grpc diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 51307d4ae0..87536dd173 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -299,7 +299,9 @@ InferAllocatorPayload( const inference::ModelInferRequest& request, std::list&& serialized_data, std::shared_ptr> response_queue, - AllocPayload* alloc_payload) + AllocPayload* alloc_payload, + std::vector>* + shm_regions_info) { alloc_payload->response_queue_ = response_queue; alloc_payload->shm_map_.clear(); @@ -335,9 +337,12 @@ InferAllocatorPayload( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager->GetMemoryInfo( - region_name, offset, byte_size, &base, &memory_type, - &memory_type_id)); + region_name, offset, byte_size, &base, &memory_type, &memory_type_id, + &shm_info)); + shm_regions_info->emplace_back(shm_info); if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU @@ -373,7 +378,9 @@ TRITONSERVER_Error* InferGRPCToInput( const std::shared_ptr& shm_manager, const inference::ModelInferRequest& request, std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request); + TRITONSERVER_InferenceRequest* inference_request, + std::vector>* + shm_regions_info); TRITONSERVER_Error* ResponseAllocatorHelper( TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name, @@ -1263,6 +1270,23 @@ class InferHandler : public HandlerBase { delete state; } + // Simple structure that carries the payload needed for + // response release callback. + struct ResponseReleasePayload final { + State* state_; + std::vector> + shm_regions_info_; + + ResponseReleasePayload( + State* state, + std::vector< + std::shared_ptr>&& + shm_regions_info) + : state_(state), shm_regions_info_(std::move(shm_regions_info)) + { + } + }; + virtual void StartNewRequest() = 0; virtual bool Process(State* state, bool rpc_ok) = 0; bool ExecutePrecondition(InferHandler::State* state); diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 6651eca813..1f554db83c 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -282,18 +282,32 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // tensors are present in the request. std::list serialized_data; + // Maintain shared pointers(read-only reference) to the shared memory + // block's information for the shared memory regions used by the request. + // These pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. This vector must be cleared in the + // `StreamInferResponseComplete` callback (after inference) to decrease the + // count and permit unregistration. The vector will be included in + // `response_release_payload` for the callback. 
+ std::vector> + shm_regions_info; + if (err == nullptr) { err = InferGRPCToInput( - tritonserver_, shm_manager_, request, &serialized_data, irequest); + tritonserver_, shm_manager_, request, &serialized_data, irequest, + &shm_regions_info); } if (err == nullptr) { err = InferAllocatorPayload( tritonserver_, shm_manager_, request, std::move(serialized_data), - response_queue_, &state->alloc_payload_); + response_queue_, &state->alloc_payload_, &shm_regions_info); } auto request_release_payload = std::make_unique(state->inference_request_); + auto response_release_payload = std::make_unique( + state, std::move(shm_regions_info)); + if (err == nullptr) { err = TRITONSERVER_InferenceRequestSetReleaseCallback( irequest, InferRequestComplete, @@ -303,7 +317,8 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) err = TRITONSERVER_InferenceRequestSetResponseCallback( irequest, allocator_, &state->alloc_payload_ /* response_allocator_userp */, - StreamInferResponseComplete, reinterpret_cast(state)); + StreamInferResponseComplete, + response_release_payload.get() /* response_userp */); } if (err == nullptr) { @@ -330,8 +345,9 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // irequest to handle gRPC stream cancellation. if (err == nullptr) { state->context_->InsertInflightState(state); - // The payload will be cleaned in request release callback. + // The payload will be cleaned in release callback. request_release_payload.release(); + response_release_payload.release(); } else { // If there was an error then enqueue the error response and show // it to be ready for writing. @@ -594,7 +610,10 @@ ModelStreamInferHandler::StreamInferResponseComplete( TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags, void* userp) { - State* state = reinterpret_cast(userp); + ResponseReleasePayload* response_release_payload( + static_cast(userp)); + auto state = response_release_payload->state_; + // Ignore Response from CORE in case GRPC Strict as we dont care about if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { std::lock_guard lock(state->context_->mu_); @@ -648,6 +667,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( if (is_complete) { state->step_ = Steps::CANCELLED; state->context_->PutTaskBackToQueue(state); + delete response_release_payload; } state->complete_ = is_complete; @@ -695,6 +715,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_TRITONSERVER_ERROR( TRITONSERVER_InferenceResponseDelete(iresponse), "deleting GRPC inference response"); + delete response_release_payload; return; } } @@ -774,6 +795,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( if (is_complete) { state->step_ = Steps::CANCELLED; state->context_->PutTaskBackToQueue(state); + delete response_release_payload; } state->complete_ = is_complete; @@ -818,6 +840,10 @@ ModelStreamInferHandler::StreamInferResponseComplete( } state->complete_ = is_complete; } + + if (is_complete) { + delete response_release_payload; + } } // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, diff --git a/src/http_server.cc b/src/http_server.cc index cfd1da88ae..2fa395fc98 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -2681,9 +2681,13 @@ HTTPAPIServer::ParseJsonTritonIO( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager_->GetMemoryInfo( shm_region, shm_offset, byte_size, &base, &memory_type, - &memory_type_id)); + 
&memory_type_id, &shm_info)); + infer_req->AddShmRegionInfo(shm_info); + if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU cudaIpcMemHandle_t* cuda_handle; @@ -2796,9 +2800,12 @@ HTTPAPIServer::ParseJsonTritonIO( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager_->GetMemoryInfo( - shm_region, offset, byte_size, &base, &memory_type, - &memory_type_id)); + shm_region, offset, byte_size, &base, &memory_type, &memory_type_id, + &shm_info)); + infer_req->AddShmRegionInfo(shm_info); if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU diff --git a/src/http_server.h b/src/http_server.h index 3ad3d60cc4..3949f97e27 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -311,6 +311,13 @@ class HTTPAPIServer : public HTTPServer { static void ReplyCallback(evthr_t* thr, void* arg, void* shared); + void AddShmRegionInfo( + const std::shared_ptr& + shm_info) + { + shm_regions_info_.push_back(shm_info); + } + protected: TRITONSERVER_Server* server_{nullptr}; evhtp_request_t* req_{nullptr}; @@ -330,6 +337,14 @@ class HTTPAPIServer : public HTTPServer { // TRITONSERVER_ServerInferAsync (except for cancellation). std::shared_ptr triton_request_{nullptr}; + // Maintain shared pointers(read-only reference) to the shared memory + // block's information for the shared memory regions used by the request. + // These pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. This vector must be cleared when no + // longer needed to decrease the count and permit unregistration. + std::vector> + shm_regions_info_; + evhtp_res response_code_{EVHTP_RES_OK}; }; diff --git a/src/shared_memory_manager.cc b/src/shared_memory_manager.cc index 1f4a77e887..7b845709a1 100644 --- a/src/shared_memory_manager.cc +++ b/src/shared_memory_manager.cc @@ -69,7 +69,8 @@ TRITONSERVER_Error* SharedMemoryManager::GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id) + int64_t* device_id, + std::shared_ptr* shm_info) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_UNSUPPORTED, @@ -408,9 +409,9 @@ SharedMemoryManager::RegisterSystemSharedMemory( } shared_memory_map_.insert(std::make_pair( - name, std::unique_ptr(new SharedMemoryInfo( + name, std::make_shared( name, shm_key, offset, byte_size, shm_fd, mapped_addr, - TRITONSERVER_MEMORY_CPU, 0)))); + TRITONSERVER_MEMORY_CPU, 0))); return nullptr; // success } @@ -444,9 +445,9 @@ SharedMemoryManager::RegisterCUDASharedMemory( name, reinterpret_cast(mapped_addr), byte_size)); shared_memory_map_.insert(std::make_pair( - name, std::unique_ptr(new CUDASharedMemoryInfo( + name, std::make_shared( name, "", 0, byte_size, 0, mapped_addr, TRITONSERVER_MEMORY_GPU, - device_id, cuda_shm_handle)))); + device_id, cuda_shm_handle))); return nullptr; // success } @@ -456,7 +457,8 @@ TRITONSERVER_Error* SharedMemoryManager::GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id) + int64_t* device_id, + std::shared_ptr* shm_info) { // protect shared_memory_map_ from concurrent access std::lock_guard lock(mu_); @@ -494,6 +496,10 @@ SharedMemoryManager::GetMemoryInfo( .c_str()); } + if (shm_info != nullptr) { + *shm_info = std::static_pointer_cast(it->second); + } + if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) { 
*shm_mapped_addr = (void*)((uint8_t*)it->second->mapped_addr_ + it->second->offset_ + offset); @@ -561,11 +567,19 @@ SharedMemoryManager::GetStatus( } else { auto it = shared_memory_map_.find(name); if (it == shared_memory_map_.end()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - std::string( - "Unable to find system shared memory region: '" + name + "'") - .c_str()); + if (memory_type == TRITONSERVER_MEMORY_GPU) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + std::string( + "Unable to find cuda shared memory region: '" + name + "'") + .c_str()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + std::string( + "Unable to find system shared memory region: '" + name + "'") + .c_str()); + } } if (it->second->kind_ != memory_type) { @@ -632,6 +646,7 @@ SharedMemoryManager::UnregisterAll(TRITONSERVER_MemoryType memory_type) TRITONSERVER_Error* err = UnregisterHelper(it->first, memory_type); if (err != nullptr) { unregister_fails.push_back(it->first); + LOG_VERBOSE(1) << TRITONSERVER_ErrorMessage(err); } } } @@ -645,6 +660,7 @@ SharedMemoryManager::UnregisterAll(TRITONSERVER_MemoryType memory_type) ; if (err != nullptr) { unregister_fails.push_back(it->first); + LOG_VERBOSE(1) << TRITONSERVER_ErrorMessage(err); } } } @@ -669,6 +685,15 @@ SharedMemoryManager::UnregisterHelper( // Must hold the lock on register_mu_ while calling this function. auto it = shared_memory_map_.find(name); if (it != shared_memory_map_.end() && it->second->kind_ == memory_type) { + if (it->second.use_count() > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "Cannot unregister shared memory region '" + name + + "', it is currently in use.") + .c_str()); + } + if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) { RETURN_IF_ERR( UnmapSharedMemory(it->second->mapped_addr_, it->second->byte_size_)); diff --git a/src/shared_memory_manager.h b/src/shared_memory_manager.h index 51eb0f0786..393fd29128 100644 --- a/src/shared_memory_manager.h +++ b/src/shared_memory_manager.h @@ -1,4 +1,4 @@ -// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -50,6 +50,48 @@ class SharedMemoryManager { SharedMemoryManager() = default; ~SharedMemoryManager(); + /// A struct that records the shared memory regions registered by the shared + /// memory manager. 
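+  /// Instances are owned through std::shared_ptr: request handlers may hold
+  /// additional read-only references while an inference that uses the region
+  /// is in flight, and UnregisterHelper() refuses to unregister a region whose
+  /// use count shows such outstanding references.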
+ struct SharedMemoryInfo { + SharedMemoryInfo( + const std::string& name, const std::string& shm_key, + const size_t offset, const size_t byte_size, int shm_fd, + void* mapped_addr, const TRITONSERVER_MemoryType kind, + const int64_t device_id) + : name_(name), shm_key_(shm_key), offset_(offset), + byte_size_(byte_size), shm_fd_(shm_fd), mapped_addr_(mapped_addr), + kind_(kind), device_id_(device_id) + { + } + + std::string name_; + std::string shm_key_; + size_t offset_; + size_t byte_size_; + int shm_fd_; + void* mapped_addr_; + TRITONSERVER_MemoryType kind_; + int64_t device_id_; + }; + +#ifdef TRITON_ENABLE_GPU + struct CUDASharedMemoryInfo : SharedMemoryInfo { + CUDASharedMemoryInfo( + const std::string& name, const std::string& shm_key, + const size_t offset, const size_t byte_size, int shm_fd, + void* mapped_addr, const TRITONSERVER_MemoryType kind, + const int64_t device_id, const cudaIpcMemHandle_t* cuda_ipc_handle) + : SharedMemoryInfo( + name, shm_key, offset, byte_size, shm_fd, mapped_addr, kind, + device_id), + cuda_ipc_handle_(*cuda_ipc_handle) + { + } + + cudaIpcMemHandle_t cuda_ipc_handle_; + }; +#endif + /// Add a shared memory block representing shared memory in system /// (CPU) memory to the manager. Return TRITONSERVER_ERROR_ALREADY_EXISTS /// if a shared memory block of the same name already exists in the manager. @@ -90,11 +132,18 @@ class SharedMemoryManager { /// \param memory_type Returns the type of the memory /// \param device_id Returns the device id associated with the /// memory block - /// \return a TRITONSERVER_Error indicating success or failure. + /// \param shm_info Returns a shared pointer reference(read-only) to the + /// shared memory block's information. + /// This pointer will automatically increase the usage count, preventing + /// unregistration while the reference is held. The reference must be cleared + /// or set to nullptr when no longer needed, to decrease the count and allow + /// unregistration. + /// \return a TRITONSERVER_Error indicating success or + /// failure. TRITONSERVER_Error* GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id); + int64_t* device_id, std::shared_ptr* shm_info); #ifdef TRITON_ENABLE_GPU /// Get the CUDA memory handle associated with the block name. @@ -139,50 +188,8 @@ class SharedMemoryManager { TRITONSERVER_Error* UnregisterHelper( const std::string& name, TRITONSERVER_MemoryType memory_type); - /// A struct that records the shared memory regions registered by the shared - /// memory manager. 
- struct SharedMemoryInfo { - SharedMemoryInfo( - const std::string& name, const std::string& shm_key, - const size_t offset, const size_t byte_size, int shm_fd, - void* mapped_addr, const TRITONSERVER_MemoryType kind, - const int64_t device_id) - : name_(name), shm_key_(shm_key), offset_(offset), - byte_size_(byte_size), shm_fd_(shm_fd), mapped_addr_(mapped_addr), - kind_(kind), device_id_(device_id) - { - } - - std::string name_; - std::string shm_key_; - size_t offset_; - size_t byte_size_; - int shm_fd_; - void* mapped_addr_; - TRITONSERVER_MemoryType kind_; - int64_t device_id_; - }; - -#ifdef TRITON_ENABLE_GPU - struct CUDASharedMemoryInfo : SharedMemoryInfo { - CUDASharedMemoryInfo( - const std::string& name, const std::string& shm_key, - const size_t offset, const size_t byte_size, int shm_fd, - void* mapped_addr, const TRITONSERVER_MemoryType kind, - const int64_t device_id, const cudaIpcMemHandle_t* cuda_ipc_handle) - : SharedMemoryInfo( - name, shm_key, offset, byte_size, shm_fd, mapped_addr, kind, - device_id), - cuda_ipc_handle_(*cuda_ipc_handle) - { - } - - cudaIpcMemHandle_t cuda_ipc_handle_; - }; -#endif - using SharedMemoryStateMap = - std::map>; + std::map>; // A map between the name and the details of the associated // shared memory block SharedMemoryStateMap shared_memory_map_; From 363bcdcd03cddcd00979c7fd3315557328221c6d Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Wed, 11 Sep 2024 16:27:28 -0700 Subject: [PATCH 27/44] build/test: RHEL8 EA3 (#7595) --- build.py | 11 +++++++---- qa/L0_sequence_batcher/test.sh | 21 ++++++++++++++++++--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/build.py b/build.py index 3195c50cbb..4d4d911468 100755 --- a/build.py +++ b/build.py @@ -1374,12 +1374,15 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach if enable_gpu: df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine) - df += """ + # This segment will break the RHEL SBSA build. Need to determine whether + # this is necessary to incorporate. + if target_platform() != "rhel": + df += """ # Extra defensive wiring for CUDA Compat lib RUN ln -sf ${_CUDA_COMPAT_PATH}/lib.real ${_CUDA_COMPAT_PATH}/lib \\ - && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\ - && ldconfig \\ - && rm -f ${_CUDA_COMPAT_PATH}/lib + && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\ + && ldconfig \\ + && rm -f ${_CUDA_COMPAT_PATH}/lib """ else: df += add_cpu_libs_to_linux_dockerfile(backends, target_machine) diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh index 23ee387b55..ac34458b4e 100755 --- a/qa/L0_sequence_batcher/test.sh +++ b/qa/L0_sequence_batcher/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -183,6 +183,16 @@ export USE_SINGLE_BUFFER # models4 - four instances with batch-size 1 rm -fr *.log models{0,1,2,4} queue_delay_models && mkdir models{0,1,2,4} queue_delay_models +# Search BACKENDS to determine if a backend should be tested +function should_test_backend() { + local target_backend=$1 + if [[ $(echo "${BACKENDS[@]}" | grep -c "${target_backend}") -ne 0 ]]; then + echo "true" + return + fi + echo "false" +} + # Get the datatype to use based on the backend function get_datatype () { local dtype="int32 bool" @@ -827,8 +837,13 @@ fi ### Start Preserve Ordering Tests ### -# Test only supported on windows currently due to use of python backend models -if [ ${WINDOWS} -ne 1 ]; then +# FIXME: Test only supported on windows currently due to use of python backend models. +# Now that Windows supports the PYBE, we should check that this tests works once Windows +# CI is stable. + +# These subtests use python models. They should not be executed if 'python' is not one +# of the backends under test. +if [[ $(should_test_backend "python") == "true" && !( -v WSL_DISTRO_NAME || -v MSYSTEM )]]; then # Test preserve ordering true/false and decoupled/non-decoupled TEST_CASE=SequenceBatcherPreserveOrderingTest MODEL_PATH=preserve_ordering_models From 68d4c01e4491e6bb033a4063b67eb41b55cb4ea4 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Tue, 17 Sep 2024 23:37:30 +0530 Subject: [PATCH 28/44] Fix: Add mutex lock for state completion check in gRPC streaming to prevent race condition (#7617) --- src/grpc/stream_infer_handler.cc | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 1f554db83c..cf788b1e09 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -537,15 +537,18 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) } else if (state->step_ == Steps::WRITEREADY) { // Finish the state if all the transactions associated with // the state have completed. - if (state->IsComplete()) { - state->context_->DecrementRequestCounter(); - finished = Finish(state); - } else { - LOG_ERROR << "Should not print this! Decoupled should NOT write via " - "WRITEREADY!"; - // Remove the state from the completion queue - std::lock_guard lock(state->step_mtx_); - state->step_ = Steps::ISSUED; + std::lock_guard lk1(state->context_->mu_); + { + if (state->IsComplete()) { + state->context_->DecrementRequestCounter(); + finished = Finish(state); + } else { + LOG_ERROR << "Should not print this! 
Decoupled should NOT write via " + "WRITEREADY!"; + // Remove the state from the completion queue + std::lock_guard lock(state->step_mtx_); + state->step_ = Steps::ISSUED; + } } } } From 7dde2688c3f320d8314bba158efc59b727abc666 Mon Sep 17 00:00:00 2001 From: vd-nv Date: Fri, 20 Sep 2024 06:30:00 +0800 Subject: [PATCH 29/44] Update fetch_models.sh (#7621) --- docs/examples/fetch_models.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/fetch_models.sh b/docs/examples/fetch_models.sh index 5594878b3e..f5aaed85aa 100755 --- a/docs/examples/fetch_models.sh +++ b/docs/examples/fetch_models.sh @@ -37,4 +37,4 @@ mv /tmp/inception_v3_2016_08_28_frozen.pb model_repository/inception_graphdef/1/ # ONNX densenet mkdir -p model_repository/densenet_onnx/1 wget -O model_repository/densenet_onnx/1/model.onnx \ - https://contentmamluswest001.blob.core.windows.net/content/14b2744cf8d6418c87ffddc3f3127242/9502630827244d60a1214f250e3bbca7/08aed7327d694b8dbaee2c97b8d0fcba/densenet121-1.2.onnx + https://github.com/onnx/models/raw/main/validated/vision/classification/densenet-121/model/densenet-7.onnx From dbb064ff930c876b5de46088f675bd60756c2969 Mon Sep 17 00:00:00 2001 From: Harshini Komali <157742537+lkomali@users.noreply.github.com> Date: Fri, 20 Sep 2024 14:47:08 -0700 Subject: [PATCH 30/44] ci: Set stability factor to a higher value (#7634) --- qa/L0_perf_analyzer_capi/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_perf_analyzer_capi/test.sh b/qa/L0_perf_analyzer_capi/test.sh index d031e2cacf..3e3f9e4af6 100755 --- a/qa/L0_perf_analyzer_capi/test.sh +++ b/qa/L0_perf_analyzer_capi/test.sh @@ -56,7 +56,7 @@ SHAPETENSORADTAFILE=`pwd`/../common/perf_analyzer_input_data_json/shape_tensor_d ERROR_STRING="error | Request count: 0 | : 0 infer/sec" -STABILITY_THRESHOLD="15" +STABILITY_THRESHOLD="9999" source ../common/util.sh From 92255d7a754679985e12649b5d8887259031e9d6 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:08:40 -0700 Subject: [PATCH 31/44] [docs] Removed vLLM meetup announcement (#7673) --- README.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/README.md b/README.md index da80cc3a2b..63b23a9c15 100644 --- a/README.md +++ b/README.md @@ -28,17 +28,6 @@ # Triton Inference Server -📣 **vLLM x Triton Meetup at Fort Mason on Sept 9th 4:00 - 9:00 pm** - -We are excited to announce that we will be hosting our Triton user meetup with the vLLM team at -[Fort Mason](https://maps.app.goo.gl/9Lr3fxRssrpQCGK58) on Sept 9th 4:00 - 9:00 pm. Join us for this -exclusive event where you will learn about the newest vLLM and Triton features, get a -glimpse into the roadmaps, and connect with fellow users, the NVIDIA Triton and vLLM teams. Seating is limited and registration confirmation -is required to attend - please register [here](https://lu.ma/87q3nvnh) to join -the meetup. - -___ - [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) [!WARNING] From bfe2a2bec753e4e9002183b7817c849596b65dbc Mon Sep 17 00:00:00 2001 From: pvijayakrish Date: Wed, 25 Sep 2024 16:33:21 -0700 Subject: [PATCH 32/44] Update the versions post 24.09 release. 
--- build.py | 2 +- deploy/aws/values.yaml | 2 +- deploy/fleetcommand/Chart.yaml | 2 +- deploy/fleetcommand/values.yaml | 6 +++--- deploy/gcp/values.yaml | 2 +- .../perf-analyzer-script/triton_client.yaml | 2 +- .../server-deployer/build_and_push.sh | 6 +++--- .../server-deployer/chart/triton/Chart.yaml | 4 ++-- .../server-deployer/chart/triton/values.yaml | 6 +++--- .../server-deployer/data-test/schema.yaml | 2 +- .../server-deployer/schema.yaml | 4 ++-- .../gke-marketplace-app/trt-engine/README.md | 6 +++--- deploy/k8s-onprem/values.yaml | 2 +- deploy/oci/values.yaml | 2 +- docs/customization_guide/build.md | 6 +++--- docs/customization_guide/compose.md | 18 +++++++++--------- docs/customization_guide/test.md | 2 +- docs/generate_docs.py | 4 ++-- docs/user_guide/custom_operations.md | 6 +++--- docs/user_guide/performance_tuning.md | 4 ++-- qa/common/gen_jetson_trt_models | 2 +- qa/common/gen_qa_custom_ops | 2 +- qa/common/gen_qa_model_repository | 2 +- 23 files changed, 47 insertions(+), 47 deletions(-) diff --git a/build.py b/build.py index 4d4d911468..36aaa161bc 100755 --- a/build.py +++ b/build.py @@ -72,7 +72,7 @@ TRITON_VERSION_MAP = { "2.50.0dev": ( "24.09dev", # triton container - "24.08", # upstream container + "24.09", # upstream container "1.18.1", # ORT "2024.0.0", # ORT OpenVINO "2024.0.0", # Standalone OpenVINO diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml index 67ecba6c53..bd8ae0fe3b 100644 --- a/deploy/aws/values.yaml +++ b/deploy/aws/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.08-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml index 68aaf8f405..8feee92b3c 100644 --- a/deploy/fleetcommand/Chart.yaml +++ b/deploy/fleetcommand/Chart.yaml @@ -26,7 +26,7 @@ apiVersion: v1 # appVersion is the Triton version; update when changing release -appVersion: "2.49.0" +appVersion: "2.50.0" description: Triton Inference Server (Fleet Command) name: triton-inference-server # version is the Chart version; update when changing anything in the chart diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml index f3036d5bee..dc5f37ca3b 100644 --- a/deploy/fleetcommand/values.yaml +++ b/deploy/fleetcommand/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.08-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent numGpus: 1 serverCommand: tritonserver @@ -47,13 +47,13 @@ image: # # To set model control mode, uncomment and configure below # TODO: Fix the following url, it is invalid - # See https://github.com/triton-inference-server/server/blob/r24.08/docs/model_management.md + # See https://github.com/triton-inference-server/server/blob/r24.09/docs/model_management.md # for more details #- --model-control-mode=explicit|poll|none # # Additional server args # - # see https://github.com/triton-inference-server/server/blob/r24.08/README.md + # see https://github.com/triton-inference-server/server/blob/r24.09/README.md # for more details service: diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml index c25bcf58ce..c5427c151e 100644 --- a/deploy/gcp/values.yaml +++ b/deploy/gcp/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.08-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 
pullPolicy: IfNotPresent modelRepositoryPath: gs://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml index 4b896a1ac7..a63a12ce34 100644 --- a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml +++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml @@ -33,7 +33,7 @@ metadata: namespace: default spec: containers: - - image: nvcr.io/nvidia/tritonserver:24.08-py3-sdk + - image: nvcr.io/nvidia/tritonserver:24.09-py3-sdk imagePullPolicy: Always name: nv-triton-client securityContext: diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh index cc5fa998b4..19d84816a0 100755 --- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh +++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh @@ -27,9 +27,9 @@ export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/') export APP_NAME=tritonserver -export MAJOR_VERSION=2.49 -export MINOR_VERSION=2.49.0 -export NGC_VERSION=24.08-py3 +export MAJOR_VERSION=2.50 +export MINOR_VERSION=2.50.0 +export NGC_VERSION=24.09-py3 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml index 41e2e8cdb9..e9f8880a0b 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. apiVersion: v1 -appVersion: "2.49" +appVersion: "2.50" description: Triton Inference Server name: triton-inference-server -version: 2.49.0 +version: 2.50.0 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml index 7f8a96608f..450d8f735c 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml @@ -31,14 +31,14 @@ maxReplicaCount: 3 tritonProtocol: HTTP # HPA GPU utilization autoscaling target HPATargetAverageValue: 85 -modelRepositoryPath: gs://triton_sample_models/24.08 -publishedVersion: '2.49.0' +modelRepositoryPath: gs://triton_sample_models/24.09 +publishedVersion: '2.50.0' gcpMarketplace: true image: registry: gcr.io repository: nvidia-ngc-public/tritonserver - tag: 24.08-py3 + tag: 24.09-py3 pullPolicy: IfNotPresent # modify the model repository here to match your GCP storage bucket numGpus: 1 diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml index 356b3cce0f..16494b5261 100644 --- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.49.0' + publishedVersion: '2.50.0' publishedVersionMetadata: releaseNote: >- Initial release. 
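The chart keys bumped above (`image.tag`, `publishedVersion`, `modelRepositoryPath`) are ordinary Helm values, so they can also be pinned at install time. A minimal sketch only; the release name, namespace and bucket below are made-up placeholders, not part of this patch:

```bash
# Hypothetical overrides; key names mirror server-deployer/chart/triton/values.yaml
helm install triton-server ./chart/triton \
  --namespace triton \
  --set image.tag=24.09-py3 \
  --set publishedVersion=2.50.0 \
  --set modelRepositoryPath=gs://my-model-bucket/model_repository
```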
diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml index 076f62e05b..f3525a52f1 100644 --- a/deploy/gke-marketplace-app/server-deployer/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.49.0' + publishedVersion: '2.50.0' publishedVersionMetadata: releaseNote: >- Initial release. @@ -89,7 +89,7 @@ properties: modelRepositoryPath: type: string title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. - default: gs://triton_sample_models/24.08 + default: gs://triton_sample_models/24.09 image.ldPreloadPath: type: string title: Leave this empty by default. Triton allows users to create custom layers for backend such as TensorRT plugin or Tensorflow custom ops, the compiled shared library must be provided via LD_PRELOAD environment variable. diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md index aa8fa2a399..0c8012eb68 100644 --- a/deploy/gke-marketplace-app/trt-engine/README.md +++ b/deploy/gke-marketplace-app/trt-engine/README.md @@ -33,7 +33,7 @@ ``` docker run --gpus all -it --network host \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ - -v ~:/scripts nvcr.io/nvidia/tensorrt:24.08-py3 + -v ~:/scripts nvcr.io/nvidia/tensorrt:24.09-py3 pip install onnx six torch tf2onnx tensorflow @@ -57,7 +57,7 @@ mkdir -p engines python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh -gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.08/bert/1/model.plan +gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.09/bert/1/model.plan ``` -For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.08/` should be updated accordingly with the correct version. +For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.09/` should be updated accordingly with the correct version. 
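The trt-engine README above hard-codes the release twice, once in the TensorRT container tag and once in the `gs://triton_sample_models/...` path, which is why it must be edited on every upgrade. A rough sketch of the same two steps with the version factored into a variable; `NGC_VERSION` is an assumed placeholder, not something defined by this patch:

```bash
# NGC_VERSION is a hypothetical variable, e.g. 24.09
export NGC_VERSION=24.09

docker run --gpus all -it --network host \
    --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
    -v ~:/scripts nvcr.io/nvidia/tensorrt:${NGC_VERSION}-py3

# ... build bert_large_int8_bs1_s128.engine inside the container as described above ...

gsutil cp bert_large_int8_bs1_s128.engine \
    gs://triton_sample_models/${NGC_VERSION}/bert/1/model.plan
```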
diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml index f0f28b68e1..ccee5e9c24 100644 --- a/deploy/k8s-onprem/values.yaml +++ b/deploy/k8s-onprem/values.yaml @@ -29,7 +29,7 @@ tags: loadBalancing: true image: - imageName: nvcr.io/nvidia/tritonserver:24.08-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent modelRepositoryServer: < Replace with the IP Address of your file server > modelRepositoryPath: /srv/models diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml index bf83490db4..55b8193ee2 100644 --- a/deploy/oci/values.yaml +++ b/deploy/oci/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.08-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository numGpus: 1 diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index f0f3bd99e2..56e5875776 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -173,7 +173,7 @@ $ ./build.py ... --repo-tag=common: --repo-tag=core:` will default to the branch name. For example, if you are building on the -r24.08 branch, `` will default to r24.08. If you are +r24.09 branch, `` will default to r24.09. If you are building on any other branch (including the *main* branch) then `` will default to "main". Therefore, you typically do not need to provide `` at all (nor the preceding @@ -334,8 +334,8 @@ python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild If you are building on *main* branch then `` will default to "main". If you are building on a release branch then `` will default to the branch name. For example, if you -are building on the r24.08 branch, `` will default to -r24.08. Therefore, you typically do not need to provide `` will default to +r24.09. Therefore, you typically do not need to provide `` at all (nor the preceding colon). You can use a different `` for a component to instead use the corresponding branch/tag in the build. For example, if you have a branch called diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md index 563061c317..0c6afc1e0b 100644 --- a/docs/customization_guide/compose.md +++ b/docs/customization_guide/compose.md @@ -46,8 +46,8 @@ The `compose.py` script can be found in the Simply clone the repository and run `compose.py` to create a custom container. Note: Created container version will depend on the branch that was cloned. For example branch - [r24.08](https://github.com/triton-inference-server/server/tree/r24.08) -should be used to create a image based on the NGC 24.08 Triton release. + [r24.09](https://github.com/triton-inference-server/server/tree/r24.09) +should be used to create a image based on the NGC 24.09 Triton release. `compose.py` provides `--backend`, `--repoagent` options that allow you to specify which backends and repository agents to include in the custom image. 
@@ -79,20 +79,20 @@ For example, running ``` python3 compose.py --backend pytorch --repoagent checksum ``` -on branch [r24.08](https://github.com/triton-inference-server/server/tree/r24.08) pulls: -- `min` container `nvcr.io/nvidia/tritonserver:24.08-py3-min` -- `full` container `nvcr.io/nvidia/tritonserver:24.08-py3` +on branch [r24.09](https://github.com/triton-inference-server/server/tree/r24.09) pulls: +- `min` container `nvcr.io/nvidia/tritonserver:24.09-py3-min` +- `full` container `nvcr.io/nvidia/tritonserver:24.09-py3` Alternatively, users can specify the version of Triton container to pull from any branch by either: 1. Adding flag `--container-version ` to branch ``` -python3 compose.py --backend pytorch --repoagent checksum --container-version 24.08 +python3 compose.py --backend pytorch --repoagent checksum --container-version 24.09 ``` 2. Specifying `--image min, --image full,`. The user is responsible for specifying compatible `min` and `full` containers. ``` -python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.08-py3-min --image full,nvcr.io/nvidia/tritonserver:24.08-py3 +python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.09-py3-min --image full,nvcr.io/nvidia/tritonserver:24.09-py3 ``` Method 1 and 2 will result in the same composed container. Furthermore, `--image` flag overrides the `--container-version` flag when both are specified. @@ -103,8 +103,8 @@ Note: 2. vLLM and TensorRT-LLM backends are currently not supported backends for `compose.py`. If you want to build additional backends on top of these backends, it would be better to [build it yourself](#build-it-yourself) by using -`nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3` or -`nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3` as a `min` container. +`nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3` or +`nvcr.io/nvidia/tritonserver:24.09-trtllm-python-py3` as a `min` container. ### CPU-only container composition diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md index 898267e34f..8487e6e3ad 100644 --- a/docs/customization_guide/test.md +++ b/docs/customization_guide/test.md @@ -49,7 +49,7 @@ $ ./gen_qa_custom_ops ``` This will create multiple model repositories in /tmp/\/qa_* -(for example /tmp/24.08/qa_model_repository). The TensorRT models +(for example /tmp/24.09/qa_model_repository). The TensorRT models will be created for the GPU on the system that CUDA considers device 0 (zero). If you have multiple GPUs on your system see the documentation in the scripts for how to target a specific GPU. diff --git a/docs/generate_docs.py b/docs/generate_docs.py index 3cb9de4bc6..cb7ed02d9f 100755 --- a/docs/generate_docs.py +++ b/docs/generate_docs.py @@ -43,11 +43,11 @@ """ TODO: Needs to handle cross-branch linkage. -For example, server/docs/user_guide/architecture.md on branch 24.08 links to +For example, server/docs/user_guide/architecture.md on branch 24.09 links to server/docs/user_guide/model_analyzer.md on main branch. In this case, the hyperlink of model_analyzer.md should be a URL instead of relative path. -Another example can be server/docs/user_guide/model_analyzer.md on branch 24.08 +Another example can be server/docs/user_guide/model_analyzer.md on branch 24.09 links to a file in server repo with relative path. Currently all URLs are hardcoded to main branch. We need to make sure that the URL actually points to the correct branch. 
We also need to handle cases like deprecated or removed files from diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md index 6fa3cee3dc..88a7037c7f 100644 --- a/docs/user_guide/custom_operations.md +++ b/docs/user_guide/custom_operations.md @@ -64,7 +64,7 @@ simple way to ensure you are using the correct version of TensorRT is to use the [NGC TensorRT container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) corresponding to the Triton container. For example, if you are using -the 24.08 version of Triton, use the 24.08 version of the TensorRT +the 24.09 version of Triton, use the 24.09 version of the TensorRT container. ## TensorFlow @@ -123,7 +123,7 @@ simple way to ensure you are using the correct version of TensorFlow is to use the [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) corresponding to the Triton container. For example, if you are using -the 24.08 version of Triton, use the 24.08 version of the TensorFlow +the 24.09 version of Triton, use the 24.09 version of the TensorFlow container. ## PyTorch @@ -167,7 +167,7 @@ simple way to ensure you are using the correct version of PyTorch is to use the [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) corresponding to the Triton container. For example, if you are using -the 24.08 version of Triton, use the 24.08 version of the PyTorch +the 24.09 version of Triton, use the 24.09 version of the PyTorch container. ## ONNX diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index 70e76cd5ef..efea32a63b 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -235,7 +235,7 @@ with a `tritonserver` binary. 
```bash # Start server container -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.08-py3 +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.09-py3 # Start serving your models tritonserver --model-repository=/mnt/models @@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u ```bash # Start the SDK container interactively -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.08-py3-sdk +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.09-py3-sdk # Benchmark model being served from step 3 perf_analyzer -m densenet_onnx --concurrency-range 1:4 diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 892b8dd383..21e9fe53ff 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -34,7 +34,7 @@ # Make all generated files accessible outside of container umask 0000 # Set the version of the models -TRITON_VERSION=${TRITON_VERSION:=24.08} +TRITON_VERSION=${TRITON_VERSION:=24.09} # Set the CUDA device to use CUDA_DEVICE=${RUNNER_ID:=0} # Set TensorRT image diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops index 8864da69f5..286052914b 100755 --- a/qa/common/gen_qa_custom_ops +++ b/qa/common/gen_qa_custom_ops @@ -37,7 +37,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.08} +TRITON_VERSION=${TRITON_VERSION:=24.09} NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION} TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$NVIDIA_UPSTREAM_VERSION-tf2-py3} PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3} diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index 900b8fdc03..f26ba863ce 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -48,7 +48,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.08} +TRITON_VERSION=${TRITON_VERSION:=24.09} # ONNX. Use ONNX_OPSET 0 to use the default for ONNX version ONNX_VERSION=1.13.0 From 8a66af03642fd23b11566346d697f2bd0a0e4f4f Mon Sep 17 00:00:00 2001 From: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com> Date: Tue, 10 Sep 2024 22:38:50 -0700 Subject: [PATCH 33/44] Build: Update triton version in Map (#7610) --- build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.py b/build.py index 36aaa161bc..35d14a98f4 100755 --- a/build.py +++ b/build.py @@ -70,8 +70,8 @@ # incorrectly load the other version of the openvino libraries. 
# TRITON_VERSION_MAP = { - "2.50.0dev": ( - "24.09dev", # triton container + "2.50.0": ( + "24.09", # triton container "24.09", # upstream container "1.18.1", # ORT "2024.0.0", # ORT OpenVINO From 535445802752b49069f69f49ddcc1abb7fbf2f29 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Fri, 6 Sep 2024 17:47:33 -0700 Subject: [PATCH 34/44] Update versions post 24.09 --- Dockerfile.sdk | 2 +- README.md | 8 ++++---- TRITON_VERSION | 2 +- build.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile.sdk b/Dockerfile.sdk index c7a68fc6af..5ddaf7274f 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -29,7 +29,7 @@ # # Base image on the minimum Triton container -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.08-py3-min +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.09-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo diff --git a/README.md b/README.md index 63b23a9c15..fb347652fd 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ ##### LATEST RELEASE You are currently on the `main` branch which tracks under-development progress towards the next release. -The current release is version [2.49.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.08 container release on NVIDIA GPU Cloud (NGC). +The current release is version [2.50.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.09 container release on NVIDIA GPU Cloud (NGC). Triton Inference Server is an open source inference serving software that streamlines AI inferencing. Triton enables teams to deploy any AI model from @@ -92,16 +92,16 @@ Inference Server with the ```bash # Step 1: Create the example model repository -git clone -b r24.08 https://github.com/triton-inference-server/server.git +git clone -b r24.09 https://github.com/triton-inference-server/server.git cd server/docs/examples ./fetch_models.sh # Step 2: Launch triton from the NGC Triton container -docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.08-py3 tritonserver --model-repository=/models +docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.09-py3 tritonserver --model-repository=/models # Step 3: Sending an Inference Request # In a separate console, launch the image_client example from the NGC Triton SDK container -docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.08-py3-sdk +docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.09-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg # Inference should return the following diff --git a/TRITON_VERSION b/TRITON_VERSION index 5db7ab5ba3..44f28a05f6 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.50.0dev \ No newline at end of file +2.50.0 \ No newline at end of file diff --git a/build.py b/build.py index 35d14a98f4..3a3310845e 100755 --- a/build.py +++ b/build.py @@ -73,7 +73,7 @@ "2.50.0": ( "24.09", # triton container "24.09", # upstream container - "1.18.1", # ORT + "1.19.2", # ORT "2024.0.0", # ORT OpenVINO "2024.0.0", # Standalone OpenVINO "3.2.6", # DCGM version From 8cfb3b00c98783959ec417b7ca8098d984e68c36 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Tue, 24 Sep 2024 11:46:51 -0700 Subject: [PATCH 35/44] Dockerfile.win10.min - Update dependency versions (#7633) --- Dockerfile.win10.min | 14 +++++++------- 1 file changed, 7 
insertions(+), 7 deletions(-) diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min index 29d2c2a43a..dec972eaf3 100644 --- a/Dockerfile.win10.min +++ b/Dockerfile.win10.min @@ -37,9 +37,9 @@ RUN choco install unzip -y # # Installing TensorRT # -ARG TENSORRT_VERSION=10.3.0.26 -ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.5.zip" -ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/zip/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5.zip +ARG TENSORRT_VERSION=10.4.0.26 +ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows.win10.cuda-12.6.zip" +ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip # COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP} ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP} RUN unzip /tmp/%TENSORRT_ZIP% @@ -51,9 +51,9 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" # # Installing cuDNN # -ARG CUDNN_VERSION=9.3.0.75 +ARG CUDNN_VERSION=9.4.0.58 ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip -ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.2.1.18_cuda12-archive.zip +ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.4.0.58_cuda12-archive.zip ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} RUN unzip /tmp/%CUDNN_ZIP% RUN move cudnn-* cudnn @@ -175,7 +175,7 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" -ARG CUDNN_VERSION=9.3.0.75 +ARG CUDNN_VERSION=9.4.0.58 ENV CUDNN_VERSION ${CUDNN_VERSION} COPY --from=dependency_base /cudnn /cudnn RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." @@ -183,7 +183,7 @@ RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\." LABEL CUDNN_VERSION="${CUDNN_VERSION}" -ARG TENSORRT_VERSION=10.3.0.26 +ARG TENSORRT_VERSION=10.4.0.26 ENV TRT_VERSION ${TENSORRT_VERSION} COPY --from=dependency_base /TensorRT /TensorRT RUN setx PATH "c:\TensorRT\lib;%PATH%" From 8709680833c8a262c8ca2d533e76f334fbde952c Mon Sep 17 00:00:00 2001 From: pvijayakrish Date: Thu, 26 Sep 2024 00:09:35 -0700 Subject: [PATCH 36/44] Update server versions post 24.09 --- TRITON_VERSION | 2 +- build.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/TRITON_VERSION b/TRITON_VERSION index 44f28a05f6..124ddb483d 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.50.0 \ No newline at end of file +2.51.0dev \ No newline at end of file diff --git a/build.py b/build.py index 3a3310845e..fdb7b47554 100755 --- a/build.py +++ b/build.py @@ -70,8 +70,8 @@ # incorrectly load the other version of the openvino libraries. 
# TRITON_VERSION_MAP = { - "2.50.0": ( - "24.09", # triton container + "2.51.0dev": ( + "24.10dev", # triton container "24.09", # upstream container "1.19.2", # ORT "2024.0.0", # ORT OpenVINO From dea11317abb92241829e3c62ce2b062ea80c0fab Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Wed, 2 Oct 2024 12:34:54 -0700 Subject: [PATCH 37/44] ci: Reducing flakiness of `L0_python_api` (#7674) --- qa/L0_python_api/test_kserve.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py index 703d86ca43..f9af5b3b22 100644 --- a/qa/L0_python_api/test_kserve.py +++ b/qa/L0_python_api/test_kserve.py @@ -241,11 +241,19 @@ def callback(user_data, result, error): time_out = time_out - 1 time.sleep(1) + # Depending on when gRPC frontend shut down StatusCode can vary + acceptable_failure_msgs = [ + "[StatusCode.CANCELLED] CANCELLED", + "[StatusCode.UNAVAILABLE] failed to connect to all addresses", + ] + assert ( len(user_data) == 1 and isinstance(user_data[0], InferenceServerException) - and "[StatusCode.UNAVAILABLE] failed to connect to all addresses" - in str(user_data[0]) + and any( + failure_msg in str(user_data[0]) + for failure_msg in acceptable_failure_msgs + ) ) teardown_client(grpc_client) From 19f76842966bcd6a59c938b375ecea67b27768c3 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Wed, 2 Oct 2024 17:09:46 -0700 Subject: [PATCH 38/44] [doc]Adjusted formatting of the warning (#7675) --- README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fb347652fd..36ef51f279 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,10 @@ [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) -[!WARNING] - -##### LATEST RELEASE -You are currently on the `main` branch which tracks under-development progress towards the next release. -The current release is version [2.50.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.09 container release on NVIDIA GPU Cloud (NGC). +>[!WARNING] +>You are currently on the `main` branch which tracks under-development progress +>towards the next release. The current release is version [2.50.0](https://github.com/triton-inference-server/server/releases/latest) +>and corresponds to the 24.09 container release on NVIDIA GPU Cloud (NGC). Triton Inference Server is an open source inference serving software that streamlines AI inferencing. 
Triton enables teams to deploy any AI model from From 6edd5c650e20c79293d86a2a44914e42cd5bf483 Mon Sep 17 00:00:00 2001 From: v-shobhit <161510941+v-shobhit@users.noreply.github.com> Date: Sun, 6 Oct 2024 20:39:47 -0700 Subject: [PATCH 39/44] fix: usage of ReadDataFromJson in array tensors (#7624) Co-authored-by: Sai Kiran Polisetty --- qa/L0_http/generate_endpoint_test.py | 31 ++++++++++++++++--- .../generate_models/mock_llm/config.pbtxt | 8 ++++- qa/L0_http/test.sh | 2 +- src/http_server.cc | 2 ++ 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py index a9a972e02a..3eb0b6ea5f 100755 --- a/qa/L0_http/generate_endpoint_test.py +++ b/qa/L0_http/generate_endpoint_test.py @@ -142,6 +142,21 @@ def test_generate(self): self.assertIn("TEXT", data) self.assertEqual(text, data["TEXT"]) + def test_generate_with_all_inputs(self): + # Setup text-based input + text = "hello world" + inputs = {"PROMPT": text, "STREAM": False, "input_ids": [100, 200]} + + r = self.generate(self._model_name, inputs) + r.raise_for_status() + + self.assertIn("Content-Type", r.headers) + self.assertEqual(r.headers["Content-Type"], "application/json") + + data = r.json() + self.assertIn("TEXT", data) + self.assertEqual(text, data["TEXT"]) + def test_request_id(self): # Setup text based input text = "hello world" @@ -220,18 +235,26 @@ def test_missing_inputs(self): ] for inputs in missing_all_inputs: self.generate_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 0" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 0", ) self.generate_stream_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 0" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 0", ) for inputs in missing_one_input: self.generate_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 1" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 1", ) self.generate_stream_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 1" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 1", ) def test_invalid_input_types(self): diff --git a/qa/L0_http/generate_models/mock_llm/config.pbtxt b/qa/L0_http/generate_models/mock_llm/config.pbtxt index 6871661525..74a306052a 100644 --- a/qa/L0_http/generate_models/mock_llm/config.pbtxt +++ b/qa/L0_http/generate_models/mock_llm/config.pbtxt @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -41,6 +41,12 @@ input [ name: "STREAM" data_type: TYPE_BOOL dims: [ 1, 1 ] + }, + { + name: "input_ids" + data_type: TYPE_INT32 + dims: [ 1, -1 ] + optional: true } ] diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh index 81ae4c254c..572c527ba4 100755 --- a/qa/L0_http/test.sh +++ b/qa/L0_http/test.sh @@ -662,7 +662,7 @@ fi ## Python Unit Tests TEST_RESULT_FILE='test_results.txt' PYTHON_TEST=generate_endpoint_test.py -EXPECTED_NUM_TESTS=16 +EXPECTED_NUM_TESTS=17 set +e python $PYTHON_TEST >$CLIENT_LOG 2>&1 if [ $? 
-ne 0 ]; then diff --git a/src/http_server.cc b/src/http_server.cc index 2fa395fc98..156c114b77 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -3557,6 +3557,8 @@ HTTPAPIServer::GenerateRequestClass::ExactMappingInput( } } + // get original element count back + element_cnt = tensor_data.IsArray() ? tensor_data.ArraySize() : 1; serialized_data_.emplace_back(); std::vector& serialized = serialized_data_.back(); serialized.resize(byte_size); From b247eb59b1826193f78b76b659e30a85e5fe3b78 Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Mon, 7 Oct 2024 10:06:40 -0700 Subject: [PATCH 40/44] fix: `tritonfrontend` gRPC Streaming Segmentation Fault (#7671) --- docs/customization_guide/tritonfrontend.md | 12 ++- qa/L0_python_api/test_kserve.py | 112 ++++++++++++--------- qa/L0_python_api/testing_utils.py | 80 +++++++++++++++ src/grpc/stream_infer_handler.cc | 14 +-- src/http_server.cc | 11 +- 5 files changed, 168 insertions(+), 61 deletions(-) diff --git a/docs/customization_guide/tritonfrontend.md b/docs/customization_guide/tritonfrontend.md index 0ec4b32749..3b47e4dbee 100644 --- a/docs/customization_guide/tritonfrontend.md +++ b/docs/customization_guide/tritonfrontend.md @@ -25,9 +25,15 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> -### Triton Server (tritonfrontend) Bindings - -The `tritonfrontend` python package is a set of bindings to Triton's existing frontends implemented in C++. Currently, `tritonfrontend` supports starting up `KServeHttp` and `KServeGrpc` frontends. These bindings used in-combination with Triton's Python In-Process API ([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library) extend the ability to use Triton's full feature set with a couple of lines of Python. +### Triton Server (tritonfrontend) Bindings (Beta) + +The `tritonfrontend` python package is a set of bindings to Triton's existing +frontends implemented in C++. Currently, `tritonfrontend` supports starting up +`KServeHttp` and `KServeGrpc` frontends. These bindings used in-combination +with Triton's Python In-Process API +([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) +and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library) +extend the ability to use Triton's full feature set with a few lines of Python. Let us walk through a simple example: 1. First we need to load the desired models and start the server with `tritonserver`. 
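
For context on step 1 of that walkthrough: creating the in-process server is only a few lines with the `tritonserver` bindings. The snippet below is a minimal sketch, not the guide's own listing; the `/workspace/model_repository` path is an assumption, and as the guide itself notes, `model_path` may need to be edited for your setup.

```python
import tritonserver

# Hypothetical local path; point this at your own model repository.
model_path = "/workspace/model_repository"

# Create the in-process server and block until its models are loaded.
server = tritonserver.Server(model_repository=model_path)
server.start(wait_until_ready=True)
```

The `server` handle created here is what the `KServeHttp` and `KServeGrpc` frontends are bound to in the rest of the walkthrough.
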
diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py index f9af5b3b22..021ce9be17 100644 --- a/qa/L0_python_api/test_kserve.py +++ b/qa/L0_python_api/test_kserve.py @@ -29,18 +29,10 @@ import numpy as np import pytest +import testing_utils as utils import tritonclient.grpc as grpcclient import tritonclient.http as httpclient import tritonserver -from testing_utils import ( - send_and_test_inference_identity, - setup_client, - setup_server, - setup_service, - teardown_client, - teardown_server, - teardown_service, -) from tritonclient.utils import InferenceServerException from tritonfrontend import KServeGrpc, KServeHttp @@ -93,33 +85,33 @@ def test_wrong_grpc_parameters(self): class TestKServe: @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) def test_server_ready(self, frontend, client_type, url): - server = setup_server() - service = setup_service(server, frontend) - client = setup_client(client_type, url=url) + server = utils.setup_server() + service = utils.setup_service(server, frontend) + client = utils.setup_client(client_type, url=url) assert client.is_server_ready() - teardown_client(client) - teardown_service(service) - teardown_server(server) + utils.teardown_client(client) + utils.teardown_service(service) + utils.teardown_server(server) @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) def test_service_double_start(self, frontend): - server = setup_server() + server = utils.setup_server() # setup_service() performs service.start() - service = setup_service(server, frontend) + service = utils.setup_service(server, frontend) with pytest.raises( tritonserver.AlreadyExistsError, match="server is already running." ): service.start() - teardown_server(server) - teardown_service(service) + utils.teardown_server(server) + utils.teardown_service(service) @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) def test_invalid_options(self, frontend): - server = setup_server() + server = utils.setup_server() # Current flow is KServeHttp.Options or KServeGrpc.Options have to be # provided to ensure type and range validation occurs. 
with pytest.raises( @@ -128,45 +120,65 @@ def test_invalid_options(self, frontend): ): frontend.Server(server, {"port": 8001}) - teardown_server(server) + utils.teardown_server(server) @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) def test_server_service_order(self, frontend): - server = setup_server() - service = setup_service(server, frontend) + server = utils.setup_server() + service = utils.setup_service(server, frontend) - teardown_server(server) - teardown_service(service) + utils.teardown_server(server) + utils.teardown_service(service) @pytest.mark.parametrize("frontend, client_type", [HTTP_ARGS[:2], GRPC_ARGS[:2]]) def test_service_custom_port(self, frontend, client_type): - server = setup_server() + server = utils.setup_server() options = frontend.Options(port=8005) - service = setup_service(server, frontend, options) - client = setup_client(client_type, url="localhost:8005") + service = utils.setup_service(server, frontend, options) + client = utils.setup_client(client_type, url="localhost:8005") # Confirms that service starts at port 8005 client.is_server_ready() - teardown_client(client) - teardown_service(service) - teardown_server(server) + utils.teardown_client(client) + utils.teardown_service(service) + utils.teardown_server(server) @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) def test_inference(self, frontend, client_type, url): - server = setup_server() - service = setup_service(server, frontend) + server = utils.setup_server() + service = utils.setup_service(server, frontend) # TODO: use common/test_infer - assert send_and_test_inference_identity(client_type, url=url) + assert utils.send_and_test_inference_identity(client_type, url=url) - teardown_service(service) - teardown_server(server) + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) + def test_streaming_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + assert utils.send_and_test_stream_inference(client_type, url) + + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) + def test_http_generate_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + assert utils.send_and_test_generate_inference() + + utils.teardown_service(service) + utils.teardown_server(server) @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) def test_http_req_during_shutdown(self, frontend, client_type, url): - server = setup_server() - http_service = setup_service(server, frontend) + server = utils.setup_server() + http_service = utils.setup_service(server, frontend) http_client = httpclient.InferenceServerClient(url="localhost:8000") model_name = "delayed_identity" delay = 2 # seconds @@ -182,7 +194,7 @@ def test_http_req_during_shutdown(self, frontend, client_type, url): model_name=model_name, inputs=inputs, outputs=outputs ) # http_service.stop() does not use graceful shutdown - teardown_service(http_service) + utils.teardown_service(http_service) # So, inference request will fail as http endpoints have been stopped. with pytest.raises( @@ -194,7 +206,7 @@ def test_http_req_during_shutdown(self, frontend, client_type, url): # However, due to an unsuccessful get_result(), async_request is still # an active thread. 
Hence, join stalls until greenlet timeouts. # Does not throw an exception, but displays error in logs. - teardown_client(http_client) + utils.teardown_client(http_client) # delayed_identity will still be an active model # Hence, server.stop() causes InternalError: Timeout. @@ -202,12 +214,12 @@ def test_http_req_during_shutdown(self, frontend, client_type, url): tritonserver.InternalError, match="Exit timeout expired. Exiting immediately.", ): - teardown_server(server) + utils.teardown_server(server) @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) def test_grpc_req_during_shutdown(self, frontend, client_type, url): - server = setup_server() - grpc_service = setup_service(server, frontend) + server = utils.setup_server() + grpc_service = utils.setup_service(server, frontend) grpc_client = grpcclient.InferenceServerClient(url=url) user_data = [] @@ -234,7 +246,7 @@ def callback(user_data, result, error): callback=partial(callback, user_data), ) - teardown_service(grpc_service) + utils.teardown_service(grpc_service) time_out = delay + 1 while (len(user_data) == 0) and time_out > 0: @@ -256,8 +268,8 @@ def callback(user_data, result, error): ) ) - teardown_client(grpc_client) - teardown_server(server) + utils.teardown_client(grpc_client) + utils.teardown_server(server) # KNOWN ISSUE: CAUSES SEGFAULT # Created [DLIS-7231] to address at future date @@ -265,8 +277,8 @@ def callback(user_data, result, error): # is deleted. However, the frontend does not know the server instance # is no longer valid. # def test_inference_after_server_stop(self): - # server = setup_server() - # http_service = setup_service(server, KServeHttp) + # server = utils.setup_server() + # http_service = utils.setup_service(server, KServeHttp) # http_client = setup_client(httpclient, url="localhost:8000") # teardown_server(server) # Server has been stopped @@ -282,5 +294,5 @@ def callback(user_data, result, error): # results = http_client.infer(model_name, inputs=inputs, outputs=outputs) - # teardown_client(http_client) - # teardown_service(http_service) + # utils.teardown_client(http_client) + # utils.teardown_service(http_service) diff --git a/qa/L0_python_api/testing_utils.py b/qa/L0_python_api/testing_utils.py index 8c63fea89b..4f81c373db 100644 --- a/qa/L0_python_api/testing_utils.py +++ b/qa/L0_python_api/testing_utils.py @@ -25,12 +25,18 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import os +import queue from typing import Union import numpy as np +import requests import tritonserver +from tritonclient.utils import InferenceServerException from tritonfrontend import KServeGrpc, KServeHttp +# TODO: Re-Format documentation to fit: +# https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings + def setup_server(model_repository="test_model_repository") -> tritonserver.Server: module_directory = os.path.split(os.path.abspath(__file__))[0] @@ -93,3 +99,77 @@ def send_and_test_inference_identity(frontend_client, url: str) -> bool: teardown_client(client) return input_data[0] == output_data[0].decode() + + +# Sends a streaming inference request to test_model_repository/identity model +# and verifies input == output +def send_and_test_stream_inference(frontend_client, url: str) -> bool: + model_name = "identity" + + # Setting up the gRPC client stream + results = queue.Queue() + callback = lambda error, result: results.put(error or result) + client = frontend_client.InferenceServerClient(url=url) + + client.start_stream(callback=callback) + + # Preparing Input Data + text_input = "testing" + input_tensor = frontend_client.InferInput( + name="INPUT0", shape=[1], datatype="BYTES" + ) + input_tensor.set_data_from_numpy(np.array([text_input.encode()], dtype=np.object_)) + + # Sending Streaming Inference Request + client.async_stream_infer( + model_name=model_name, inputs=[input_tensor], enable_empty_final_response=True + ) + + # Looping through until exception thrown or request completed + completed_requests, num_requests = 0, 1 + text_output, is_final = None, None + while completed_requests != num_requests: + result = results.get() + if isinstance(result, InferenceServerException): + if result.status() == "StatusCode.CANCELLED": + completed_requests += 1 + raise result + + # Processing Response + text_output = result.as_numpy("OUTPUT0")[0].decode() + + triton_final_response = result.get_response().parameters.get( + "triton_final_response", {} + ) + + is_final = False + if triton_final_response.HasField("bool_param"): + is_final = triton_final_response.bool_param + + # Request Completed + if is_final: + completed_requests += 1 + + # Tearing down gRPC client stream + client.stop_stream(cancel_requests=True) + + return is_final and (text_input == text_output) + + +def send_and_test_generate_inference() -> bool: + model_name = "identity" + url = f"http://localhost:8000/v2/models/{model_name}/generate" + input_text = "testing" + data = { + "INPUT0": input_text, + } + + response = requests.post(url, json=data, stream=True) + if response.status_code == 200: + result = response.json() + output_text = result.get("OUTPUT0", "") + + if output_text == input_text: + return True + + return False diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index cf788b1e09..e912e1512c 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -324,12 +324,14 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) if (err == nullptr) { TRITONSERVER_InferenceTrace* triton_trace = nullptr; #ifdef TRITON_ENABLE_TRACING - GrpcServerCarrier carrier(state->context_->ctx_.get()); - auto start_options = - trace_manager_->GetTraceStartOptions(carrier, request.model_name()); - state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); - if (state->trace_ != nullptr) { - triton_trace = state->trace_->trace_; + if (trace_manager_ != nullptr) { + GrpcServerCarrier carrier(state->context_->ctx_.get()); + auto 
start_options = + trace_manager_->GetTraceStartOptions(carrier, request.model_name()); + state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); + if (state->trace_ != nullptr) { + triton_trace = state->trace_->trace_; + } } #endif // TRITON_ENABLE_TRACING diff --git a/src/http_server.cc b/src/http_server.cc index 156c114b77..99aed411b5 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -1810,6 +1810,10 @@ HTTPAPIServer::HandleTrace(evhtp_request_t* req, const std::string& model_name) } #ifdef TRITON_ENABLE_TRACING + if (trace_manager_ == nullptr) { + return; + } + TRITONSERVER_InferenceTraceLevel level = TRITONSERVER_TRACE_LEVEL_DISABLED; uint32_t rate; int32_t count; @@ -3233,8 +3237,11 @@ HTTPAPIServer::HandleGenerate( // If tracing is enabled see if this request should be traced. TRITONSERVER_InferenceTrace* triton_trace = nullptr; - std::shared_ptr trace = - StartTrace(req, model_name, &triton_trace); + std::shared_ptr trace; + if (trace_manager_) { + // If tracing is enabled see if this request should be traced. + trace = StartTrace(req, model_name, &triton_trace); + } std::map input_metadata; triton::common::TritonJson::Value meta_data_root; From d19c6abf85a45133c124222875b5f8d4ea85b094 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Mon, 7 Oct 2024 15:30:32 -0700 Subject: [PATCH 41/44] test: Enhance Python gRPC streaming test to send multiple requests (#7684) --- qa/L0_python_api/testing_utils.py | 72 +++++++++++-------------------- 1 file changed, 25 insertions(+), 47 deletions(-) diff --git a/qa/L0_python_api/testing_utils.py b/qa/L0_python_api/testing_utils.py index 4f81c373db..79901f7411 100644 --- a/qa/L0_python_api/testing_utils.py +++ b/qa/L0_python_api/testing_utils.py @@ -26,6 +26,7 @@ import os import queue +from functools import partial from typing import Union import numpy as np @@ -101,59 +102,36 @@ def send_and_test_inference_identity(frontend_client, url: str) -> bool: return input_data[0] == output_data[0].decode() -# Sends a streaming inference request to test_model_repository/identity model -# and verifies input == output +# Sends multiple streaming requests to "delayed_identity" model with negligible delays, +# and verifies the inputs matches outputs and the ordering is preserved. 
def send_and_test_stream_inference(frontend_client, url: str) -> bool: - model_name = "identity" - - # Setting up the gRPC client stream - results = queue.Queue() - callback = lambda error, result: results.put(error or result) - client = frontend_client.InferenceServerClient(url=url) - - client.start_stream(callback=callback) - - # Preparing Input Data - text_input = "testing" - input_tensor = frontend_client.InferInput( - name="INPUT0", shape=[1], datatype="BYTES" - ) - input_tensor.set_data_from_numpy(np.array([text_input.encode()], dtype=np.object_)) + num_requests = 100 + requests = [] + for i in range(num_requests): + input0_np = np.array([[float(i) / 1000]], dtype=np.float32) + inputs = [frontend_client.InferInput("INPUT0", input0_np.shape, "FP32")] + inputs[0].set_data_from_numpy(input0_np) + requests.append(inputs) - # Sending Streaming Inference Request - client.async_stream_infer( - model_name=model_name, inputs=[input_tensor], enable_empty_final_response=True - ) - - # Looping through until exception thrown or request completed - completed_requests, num_requests = 0, 1 - text_output, is_final = None, None - while completed_requests != num_requests: - result = results.get() - if isinstance(result, InferenceServerException): - if result.status() == "StatusCode.CANCELLED": - completed_requests += 1 - raise result - - # Processing Response - text_output = result.as_numpy("OUTPUT0")[0].decode() + responses = [] - triton_final_response = result.get_response().parameters.get( - "triton_final_response", {} - ) + def callback(responses, result, error): + responses.append({"result": result, "error": error}) - is_final = False - if triton_final_response.HasField("bool_param"): - is_final = triton_final_response.bool_param - - # Request Completed - if is_final: - completed_requests += 1 + client = frontend_client.InferenceServerClient(url=url) + client.start_stream(partial(callback, responses)) + for inputs in requests: + client.async_stream_infer("delayed_identity", inputs) + client.stop_stream() + teardown_client(client) - # Tearing down gRPC client stream - client.stop_stream(cancel_requests=True) + assert len(responses) == num_requests + for i in range(len(responses)): + assert responses[i]["error"] is None + output0_np = responses[i]["result"].as_numpy(name="OUTPUT0") + assert np.allclose(output0_np, [[float(i) / 1000]]) - return is_final and (text_input == text_output) + return True # test passed def send_and_test_generate_inference() -> bool: From 4dbb1b9d6803036fa48fad7c2dfef9a0be86125d Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:09:39 -0700 Subject: [PATCH 42/44] refactor: Removing `Server` subclass from `tritonfrontend` (#7683) --- docs/customization_guide/tritonfrontend.md | 6 +- qa/L0_python_api/test_kserve.py | 2 +- qa/L0_python_api/testing_utils.py | 2 +- src/python/examples/example.py | 3 +- src/python/tritonfrontend/_api/_kservegrpc.py | 83 +++++++++---------- src/python/tritonfrontend/_api/_kservehttp.py | 73 ++++++++-------- 6 files changed, 83 insertions(+), 86 deletions(-) diff --git a/docs/customization_guide/tritonfrontend.md b/docs/customization_guide/tritonfrontend.md index 3b47e4dbee..763ab82fb9 100644 --- a/docs/customization_guide/tritonfrontend.md +++ b/docs/customization_guide/tritonfrontend.md @@ -59,11 +59,11 @@ Note: `model_path` may need to be edited depending on your setup. 
```python from tritonfrontend import KServeHttp, KServeGrpc http_options = KServeHttp.Options(thread_count=5) -http_service = KServeHttp.Server(server, http_options) +http_service = KServeHttp(server, http_options) http_service.start() # Default options (if none provided) -grpc_service = KServeGrpc.Server(server) +grpc_service = KServeGrpc(server) grpc_service.start() ``` @@ -110,7 +110,7 @@ from tritonfrontend import KServeHttp import tritonclient.http as httpclient import numpy as np # Use version numpy < 2 -with KServeHttp.Server(server) as http_service: +with KServeHttp(server) as http_service: # The identity model returns an exact duplicate of the input data as output model_name = "identity" url = "localhost:8000" diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py index 021ce9be17..9e8b82eb43 100644 --- a/qa/L0_python_api/test_kserve.py +++ b/qa/L0_python_api/test_kserve.py @@ -118,7 +118,7 @@ def test_invalid_options(self, frontend): tritonserver.InvalidArgumentError, match="Incorrect type for options. options argument must be of type", ): - frontend.Server(server, {"port": 8001}) + frontend(server, {"port": 8001}) utils.teardown_server(server) diff --git a/qa/L0_python_api/testing_utils.py b/qa/L0_python_api/testing_utils.py index 79901f7411..48cb3ccc37 100644 --- a/qa/L0_python_api/testing_utils.py +++ b/qa/L0_python_api/testing_utils.py @@ -64,7 +64,7 @@ def setup_service( frontend: Union[KServeHttp, KServeGrpc], options=None, ) -> Union[KServeHttp, KServeGrpc]: - service = frontend.Server(server=server, options=options) + service = frontend(server=server, options=options) service.start() return service diff --git a/src/python/examples/example.py b/src/python/examples/example.py index a1fba6e9d1..2d2ca78920 100644 --- a/src/python/examples/example.py +++ b/src/python/examples/example.py @@ -51,7 +51,7 @@ def main(): http_options = KServeHttp.Options(port=8005) # or http_service = KServeHttp.Server(server, http_options) & http_service.stop() - with KServeHttp.Server(server, http_options) as http_service: + with KServeHttp(server, http_options) as http_service: # The identity model returns an exact duplicate of the input data as output model_name = "identity" url = "localhost:8005" @@ -74,7 +74,6 @@ def main(): output_data = results.as_numpy("OUTPUT0") print("--------------------- INFERENCE RESULTS ---------------------") - print("Input data:", input_data) print("Output data:", output_data) print("-------------------------------------------------------------") diff --git a/src/python/tritonfrontend/_api/_kservegrpc.py b/src/python/tritonfrontend/_api/_kservegrpc.py index 5471613340..b8f199ac53 100644 --- a/src/python/tritonfrontend/_api/_kservegrpc.py +++ b/src/python/tritonfrontend/_api/_kservegrpc.py @@ -90,48 +90,47 @@ def __post_init__(self): if isinstance(self.infer_compression_level, Grpc_compression_level): self.infer_compression_level = self.infer_compression_level.value - class Server: - def __init__(self, server: tritonserver, options: "KServeGrpc.Options" = None): - try: - server_ptr = server._ptr() # TRITONSERVER_Server pointer - - # If no options provided, default options are selected - if options is None: - options = KServeGrpc.Options() - - if not isinstance(options, KServeGrpc.Options): - raise InvalidArgumentError( - "Incorrect type for options. 
options argument must be of type KServeGrpc.Options" - ) - - # Converts dataclass instance -> python dictionary -> unordered_map> - options_dict: dict[str, Union[int, bool, str]] = options.__dict__ - - self.triton_frontend = TritonFrontendGrpc(server_ptr, options_dict) - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - # raise ... from None masks the tritonfrontend Error from being added in traceback - raise ERROR_MAPPING[exc_type](exc_value) from None - - def __enter__(self): + def __init__(self, server: tritonserver, options: "KServeGrpc.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer + + # If no options provided, default options are selected + if options is None: + options = KServeGrpc.Options() + + if not isinstance(options, KServeGrpc.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeGrpc.Options" + ) + + # Converts dataclass instance -> python dictionary -> unordered_map> + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + + self.triton_frontend = TritonFrontendGrpc(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None + + def __enter__(self): + self.triton_frontend.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None + + def start(self): + try: self.triton_frontend.start() - return self + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None - def __exit__(self, exc_type, exc_value, traceback): + def stop(self): + try: self.triton_frontend.stop() - if exc_type: - raise ERROR_MAPPING[exc_type](exc_value) from None - - def start(self): - try: - self.triton_frontend.start() - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - raise ERROR_MAPPING[exc_type](exc_value) from None - - def stop(self): - try: - self.triton_frontend.stop() - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - raise ERROR_MAPPING[exc_type](exc_value) from None + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None diff --git a/src/python/tritonfrontend/_api/_kservehttp.py b/src/python/tritonfrontend/_api/_kservehttp.py index 6a2524986a..4a5abef4a3 100644 --- a/src/python/tritonfrontend/_api/_kservehttp.py +++ b/src/python/tritonfrontend/_api/_kservehttp.py @@ -50,48 +50,47 @@ class Options: # DLIS-7215: Add restricted protocol support # restricted_protocols: list - class Server: - def __init__(self, server: tritonserver, options: "KServeHttp.Options" = None): - try: - server_ptr = server._ptr() # TRITONSERVER_Server pointer + def __init__(self, server: tritonserver, options: "KServeHttp.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer - # If no options provided, default options are selected - if options is None: - options = KServeHttp.Options() + # If no options provided, default options are selected + if options is None: + options = KServeHttp.Options() - if not isinstance(options, KServeHttp.Options): - raise InvalidArgumentError( - "Incorrect type for options. 
options argument must be of type KServeHttp.Options" - ) + if not isinstance(options, KServeHttp.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeHttp.Options" + ) - options_dict: dict[str, Union[int, bool, str]] = options.__dict__ - # Converts dataclass instance -> python dictionary -> unordered_map> + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + # Converts dataclass instance -> python dictionary -> unordered_map> - self.triton_frontend = TritonFrontendHttp(server_ptr, options_dict) - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - # raise ... from None masks the tritonfrontend Error from being added in traceback - raise ERROR_MAPPING[exc_type](exc_value) from None + self.triton_frontend = TritonFrontendHttp(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None - def __enter__(self): - self.triton_frontend.start() - return self + def __enter__(self): + self.triton_frontend.start() + return self - def __exit__(self, exc_type, exc_value, traceback): - self.triton_frontend.stop() - if exc_type: - raise ERROR_MAPPING[exc_type](exc_value) from None + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None - def start(self): - try: - self.triton_frontend.start() - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - raise ERROR_MAPPING[exc_type](exc_value) from None + def start(self): + try: + self.triton_frontend.start() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None - def stop(self): - try: - self.triton_frontend.stop() - except TritonError: - exc_type, exc_value, _ = sys.exc_info() - raise ERROR_MAPPING[exc_type](exc_value) from None + def stop(self): + try: + self.triton_frontend.stop() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None From da05094930edfdd1c7bf557fb1957e831eccc79f Mon Sep 17 00:00:00 2001 From: pranavm-nvidia <49246958+pranavm-nvidia@users.noreply.github.com> Date: Tue, 8 Oct 2024 14:50:03 -0400 Subject: [PATCH 43/44] feat: Add copyright hook (#7666) --- .pre-commit-config.yaml | 12 +- LICENSE | 2 +- tools/add_copyright.py | 365 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 377 insertions(+), 2 deletions(-) create mode 100644 tools/add_copyright.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f44f815351..663a36d631 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -72,3 +72,13 @@ repos: - id: mixed-line-ending - id: requirements-txt-fixer - id: trailing-whitespace + +- repo: local + hooks: + - id: add-license + name: Add License + entry: python tools/add_copyright.py + language: python + stages: [pre-commit] + verbose: true + require_serial: true diff --git a/LICENSE b/LICENSE index 5529809efc..914565ec7d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. +Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/tools/add_copyright.py b/tools/add_copyright.py new file mode 100644 index 0000000000..34432bb0c6 --- /dev/null +++ b/tools/add_copyright.py @@ -0,0 +1,365 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import argparse +import os +import re +import subprocess +import sys +from datetime import datetime +from typing import Callable, Dict, Optional, Sequence + +current_year = str(datetime.now().year) + +ROOT_DIR = os.path.join(os.path.dirname(__file__), os.path.pardir) + +LICENSE_PATH = os.path.join(ROOT_DIR, "LICENSE") + +COPYRIGHT_YEAR_PAT = re.compile( + r"Copyright( \(c\))? (\d{4})?-?(\d{4}), NVIDIA CORPORATION" +) + + +def has_copyright(content: str) -> bool: + return COPYRIGHT_YEAR_PAT.search(content) + + +def update_copyright_year( + path: str, content: Optional[str] = None, disallow_range: bool = False +) -> str: + """ + Updates the copyright year in the provided file. + If the copyright is not present in the file, this function has no effect. + """ + if content is None: + with open(path, "r") as f: + content = f.read() + + match = COPYRIGHT_YEAR_PAT.search(content) + min_year = match.groups()[1] or match.groups()[2] + + new_copyright = f"Copyright{match.groups()[0] or ''} " + if min_year < current_year and not disallow_range: + new_copyright += f"{min_year}-{current_year}" + else: + new_copyright += f"{current_year}" + new_copyright += ", NVIDIA CORPORATION" + + updated_content = COPYRIGHT_YEAR_PAT.sub(new_copyright, content) + + if content != updated_content: + with open(path, "w") as f: + f.write(updated_content) + + +def update_and_get_license() -> str: + """ + Updates the copyright year in the LICENSE file if necessary and then + returns its contents. 
+ """ + # TODO: Check if this is right - if the license file needs to have a range, + # we need to remove the range before returning the license text. + # + # License file should always have the current year. + update_copyright_year(LICENSE_PATH, disallow_range=True) + + with open(LICENSE_PATH, "r") as license_file: + return license_file.read() + + +LICENSE_TEXT = update_and_get_license() + +# +# Header manipulation helpers +# + + +def prefix_lines(content: str, prefix: str) -> str: + # NOTE: This could have been done via `textwrap.indent`, but we're not actually indenting, + # so it seems semantically wrong to do that. + return prefix + f"\n{prefix}".join(content.splitlines()) + + +def insert_after(regex: str) -> Callable[[str], str]: + """ + Builds a callback that will insert a provided header after + the specified regular expression. If the expression is not + found in the file contents, the header will be inserted at the + beginning of the file. + + Args: + regex: The regular expression to match. + + Returns: + A callable that can be used as the `add_header` argument to `update_or_add_header`. + """ + + def add_header(header: str, content: str) -> str: + match = re.match(regex, content) + + if match is None: + return header + "\n" + content + + insertion_point = match.span()[-1] + + return content[:insertion_point] + f"{header}\n" + content[insertion_point:] + + return add_header + + +def update_or_add_header( + path: str, header: str, add_header: Optional[Callable[[str, str], str]] = None +): + """ + Updates in place or adds a new copyright header to the specified file. + + Args: + path: The path of the file. + header: The contents of the copyright header. + add_header: A callback that receives the copyright header and file contents and + controls how the contents of the file are updated. By default, the copyright + header is prepended to the file. + """ + with open(path, "r") as f: + content = f.read() + + if has_copyright(content): + update_copyright_year(path, content) + return + + add_header = add_header or (lambda header, content: header + "\n" + content) + + content = add_header(header, content) + + # As a sanity check, make sure we didn't accidentally add the copyright header + # twice, or add a new header when one was already present. + if content.count("Copyright (c)") != 1: + print( + f"WARNING: Something went wrong while processing: {path}!\n" + "Please check if the copyright header was included twice or wasn't added at all. " + ) + + with open(path, "w") as f: + f.write(content) + + +# Each file type requires slightly different handling when inserting the copyright +# header. For example, for C++ files, the header must be prefixed with `//` and for +# shell scripts, it must be prefixed with `#` and must be inserted *after* the shebang. +# +# This mapping stores callables that return whether a handler wants to process a specified +# file based on the path along with callables that will accept the file path and update +# it with the copyright header. +FILE_TYPE_HANDLERS: Dict[Callable[[str], bool], Callable[[str], None]] = {} + + +# +# Path matching callables +# These allow registered functions to more easily specify what kinds of +# paths they should be applied to. 
+# +def has_ext(exts: Sequence[str]): + def has_ext_impl(path: str): + _, ext = os.path.splitext(path) + return ext in exts + + return has_ext_impl + + +def basename_is(expected_path: str): + return lambda path: os.path.basename(path) == expected_path + + +def path_contains(expected: str): + return lambda path: expected in path + + +def any_of(*funcs: Sequence[Callable[[str], bool]]): + return lambda path: any(func(path) for func in funcs) + + +# +# File handlers for different types of files. +# Many types of files require very similar handling - those are combined where possible. +# + + +def register(match: Callable[[str], bool]): + def register_impl(func): + FILE_TYPE_HANDLERS[match] = func + return func + + return register_impl + + +@register( + any_of( + has_ext([".py", ".sh", ".bash", ".yaml", ".pbtxt"]), + basename_is("CMakeLists.txt"), + path_contains("Dockerfile"), + ) +) +def py_or_shell_like(path): + update_or_add_header( + path, + prefix_lines(LICENSE_TEXT, "# "), + # Insert the header *after* the shebang. + # NOTE: This could break if there is a shebang-like pattern elsewhere in the file. + # In that case, this could be edited to check only the first line of the file (after removing whitespace). + insert_after(r"#!(.*)\n"), + ) + + +@register(has_ext([".cc", ".h"])) +def cpp(path): + update_or_add_header(path, prefix_lines(LICENSE_TEXT, "// ")) + + +@register(has_ext([".tpl"])) +def tpl(path): + update_or_add_header(path, "{{/*\n" + prefix_lines(LICENSE_TEXT, "# ") + "\n*/}}") + + +@register(has_ext([".html", ".md"])) +def html_md(path): + update_or_add_header(path, "") + + +def add_copyrights(paths): + for path in paths: + for match, handler in FILE_TYPE_HANDLERS.items(): + if match(path): + handler(path) + break + else: + print( + f"WARNING: No handler registered for file: {path}. Please add a new handler to {__file__}!" + ) + + subprocess.run(["git", "add"] + paths) + + print(f"Processed copyright headers for {len(paths)} file(s).") + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Adds copyright headers to source files" + ) + parser.add_argument("files", nargs="*") + + args, _ = parser.parse_known_args() + add_copyrights(args.files) + return 0 + + +if __name__ == "__main__": + # sys.exit is important here to avoid the test-related imports below during normal execution. + sys.exit(main()) + + +# +# Integration Tests +# +import tempfile + +import pytest + + +# Processes provided text through the copyright hook by writing it to a temporary file. +def process_text(content, extension): + with tempfile.NamedTemporaryFile("w+", suffix=extension) as f: + f.write(content) + f.flush() + + add_copyrights([f.name]) + + f.seek(0) + return f.read() + + +# We use this slightly weird hack to make sure the copyright hook does not do a text replacement +# of the parameters in the test, since they look exactly like copyright headers. +def make_copyright_text(text): + return f"Copyright {text}" + + +@pytest.mark.parametrize( + "content, expected", + [ + # Convert to range if the year that's already present is older than the current year. 
+ ( + make_copyright_text("(c) 2018, NVIDIA CORPORATION"), + make_copyright_text(f"(c) 2018-{current_year}, NVIDIA CORPORATION"), + ), + ( + make_copyright_text("2018, NVIDIA CORPORATION"), + make_copyright_text(f"2018-{current_year}, NVIDIA CORPORATION"), + ), + # No effect if the year is current: + ( + make_copyright_text(f"(c) {current_year}, NVIDIA CORPORATION"), + make_copyright_text(f"(c) {current_year}, NVIDIA CORPORATION"), + ), + ( + make_copyright_text(f"{current_year}, NVIDIA CORPORATION"), + make_copyright_text(f"{current_year}, NVIDIA CORPORATION"), + ), + # If there is already a range, update the upper bound of the range: + ( + make_copyright_text("(c) 2018-2023, NVIDIA CORPORATION"), + make_copyright_text(f"(c) 2018-{current_year}, NVIDIA CORPORATION"), + ), + ], +) +def test_copyright_update(content, expected): + # We don't really care about the extension here - just needs to be something the hook will recognize. + assert process_text(content, ".py") == expected + + +@pytest.mark.parametrize( + "content, extension, expected", + [ + ("", ".cc", f"// {make_copyright_text(f'(c) {current_year}')}"), + ("", ".h", f"// {make_copyright_text(f'(c) {current_year}')}"), + ("", ".py", f"# {make_copyright_text(f'(c) {current_year}')}"), + ("", ".sh", f"# {make_copyright_text(f'(c) {current_year}')}"), + # Make sure copyright comes after shebangs + ( + "#!/bin/python\n", + ".py", + f"#!/bin/python\n# {make_copyright_text(f'(c) {current_year}')}", + ), + ( + "#!/bin/bash\n", + ".sh", + f"#!/bin/bash\n# {make_copyright_text(f'(c) {current_year}')}", + ), + ], +) +def test_adding_new_copyrights(content, extension, expected): + assert process_text(content, extension).startswith(expected) + + +def test_license_has_no_range(): + assert LICENSE_TEXT.startswith(f"Copyright (c) {current_year},") From fde6e5887775b1236e25192601b6cd1d7abe3620 Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Tue, 8 Oct 2024 17:10:48 -0700 Subject: [PATCH 44/44] build: Adding `tritonfrontend` to `build.py` (#7681) Co-authored-by: Ryan McCormick --- build.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build.py b/build.py index fdb7b47554..14301f843d 100755 --- a/build.py +++ b/build.py @@ -1849,11 +1849,11 @@ def core_build( os.path.join(repo_install_dir, "lib", "libtritonserver.so"), os.path.join(install_dir, "lib"), ) - # [FIXME] Placing the Triton server wheel file in 'python' for now, should - # have been upload to pip registry and be able to install directly + # [FIXME] Placing the tritonserver and tritonfrontend wheel files in 'python' for now, + # should be uploaded to pip registry to be able to install directly cmake_script.mkdir(os.path.join(install_dir, "python")) cmake_script.cp( - os.path.join(repo_install_dir, "python", "tritonserver*.whl"), + os.path.join(repo_install_dir, "python", "triton*.whl"), os.path.join(install_dir, "python"), )
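
With the `tritonfrontend` wheel now staged next to the `tritonserver` wheel by `build.py`, the pieces touched across these patches can be exercised end to end from Python. The sketch below is a hedged illustration rather than the project's documented example: the model repository path, the `identity` model name, and the default gRPC port of 8001 are assumptions drawn from the QA tests in this series.

```python
import numpy as np
import tritonclient.grpc as grpcclient
import tritonserver
from tritonfrontend import KServeGrpc

# Assumed repository containing the "identity" model used by the QA tests.
server = tritonserver.Server(model_repository="/workspace/model_repository")
server.start(wait_until_ready=True)

# Attach the gRPC frontend; default options are assumed to listen on port 8001.
grpc_service = KServeGrpc(server)
grpc_service.start()

# Drive a simple inference through the standard tritonclient gRPC client.
client = grpcclient.InferenceServerClient(url="localhost:8001")
input_tensor = grpcclient.InferInput("INPUT0", [1], "BYTES")
input_tensor.set_data_from_numpy(np.array(["hello".encode()], dtype=np.object_))
result = client.infer(model_name="identity", inputs=[input_tensor])
print(result.as_numpy("OUTPUT0")[0].decode())

client.close()
grpc_service.stop()
server.stop()
```

Both frontends also support the context-manager form shown in the tritonfrontend guide, which calls `start()` on entry and `stop()` on exit automatically.
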