diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f44f815351..663a36d631 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -72,3 +72,13 @@ repos: - id: mixed-line-ending - id: requirements-txt-fixer - id: trailing-whitespace + +- repo: local + hooks: + - id: add-license + name: Add License + entry: python tools/add_copyright.py + language: python + stages: [pre-commit] + verbose: true + require_serial: true diff --git a/CMakeLists.txt b/CMakeLists.txt index ff578c9724..56cb346dc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,17 +125,13 @@ FetchContent_Declare( # Some libs are installed to ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib64 instead # of ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib on Centos -set (LIB_DIR "lib") -# /etc/os-release does not exist on Windows -if(EXISTS "/etc/os-release") - file(STRINGS /etc/os-release DISTRO REGEX "^NAME=") - string(REGEX REPLACE "NAME=\"(.*)\"" "\\1" DISTRO "${DISTRO}") - message(STATUS "Distro Name: ${DISTRO}") - if(DISTRO MATCHES "CentOS.*") +set(LIB_DIR "lib") +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") set (LIB_DIR "lib64") - endif() -endif() - + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) set(TRITON_CORE_HEADERS_ONLY OFF) FetchContent_MakeAvailable(repo-third-party repo-core) diff --git a/Dockerfile.QA b/Dockerfile.QA index 2c43f735a5..68ab519b41 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -267,6 +267,12 @@ RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \ cp /workspace/tritonbuild/python/examples/decoupled/square_config.pbtxt \ qa/L0_decoupled/python_models/square_int32/. +RUN mkdir -p qa/L0_decoupled_grpc_error && \ + cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error + +RUN mkdir -p qa/L0_grpc_error_state_cleanup && \ + cp -r qa/L0_grpc_state_cleanup/. qa/L0_grpc_error_state_cleanup + RUN mkdir -p qa/L0_repoagent_checksum/models/identity_int32/1 && \ cp tritonbuild/identity/install/backends/identity/libtriton_identity.so \ qa/L0_repoagent_checksum/models/identity_int32/1/. @@ -384,6 +390,10 @@ RUN rm -fr qa/L0_copyrights qa/L0_build_variants && \ RUN find qa/pkgs/ -maxdepth 1 -type f -name \ "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade {}[all] +# Install Triton Frontend Python API +RUN find qa/pkgs/ -type f -name \ + "tritonfrontend-*.whl" | xargs -I {} pip3 install --upgrade {}[all] + ENV LD_LIBRARY_PATH /opt/tritonserver/qa/clients:${LD_LIBRARY_PATH} # DLIS-3631: Needed to run Perf Analyzer CI tests correctly diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 1524b5ead3..5ddaf7274f 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -29,10 +29,11 @@ # # Base image on the minimum Triton container -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-py3-min +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.09-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo +ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server ARG TRITON_COMMON_REPO_TAG=main ARG TRITON_CORE_REPO_TAG=main ARG TRITON_CLIENT_REPO_TAG=main @@ -217,6 +218,7 @@ WORKDIR /workspace COPY TRITON_VERSION . 
COPY NVIDIA_Deep_Learning_Container_License.pdf . COPY --from=sdk_build /workspace/client/ client/ +COPY --from=sdk_build /workspace/perf_analyzer/ perf_analyzer/ COPY --from=sdk_build /workspace/install/ install/ RUN cd install && \ export VERSION=`cat /workspace/TRITON_VERSION` && \ diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min index 0a554fbcf4..dec972eaf3 100644 --- a/Dockerfile.win10.min +++ b/Dockerfile.win10.min @@ -37,9 +37,9 @@ RUN choco install unzip -y # # Installing TensorRT # -ARG TENSORRT_VERSION=10.2.0.19 -ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.5.zip" -ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/zip/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5.zip +ARG TENSORRT_VERSION=10.4.0.26 +ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows.win10.cuda-12.6.zip" +ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip # COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP} ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP} RUN unzip /tmp/%TENSORRT_ZIP% @@ -51,9 +51,9 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" # # Installing cuDNN # -ARG CUDNN_VERSION=9.2.1.18 +ARG CUDNN_VERSION=9.4.0.58 ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip -ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.2.1.18_cuda12-archive.zip +ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.4.0.58_cuda12-archive.zip ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} RUN unzip /tmp/%CUDNN_ZIP% RUN move cudnn-* cudnn @@ -101,14 +101,14 @@ LABEL CMAKE_VERSION=${CMAKE_VERSION} # # Installing Visual Studio BuildTools: VS17 2022 # -ARG BUILDTOOLS_VERSION=17.9.34622.214 +ARG BUILDTOOLS_VERSION=17.10.35201.131 # Download collect.exe in case of an install failure. ADD https://aka.ms/vscollect.exe "C:\tmp\collect.exe" # Use the latest release channel. For more control, specify the location of an internal layout. # Download the Build Tools bootstrapper. # ARG BUILD_TOOLS_SOURCE=https://aka.ms/vs/17/release/vs_buildtools.exe -ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/5e7b923b-7d89-4e14-95b8-a84ab168e243/96b21d216c7954aaf606c6d7ba59a3de991884a8a86c578c767ba349c23188a9/vs_BuildTools.exe +ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/28626b4b-f88f-4b55-a0cf-f3eaa2c643fb/e6c43d4dfb36338d954cdb3ad9010ab2a479e712088f4f6b016eadcc721bab28/vs_BuildTools.exe ADD ${BUILD_TOOLS_SOURCE} vs_buildtools.exe # Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools workload, including recommended. ARG VS_INSTALL_PATH_WP="C:\BuildTools" @@ -175,7 +175,7 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" -ARG CUDNN_VERSION=9.2.1.18 +ARG CUDNN_VERSION=9.4.0.58 ENV CUDNN_VERSION ${CUDNN_VERSION} COPY --from=dependency_base /cudnn /cudnn RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." @@ -183,7 +183,7 @@ RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\." 
LABEL CUDNN_VERSION="${CUDNN_VERSION}" -ARG TENSORRT_VERSION=10.2.0.19 +ARG TENSORRT_VERSION=10.4.0.26 ENV TRT_VERSION ${TENSORRT_VERSION} COPY --from=dependency_base /TensorRT /TensorRT RUN setx PATH "c:\TensorRT\lib;%PATH%" diff --git a/LICENSE b/LICENSE index 5529809efc..914565ec7d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/README.md b/README.md index 17628b4f03..36ef51f279 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,10 @@ [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) -[!WARNING] - -##### LATEST RELEASE -You are currently on the `main` branch which tracks under-development progress towards the next release. -The current release is version [2.48.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.07 container release on NVIDIA GPU Cloud (NGC). +>[!WARNING] +>You are currently on the `main` branch which tracks under-development progress +>towards the next release. The current release is version [2.50.0](https://github.com/triton-inference-server/server/releases/latest) +>and corresponds to the 24.09 container release on NVIDIA GPU Cloud (NGC). Triton Inference Server is an open source inference serving software that streamlines AI inferencing. Triton enables teams to deploy any AI model from @@ -92,16 +91,16 @@ Inference Server with the ```bash # Step 1: Create the example model repository -git clone -b r24.07 https://github.com/triton-inference-server/server.git +git clone -b r24.09 https://github.com/triton-inference-server/server.git cd server/docs/examples ./fetch_models.sh # Step 2: Launch triton from the NGC Triton container -docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.07-py3 tritonserver --model-repository=/models +docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.09-py3 tritonserver --model-repository=/models # Step 3: Sending an Inference Request # In a separate console, launch the image_client example from the NGC Triton SDK container -docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.07-py3-sdk +docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.09-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg # Inference should return the following @@ -179,7 +178,7 @@ configuration](docs/user_guide/model_configuration.md) for the model. [Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md) to learn which backends are supported on your target platform. 
- Learn how to [optimize performance](docs/user_guide/optimization.md) using the - [Performance Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) and [Model Analyzer](https://github.com/triton-inference-server/model_analyzer) - Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in diff --git a/TRITON_VERSION b/TRITON_VERSION index 37433781ef..124ddb483d 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.49.0dev \ No newline at end of file +2.51.0dev \ No newline at end of file diff --git a/build.py b/build.py index 6ab8a58515..14301f843d 100755 --- a/build.py +++ b/build.py @@ -37,6 +37,7 @@ import sys from inspect import getsourcefile +import distro import requests # @@ -69,10 +70,10 @@ # incorrectly load the other version of the openvino libraries. # TRITON_VERSION_MAP = { - "2.49.0dev": ( - "24.08dev", # triton container - "24.07", # upstream container - "1.18.1", # ORT + "2.51.0dev": ( + "24.10dev", # triton container + "24.09", # upstream container + "1.19.2", # ORT "2024.0.0", # ORT OpenVINO "2024.0.0", # Standalone OpenVINO "3.2.6", # DCGM version @@ -115,13 +116,25 @@ def fail_if(p, msg): def target_platform(): - if FLAGS.target_platform is not None: + # When called by compose.py, FLAGS will be None + if FLAGS and FLAGS.target_platform is not None: return FLAGS.target_platform - return platform.system().lower() + platform_string = platform.system().lower() + if platform_string == "linux": + # Need to inspect the /etc/os-release file to get + # the distribution of linux + id_like_list = distro.like().split() + if "debian" in id_like_list: + return "linux" + else: + return "rhel" + else: + return platform_string def target_machine(): - if FLAGS.target_machine is not None: + # When called by compose.py, FLAGS will be None + if FLAGS and FLAGS.target_machine is not None: return FLAGS.target_machine return platform.machine().lower() @@ -203,6 +216,8 @@ def header(self, desc=None): self.comment("Exit script immediately if any command fails") if target_platform() == "windows": + self._file.write("$UseStructuredOutput = $false\n") + self.blankln() self._file.write("function ExitWithCode($exitcode) {\n") self._file.write(" $host.SetShouldExit($exitcode)\n") self._file.write(" exit $exitcode\n") @@ -628,13 +643,16 @@ def pytorch_cmake_args(images): cmake_backend_arg("pytorch", "TRITON_PYTORCH_DOCKER_IMAGE", None, image), ] - if FLAGS.enable_gpu: + # TODO: TPRD-372 TorchTRT extension is not currently supported by our manylinux build + # TODO: TPRD-373 NVTX extension is not currently supported by our manylinux build + if target_platform() != "rhel": + if FLAGS.enable_gpu: + cargs.append( + cmake_backend_enable("pytorch", "TRITON_PYTORCH_ENABLE_TORCHTRT", True) + ) cargs.append( - cmake_backend_enable("pytorch", "TRITON_PYTORCH_ENABLE_TORCHTRT", True) + cmake_backend_enable("pytorch", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx) ) - cargs.append( - cmake_backend_enable("pytorch", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx) - ) return cargs @@ -644,12 +662,15 @@ def onnxruntime_cmake_args(images, library_paths): "onnxruntime", "TRITON_BUILD_ONNXRUNTIME_VERSION", None, - TRITON_VERSION_MAP[FLAGS.version][2], + os.getenv("TRITON_BUILD_ONNXRUNTIME_VERSION") + if os.getenv("TRITON_BUILD_ONNXRUNTIME_VERSION") + else TRITON_VERSION_MAP[FLAGS.version][2], ) ] # TRITON_ENABLE_GPU is already set for all 
backends in backend_cmake_args() - if FLAGS.enable_gpu: + # TODO: TPRD-334 TensorRT extension is not currently supported by our manylinux build + if FLAGS.enable_gpu and target_platform() != "rhel": cargs.append( cmake_backend_enable( "onnxruntime", "TRITON_ENABLE_ONNXRUNTIME_TENSORRT", True @@ -680,8 +701,11 @@ def onnxruntime_cmake_args(images, library_paths): ) ) - if (target_machine() != "aarch64") and ( - TRITON_VERSION_MAP[FLAGS.version][3] is not None + # TODO: TPRD-333 OpenVino extension is not currently supported by our manylinux build + if ( + (target_machine() != "aarch64") + and (target_platform() != "rhel") + and (TRITON_VERSION_MAP[FLAGS.version][3] is not None) ): cargs.append( cmake_backend_enable( @@ -697,7 +721,7 @@ def onnxruntime_cmake_args(images, library_paths): ) ) - if target_platform() == "igpu": + if (target_platform() == "igpu") or (target_platform() == "rhel"): cargs.append( cmake_backend_arg( "onnxruntime", @@ -833,8 +857,31 @@ def install_dcgm_libraries(dcgm_version, target_machine): ) return "" else: - if target_machine == "aarch64": - return """ + # RHEL has the same install instructions for both aarch64 and x86 + if target_platform() == "rhel": + if target_machine == "aarch64": + return """ +ENV DCGM_VERSION {} +# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\ + && dnf clean expire-cache \\ + && dnf install -y datacenter-gpu-manager-{} +""".format( + dcgm_version, dcgm_version + ) + else: + return """ +ENV DCGM_VERSION {} +# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\ + && dnf clean expire-cache \\ + && dnf install -y datacenter-gpu-manager-{} +""".format( + dcgm_version, dcgm_version + ) + else: + if target_machine == "aarch64": + return """ ENV DCGM_VERSION {} # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads RUN curl -o /tmp/cuda-keyring.deb \\ @@ -844,10 +891,10 @@ def install_dcgm_libraries(dcgm_version, target_machine): && apt-get update \\ && apt-get install -y datacenter-gpu-manager=1:{} """.format( - dcgm_version, dcgm_version - ) - else: - return """ + dcgm_version, dcgm_version + ) + else: + return """ ENV DCGM_VERSION {} # Install DCGM. 
Steps from https://developer.nvidia.com/dcgm#Downloads RUN curl -o /tmp/cuda-keyring.deb \\ @@ -857,8 +904,106 @@ def install_dcgm_libraries(dcgm_version, target_machine): && apt-get update \\ && apt-get install -y datacenter-gpu-manager=1:{} """.format( - dcgm_version, dcgm_version - ) + dcgm_version, dcgm_version + ) + + +def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap): + df = """ +ARG TRITON_VERSION={} +ARG TRITON_CONTAINER_VERSION={} +ARG BASE_IMAGE={} +""".format( + argmap["TRITON_VERSION"], + argmap["TRITON_CONTAINER_VERSION"], + argmap["BASE_IMAGE"], + ) + + df += """ +FROM ${BASE_IMAGE} + +ARG TRITON_VERSION +ARG TRITON_CONTAINER_VERSION +""" + df += """ +# Install docker docker buildx +RUN yum install -y ca-certificates curl gnupg yum-utils \\ + && yum-config-manager --add-repo https://download.docker.com/linux/rhel/docker-ce.repo \\ + && yum install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +# && yum install -y docker.io docker-buildx-plugin + +# libcurl4-openSSL-dev is needed for GCS +# python3-dev is needed by Torchvision +# python3-pip and libarchive-dev is needed by python backend +# libxml2-dev is needed for Azure Storage +# scons is needed for armnn_tflite backend build dep +RUN yum install -y \\ + ca-certificates \\ + autoconf \\ + automake \\ + git \\ + gperf \\ + re2-devel \\ + openssl-devel \\ + libtool \\ + libcurl-devel \\ + libb64-devel \\ + gperftools-devel \\ + patchelf \\ + python3.11-devel \\ + python3-pip \\ + python3-setuptools \\ + rapidjson-devel \\ + python3-scons \\ + pkg-config \\ + unzip \\ + wget \\ + zlib-devel \\ + libarchive-devel \\ + libxml2-devel \\ + numactl-devel \\ + wget + +RUN pip3 install --upgrade pip \\ + && pip3 install --upgrade \\ + wheel \\ + setuptools \\ + docker \\ + virtualenv + +# Install boost version >= 1.78 for boost::span +# Current libboost-dev apt packages are < 1.78, so install from tar.gz +RUN wget -O /tmp/boost.tar.gz \\ + https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz \\ + && (cd /tmp && tar xzf boost.tar.gz) \\ + && mv /tmp/boost_1_80_0/boost /usr/include/boost + +# Server build requires recent version of CMake (FetchContent required) +# Might not need this if the installed version of cmake is high enough for our build. +# RUN apt update -q=2 \\ +# && apt install -y gpg wget \\ +# && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \\ +# && . /etc/os-release \\ +# && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \\ +# && apt-get update -q=2 \\ +# && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* +""" + if FLAGS.enable_gpu: + df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine()) + df += """ +ENV TRITON_SERVER_VERSION ${TRITON_VERSION} +ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION} +""" + + df += """ +WORKDIR /workspace +RUN rm -fr * +COPY . . 
+ENTRYPOINT [] +""" + + with open(os.path.join(ddir, dockerfile_name), "w") as dfile: + dfile.write(df) def create_dockerfile_buildbase(ddir, dockerfile_name, argmap): @@ -1161,7 +1306,28 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach fi \\ && [ `id -u $TRITON_SERVER_USER` -eq 1000 ] \\ && [ `id -g $TRITON_SERVER_USER` -eq 1000 ] +""".format( + gpu_enabled=gpu_enabled + ) + if target_platform() == "rhel": + df += """ +# Common dependencies. +RUN yum install -y \\ + git \\ + gperf \\ + re2-devel \\ + openssl-devel \\ + libtool \\ + libcurl-devel \\ + libb64-devel \\ + gperftools-devel \\ + patchelf \\ + wget \\ + numactl-devel +""" + else: + df += """ # Ensure apt-get won't prompt for selecting options ENV DEBIAN_FRONTEND=noninteractive @@ -1184,12 +1350,14 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach wget \\ {backend_dependencies} \\ && rm -rf /var/lib/apt/lists/* +""".format( + backend_dependencies=backend_dependencies + ) + df += """ # Set TCMALLOC_RELEASE_RATE for users setting LD_PRELOAD with tcmalloc ENV TCMALLOC_RELEASE_RATE 200 -""".format( - gpu_enabled=gpu_enabled, backend_dependencies=backend_dependencies - ) +""" if "fastertransformer" in backends: be = "fastertransformer" @@ -1206,12 +1374,15 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach if enable_gpu: df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine) - df += """ + # This segment will break the RHEL SBSA build. Need to determine whether + # this is necessary to incorporate. + if target_platform() != "rhel": + df += """ # Extra defensive wiring for CUDA Compat lib RUN ln -sf ${_CUDA_COMPAT_PATH}/lib.real ${_CUDA_COMPAT_PATH}/lib \\ - && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\ - && ldconfig \\ - && rm -f ${_CUDA_COMPAT_PATH}/lib + && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\ + && ldconfig \\ + && rm -f ${_CUDA_COMPAT_PATH}/lib """ else: df += add_cpu_libs_to_linux_dockerfile(backends, target_machine) @@ -1433,9 +1604,14 @@ def create_build_dockerfiles( ) dockerfileargmap["GPU_BASE_IMAGE"] = gpu_base_image - create_dockerfile_buildbase( - FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap - ) + if target_platform() == "rhel": + create_dockerfile_buildbase_rhel( + FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap + ) + else: + create_dockerfile_buildbase( + FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap + ) if target_platform() == "windows": create_dockerfile_windows( @@ -1647,6 +1823,21 @@ def core_build( os.path.join(repo_install_dir, "bin", "tritonserver.dll"), os.path.join(install_dir, "bin"), ) + cmake_script.cp( + os.path.join(repo_install_dir, "lib", "tritonserver.lib"), + os.path.join(install_dir, "bin"), + ) + elif target_platform() == "rhel": + cmake_script.mkdir(os.path.join(install_dir, "bin")) + cmake_script.cp( + os.path.join(repo_install_dir, "bin", "tritonserver"), + os.path.join(install_dir, "bin"), + ) + cmake_script.mkdir(os.path.join(install_dir, "lib64")) + cmake_script.cp( + os.path.join(repo_install_dir, "lib64", "libtritonserver.so"), + os.path.join(install_dir, "lib64"), + ) else: cmake_script.mkdir(os.path.join(install_dir, "bin")) cmake_script.cp( @@ -1658,11 +1849,11 @@ def core_build( os.path.join(repo_install_dir, "lib", "libtritonserver.so"), os.path.join(install_dir, "lib"), ) - # [FIXME] Placing the Triton server wheel file in 'python' for now, should - have been upload to pip 
registry and be able to install directly + # [FIXME] Placing the tritonserver and tritonfrontend wheel files in 'python' for now, + # should be uploaded to pip registry to be able to install directly cmake_script.mkdir(os.path.join(install_dir, "python")) cmake_script.cp( - os.path.join(repo_install_dir, "python", "tritonserver*.whl"), + os.path.join(repo_install_dir, "python", "triton*.whl"), os.path.join(install_dir, "python"), ) @@ -1802,6 +1993,10 @@ def backend_clone( os.path.join(build_dir, be, "src", "model.py"), backend_dir, ) + clone_script.cpdir( + os.path.join(build_dir, be, "src", "utils"), + backend_dir, + ) clone_script.comment() clone_script.comment(f"end '{be}' backend") @@ -2120,7 +2315,7 @@ def enable_all(): "--target-platform", required=False, default=None, - help='Target platform for build, can be "linux", "windows" or "igpu". If not specified, build targets the current platform.', + help='Target platform for build, can be "linux", "rhel", "windows" or "igpu". If not specified, build targets the current platform.', ) parser.add_argument( "--target-machine", diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml index 98151829c7..bd8ae0fe3b 100644 --- a/deploy/aws/values.yaml +++ b/deploy/aws/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml index 340e19fb50..8feee92b3c 100644 --- a/deploy/fleetcommand/Chart.yaml +++ b/deploy/fleetcommand/Chart.yaml @@ -26,7 +26,7 @@ apiVersion: v1 # appVersion is the Triton version; update when changing release -appVersion: "2.48.0" +appVersion: "2.50.0" description: Triton Inference Server (Fleet Command) name: triton-inference-server # version is the Chart version; update when changing anything in the chart diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml index 7a556ef7df..dc5f37ca3b 100644 --- a/deploy/fleetcommand/values.yaml +++ b/deploy/fleetcommand/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent numGpus: 1 serverCommand: tritonserver @@ -47,13 +47,13 @@ image: # # To set model control mode, uncomment and configure below # TODO: Fix the following url, it is invalid - # See https://github.com/triton-inference-server/server/blob/r24.07/docs/model_management.md + # See https://github.com/triton-inference-server/server/blob/r24.09/docs/model_management.md # for more details #- --model-control-mode=explicit|poll|none # # Additional server args # - # see https://github.com/triton-inference-server/server/blob/r24.07/README.md + # see https://github.com/triton-inference-server/server/blob/r24.09/README.md # for more details service: diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml index 937acc6b80..c5427c151e 100644 --- a/deploy/gcp/values.yaml +++ b/deploy/gcp/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent modelRepositoryPath: gs://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/gke-marketplace-app/README.md b/deploy/gke-marketplace-app/README.md index e99b9efbae..595d4634ab 100644 --- 
a/deploy/gke-marketplace-app/README.md +++ b/deploy/gke-marketplace-app/README.md @@ -1,5 +1,5 @@ +### Triton Server (tritonfrontend) Bindings (Beta) + +The `tritonfrontend` python package is a set of bindings to Triton's existing +frontends implemented in C++. Currently, `tritonfrontend` supports starting up +`KServeHttp` and `KServeGrpc` frontends. These bindings used in-combination +with Triton's Python In-Process API +([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) +and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library) +extend the ability to use Triton's full feature set with a few lines of Python. + +Let us walk through a simple example: +1. First we need to load the desired models and start the server with `tritonserver`. +```python +import tritonserver + +# Constructing path to Model Repository +model_path = f"server/src/python/examples/example_model_repository" + +server_options = tritonserver.Options( + server_id="ExampleServer", + model_repository=model_path, + log_error=True, + log_warn=True, + log_info=True, +) +server = tritonserver.Server(server_options).start(wait_until_ready=True) +``` +Note: `model_path` may need to be edited depending on your setup. + + +2. Now, to start up the respective services with `tritonfrontend` +```python +from tritonfrontend import KServeHttp, KServeGrpc +http_options = KServeHttp.Options(thread_count=5) +http_service = KServeHttp(server, http_options) +http_service.start() + +# Default options (if none provided) +grpc_service = KServeGrpc(server) +grpc_service.start() +``` + +3. Finally, with running services, we can use `tritonclient` or simple `curl` commands to send requests and receive responses from the frontends. + +```python +import tritonclient.http as httpclient +import numpy as np # Use version numpy < 2 +model_name = "identity" # output == input +url = "localhost:8000" + +# Create a Triton client +client = httpclient.InferenceServerClient(url=url) + +# Prepare input data +input_data = np.array([["Roger Roger"]], dtype=object) + +# Create input and output objects +inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + +# Set the data for the input tensor +inputs[0].set_data_from_numpy(input_data) + +results = client.infer(model_name, inputs=inputs) + +# Get the output data +output_data = results.as_numpy("OUTPUT0") + +# Print results +print("[INFERENCE RESULTS]") +print("Output data:", output_data) + +# Stop respective services and server. +http_service.stop() +grpc_service.stop() +server.stop() +``` + +--- + +Additionally, `tritonfrontend` provides context manager support as well. 
So steps 2-3, could also be achieved through: +```python +from tritonfrontend import KServeHttp +import tritonclient.http as httpclient +import numpy as np # Use version numpy < 2 + +with KServeHttp(server) as http_service: + # The identity model returns an exact duplicate of the input data as output + model_name = "identity" + url = "localhost:8000" + # Create a Triton client + with httpclient.InferenceServerClient(url=url) as client: + # Prepare input data + input_data = np.array(["Roger Roger"], dtype=object) + # Create input and output objects + inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + # Perform inference + results = client.infer(model_name, inputs=inputs) + # Get the output data + output_data = results.as_numpy("OUTPUT0") + # Print results + print("[INFERENCE RESULTS]") + print("Output data:", output_data) + +server.stop() +``` +With this workflow, you can avoid having to stop each service after client requests have terminated. + + +## Known Issues +- The following features are not currently supported when launching the Triton frontend services through the python bindings: + - [Tracing](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/trace.md) + - [Shared Memory](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_shared_memory.md) + - [Metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) + - [Restricted Protocols](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#limit-endpoint-access-beta) + - VertexAI + - Sagemaker +- After a running server has been stopped, if the client sends an inference request, a Segmentation Fault will occur. \ No newline at end of file diff --git a/docs/examples/fetch_models.sh b/docs/examples/fetch_models.sh index 5594878b3e..f5aaed85aa 100755 --- a/docs/examples/fetch_models.sh +++ b/docs/examples/fetch_models.sh @@ -37,4 +37,4 @@ mv /tmp/inception_v3_2016_08_28_frozen.pb model_repository/inception_graphdef/1/ # ONNX densenet mkdir -p model_repository/densenet_onnx/1 wget -O model_repository/densenet_onnx/1/model.onnx \ - https://contentmamluswest001.blob.core.windows.net/content/14b2744cf8d6418c87ffddc3f3127242/9502630827244d60a1214f250e3bbca7/08aed7327d694b8dbaee2c97b8d0fcba/densenet121-1.2.onnx + https://github.com/onnx/models/raw/main/validated/vision/classification/densenet-121/model/densenet-7.onnx diff --git a/docs/examples/jetson/README.md b/docs/examples/jetson/README.md index 281d5f2a97..77a20474b9 100644 --- a/docs/examples/jetson/README.md +++ b/docs/examples/jetson/README.md @@ -1,5 +1,5 @@ + +# TensorRT-LLM User Guide + +## What is TensorRT-LLM + +[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) +(TRT-LLM) is an open-source library designed to accelerate and optimize the +inference performance of large language models (LLMs) on NVIDIA GPUs. TRT-LLM +offers users an easy-to-use Python API to build TensorRT engines for LLMs, +incorporating state-of-the-art optimizations to ensure efficient inference on +NVIDIA GPUs. + +## How to run TRT-LLM models with Triton Server via TensorRT-LLM backend + +The +[TensorRT-LLM Backend](https://github.com/triton-inference-server/tensorrtllm_backend) +lets you serve TensorRT-LLM models with Triton Inference Server. 
Check out the +[Getting Started](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#getting-started) +section in the TensorRT-LLM Backend repo to learn how to utilize the +[NGC Triton TRT-LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) +to prepare engines for your LLM models and serve them with Triton. + +## How to use your custom TRT-LLM model + +All the supported models can be found in the +[examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) folder in +the TRT-LLM repo. Follow the examples to convert your models to TensorRT +engines. + +After the engine is built, [prepare the model repository](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#prepare-the-model-repository) +for Triton, and +[modify the model configuration](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#modify-the-model-configuration). + +Only the *mandatory parameters* need to be set in the model config file. Feel free +to modify the optional parameters as needed. To learn more about the +parameters, model inputs, and outputs, see the +[model config documentation](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/model_config.md). + +## Advanced Configuration Options and Deployment Strategies + +Explore advanced configuration options and deployment strategies to optimize +and run Triton with your TRT-LLM models effectively: + +- [Model Deployment](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#model-deployment): Techniques for efficiently deploying and managing your models in various environments. +- [Multi-Instance GPU (MIG) Support](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#mig-support): Run Triton and TRT-LLM models with MIG to optimize GPU resource management. +- [Scheduling](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#scheduling): Configure scheduling policies to control how requests are managed and executed. +- [Key-Value Cache](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#key-value-cache): Utilize KV cache and KV cache reuse to optimize memory usage and improve performance. +- [Decoding](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#decoding): Advanced methods for generating text, including top-k, top-p, top-k top-p, beam search, Medusa, and speculative decoding. +- [Chunked Context](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#chunked-context): Splitting the context into several chunks and batching them during the generation phase to increase overall throughput. +- [Quantization](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#quantization): Apply quantization techniques to reduce model size and enhance inference speed. +- [LoRa (Low-Rank Adaptation)](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#lora): Use LoRa for efficient model fine-tuning and adaptation. + +## Tutorials + +Make sure to check out the +[tutorials](https://github.com/triton-inference-server/tutorials) repo to see +more guides on serving popular LLM models with Triton Server and TensorRT-LLM, +as well as deploying them on Kubernetes. 
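To make the deployment steps above concrete, here is a minimal request sketch (an editorial illustration, not part of this change). It assumes a locally running server on port 8000 and the `ensemble` model with the `text_input`/`max_tokens`/`text_output` tensor names used in the TRT-LLM backend examples; adjust the model and tensor names to match your own configuration.

```python
# Hypothetical example: query a served TRT-LLM model via Triton's HTTP
# generate endpoint. Assumes the "ensemble" model and the tensor names from
# the TRT-LLM backend examples; adjust to your model configuration.
import requests

payload = {"text_input": "What is Triton Inference Server?", "max_tokens": 64}
resp = requests.post(
    "http://localhost:8000/v2/models/ensemble/generate", json=payload, timeout=60
)
resp.raise_for_status()
print(resp.json()["text_output"])
```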
+ +## Benchmark + +[GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) +is a command line tool for measuring the throughput and latency of LLMs served +by Triton Inference Server. Check out the +[Quick Start](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf#quick-start) +to learn how to use GenAI-Perf to benchmark your LLM models. + +## Performance Best Practices + +Check out the +[Performance Best Practices guide](https://nvidia.github.io/TensorRT-LLM/performance/perf-best-practices.html) +to learn how to optimize your TensorRT-LLM models for better performance. + +## Metrics + +Triton Server provides +[metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) +indicating GPU and request statistics. +See the +[Triton Metrics](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#triton-metrics) +section in the TensorRT-LLM Backend repo to learn how to query the Triton +metrics endpoint to obtain TRT-LLM statistics. + +## Ask questions or report issues + +Can't find what you're looking for, or have a question or issue? Feel free to +ask questions or report issues in the GitHub issues page: + +- [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/issues) +- [TensorRT-LLM Backend](https://github.com/triton-inference-server/tensorrtllm_backend/issues) +- [Triton Inference Server](https://github.com/triton-inference-server/server/issues) diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md index 136edd180f..88a7037c7f 100644 --- a/docs/user_guide/custom_operations.md +++ b/docs/user_guide/custom_operations.md @@ -64,7 +64,7 @@ simple way to ensure you are using the correct version of TensorRT is to use the [NGC TensorRT container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the TensorRT +the 24.09 version of Triton, use the 24.09 version of the TensorRT container. ## TensorFlow @@ -123,7 +123,7 @@ simple way to ensure you are using the correct version of TensorFlow is to use the [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the TensorFlow +the 24.09 version of Triton, use the 24.09 version of the TensorFlow container. ## PyTorch @@ -167,7 +167,7 @@ simple way to ensure you are using the correct version of PyTorch is to use the [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the PyTorch +the 24.09 version of Triton, use the 24.09 version of the PyTorch container. ## ONNX diff --git a/docs/user_guide/debugging_guide.md b/docs/user_guide/debugging_guide.md index 3a38f209d3..e5b0263d30 100644 --- a/docs/user_guide/debugging_guide.md +++ b/docs/user_guide/debugging_guide.md @@ -1,5 +1,5 @@ Perf Analyzer documentation has been relocated to -[here](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md). +[here](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md). 
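As a small illustration of the metrics endpoint mentioned in the TRT-LLM guide above (a sketch added for clarity, not part of this change), the Prometheus-format metrics can be polled directly; this assumes the default metrics port 8002 on a local server.

```python
# Hypothetical example: scrape Triton's Prometheus metrics endpoint
# (default port 8002) and print the inference success counters.
import requests

metrics_text = requests.get("http://localhost:8002/metrics", timeout=5).text
for line in metrics_text.splitlines():
    if line.startswith("nv_inference_request_success"):
        print(line)
```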
diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index 49cad9e637..efea32a63b 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -73,7 +73,7 @@ For additional material, see the verify that we can run inference requests and get a baseline performance benchmark of your model. Triton's - [Perf Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) tool specifically fits this purpose. Here is a simplified output for demonstration purposes: @@ -103,7 +103,7 @@ For additional material, see the There are many variables that can be tweaked just within your model configuration (`config.pbtxt`) to obtain different results. - As your model, config, or use case evolves, - [Perf Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) is a great tool to quickly verify model functionality and performance. 3. How can I improve my model performance? @@ -235,7 +235,7 @@ with a `tritonserver` binary. ```bash # Start server container -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.07-py3 +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.09-py3 # Start serving your models tritonserver --model-repository=/mnt/models @@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u ```bash # Start the SDK container interactively -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.07-py3-sdk +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.09-py3-sdk # Benchmark model being served from step 3 perf_analyzer -m densenet_onnx --concurrency-range 1:4 diff --git a/qa/L0_backend_python/argument_validation/test.sh b/qa/L0_backend_python/argument_validation/test.sh index b14ba4abb3..90cbef89b5 100755 --- a/qa/L0_backend_python/argument_validation/test.sh +++ b/qa/L0_backend_python/argument_validation/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./arg_validation_client.log" TEST_RESULT_FILE='test_results.txt' SERVER_ARGS="--model-repository=${MODELDIR}/argument_validation/models --backend-directory=${BACKEND_DIR} --log-verbose=1" diff --git a/qa/L0_backend_python/bls/test.sh b/qa/L0_backend_python/bls/test.sh index 204af7e2ba..46d1f40818 100755 --- a/qa/L0_backend_python/bls/test.sh +++ b/qa/L0_backend_python/bls/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./bls_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh @@ -33,7 +33,7 @@ source ../../common/util.sh TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:=http://github.com/triton-inference-server} RET=0 -rm -fr *.log ./models *.txt +rm -fr *.log ./models *.txt *.xml # FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU if [[ ${TEST_WINDOWS} == 0 ]]; then @@ -119,30 +119,35 @@ if [[ ${TEST_WINDOWS} == 0 ]]; then for MODEL_NAME in bls bls_memory bls_memory_async bls_async; do export MODEL_NAME=${MODEL_NAME} - - python3 -m pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then + # Run with pytest to capture the return code correctly + pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 + EXIT_CODE=$? + if [ $EXIT_CODE -ne 0 ]; then echo -e "\n***\n*** ${MODEL_NAME} ${BLS_KIND} test FAILED. \n***" + RET=$EXIT_CODE cat $SERVER_LOG cat $CLIENT_LOG - RET=1 fi done - set -e - kill_server - # Check for bls 'test_timeout' to ensure timeout value is being correctly passed - if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then - echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***" - cat $SERVER_LOG - RET=1 + set -e + + # Only check the timeout value if there is no error since the test + # may fail before the test_timeout case gets run. + if [ $RET -eq 0 ]; then + # Check for bls 'test_timeout' to ensure timeout value is being correctly passed + if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***" + cat $SERVER_LOG + RET=1 + fi fi - if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 128 ]]; then + if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 256 ]]; then if [ `grep -c "Failed to allocate memory from CUDA memory pool" $SERVER_LOG` != "0" ]; then - echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMOY_POOL_SIZE_MB is 128 MB for 'bls' $BLS_KIND test\n***" + echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMORY_POOL_SIZE_MB is 256 MB for 'bls' $BLS_KIND test\n***" cat $SERVER_LOG RET=1 fi @@ -342,10 +347,10 @@ set -e kill_server -if [ $RET -eq 1 ]; then - echo -e "\n***\n*** BLS test FAILED. \n***" -else +if [ $RET -eq 0 ]; then echo -e "\n***\n*** BLS test PASSED. \n***" +else + echo -e "\n***\n*** BLS test FAILED. \n***" fi exit $RET diff --git a/qa/L0_backend_python/custom_metrics/test.sh b/qa/L0_backend_python/custom_metrics/test.sh index 4491d9e030..9020c7ebfd 100755 --- a/qa/L0_backend_python/custom_metrics/test.sh +++ b/qa/L0_backend_python/custom_metrics/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./custom_metrics_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 883f6d20b6..d6eb2a8f53 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -35,6 +35,7 @@ sys.path.append("../../common") import queue +import threading import time import unittest from functools import partial @@ -241,6 +242,135 @@ def test_infer_pymodel_error(self): initial_metrics_value, ) + # Test grpc stream behavior when triton_grpc_error is set to true. + # Expected to close stream and return GRPC error when model returns error. + def test_triton_grpc_error_error_on(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 2 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + stream_end = False + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + stream_end = True + else: + # Stream is not killed + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + + # Test grpc stream behavior when triton_grpc_error is set to true in multiple open streams. + # Expected to close stream and return GRPC error when model returns error. + def test_triton_grpc_error_multithreaded(self): + thread1 = threading.Thread(target=self.test_triton_grpc_error_error_on) + thread2 = threading.Thread(target=self.test_triton_grpc_error_error_on) + # Start the threads + thread1.start() + thread2.start() + # Wait for both threads to finish + thread1.join() + thread2.join() + + # Test grpc stream behavior when triton_grpc_error is set to true and subsequent stream is cancelled. + # Expected cancellation is successful. 
+ def test_triton_grpc_error_cancel(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 1 + user_data = UserData() + triton_server_url = "localhost:8001" # Replace with your Triton server address + stream_end = False + triton_client = grpcclient.InferenceServerClient(triton_server_url) + + metadata = {"triton_grpc_error": "true"} + + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + stream_end = True + if i == 0: + triton_client.stop_stream(cancel_requests=True) + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + self.assertTrue( + True, + "This should always pass as cancellation should succeed without any exception", + ) + + # Test grpc stream behavior when triton_grpc_error is set to false + # and subsequent stream is NOT closed when error is reported from CORE + def test_triton_grpc_error_error_off(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 4 + response_counter = 0 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + triton_client.start_stream(callback=partial(callback, user_data)) + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + _ = user_data._completed_requests.get() + response_counter += 1 + # we expect response_counter == number_of_requests, + # which indicates that after the first reported grpc error stream did NOT close and mode != triton_grpc_error + self.assertEqual(response_counter, number_of_requests) + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/lifecycle/test.sh b/qa/L0_backend_python/lifecycle/test.sh index dba4581ddd..59b846f56b 100755 --- a/qa/L0_backend_python/lifecycle/test.sh +++ b/qa/L0_backend_python/lifecycle/test.sh @@ -52,6 +52,14 @@ cp ../../python_models/execute_error/config.pbtxt ./models/execute_error/ sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 12000000 }" >> config.pbtxt) +mkdir -p models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/model.py ./models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/config.pbtxt ./models/execute_grpc_error/ +(cd models/execute_grpc_error && \ + sed -i "s/^name:.*/name: \"execute_grpc_error\"/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 1200000 }" >> config.pbtxt) + mkdir -p models/execute_return_error/1/ cp ../../python_models/execute_return_error/model.py 
./models/execute_return_error/1/ cp ../../python_models/execute_return_error/config.pbtxt ./models/execute_return_error/ diff --git a/qa/L0_backend_python/request_rescheduling/test.sh b/qa/L0_backend_python/request_rescheduling/test.sh index 6fd6fe09e5..31ba6692d9 100755 --- a/qa/L0_backend_python/request_rescheduling/test.sh +++ b/qa/L0_backend_python/request_rescheduling/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY="../python_unittest.py" +CLIENT_PY="../test_infer_shm_leak.py" CLIENT_LOG="./request_rescheduling_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh diff --git a/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py b/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py new file mode 100644 index 0000000000..386a54e3d3 --- /dev/null +++ b/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py @@ -0,0 +1,77 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import time +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient + + +class ResponseSenderTest(unittest.TestCase): + def _generate_streaming_callback_and_responses_pair(self): + responses = [] # [{"result": result, "error": error}, ...] 
+ + def callback(result, error): + responses.append({"result": result, "error": error}) + + return callback, responses + + def test_respond_after_complete_final(self): + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Test Passed", server_log) + + model_name = "response_sender_complete_final" + shape = [1, 1] + inputs = [grpcclient.InferInput("INPUT0", shape, "FP32")] + input0_np = np.array([[123.45]], np.float32) + inputs[0].set_data_from_numpy(input0_np) + + callback, responses = self._generate_streaming_callback_and_responses_pair() + with grpcclient.InferenceServerClient("localhost:8001") as client: + client.start_stream(callback) + client.async_stream_infer(model_name, inputs) + client.stop_stream() + + self.assertEqual(len(responses), 1) + for response in responses: + output0_np = response["result"].as_numpy(name="OUTPUT0") + self.assertTrue(np.allclose(input0_np, output0_np)) + self.assertIsNone(response["error"]) + + time.sleep(1) # make sure the logs are written before checking + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Unexpected request length", server_log) + self.assertNotIn("Expected exception not raised", server_log) + self.assertNotIn("Test FAILED", server_log) + self.assertIn("Test Passed", server_log) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/response_sender/test.sh b/qa/L0_backend_python/response_sender/test.sh index 33db46edbb..cca7e7acfa 100755 --- a/qa/L0_backend_python/response_sender/test.sh +++ b/qa/L0_backend_python/response_sender/test.sh @@ -97,6 +97,37 @@ set -e kill $SERVER_PID wait $SERVER_PID +# +# Test response sender to raise exception on response after complete final flag +# +rm -rf models && mkdir models +mkdir -p models/response_sender_complete_final/1 && \ + cp ../../python_models/response_sender_complete_final/model.py models/response_sender_complete_final/1 && \ + cp ../../python_models/response_sender_complete_final/config.pbtxt models/response_sender_complete_final + +TEST_LOG="response_sender_complete_final_test.log" +SERVER_LOG="response_sender_complete_final_test.server.log" +SERVER_ARGS="--model-repository=${MODELDIR}/response_sender/models --backend-directory=${BACKEND_DIR} --log-verbose=1" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=concurrency_test.report.xml response_sender_complete_final_test.py > $TEST_LOG 2>&1 +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** response sender complete final test FAILED\n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + # # Test async response sender under decoupled / non-decoupled # diff --git a/qa/L0_backend_python/setup_python_enviroment.sh b/qa/L0_backend_python/setup_python_enviroment.sh index 88baccc4f6..a2171e02da 100755 --- a/qa/L0_backend_python/setup_python_enviroment.sh +++ b/qa/L0_backend_python/setup_python_enviroment.sh @@ -151,7 +151,7 @@ apt-get update && apt-get -y install \ libboost-dev rm -f /usr/bin/python3 && \ ln -s "/usr/bin/python3.${PYTHON_ENV_VERSION}" /usr/bin/python3 -pip3 install --upgrade install requests numpy virtualenv protobuf +pip3 install --upgrade requests numpy virtualenv protobuf find /opt/tritonserver/qa/pkgs/ -maxdepth 1 -type f -name \ "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \ xargs pip3 install --upgrade diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh index 65767419f2..324ee5ba1f 100755 --- a/qa/L0_backend_python/test.sh +++ b/qa/L0_backend_python/test.sh @@ -39,18 +39,18 @@ fi # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. export TEST_WINDOWS=0 -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then export DATADIR=${DATADIR:="/c/data/inferenceserver/${REPO_VERSION}"} export TRITON_DIR=${TRITON_DIR:=c:/tritonserver} # This will run in WSL, but Triton will run in windows, so environment # variables meant for loaded models must be exported using WSLENV. # The /w flag indicates the value should only be included when invoking # Win32 from WSL. - export WSLENV=TRITON_DIR/w + export WSLENV=TRITON_DIR export SERVER=${SERVER:=c:/tritonserver/bin/tritonserver.exe} export BACKEND_DIR=${BACKEND_DIR:=c:/tritonserver/backends} export MODELDIR=${MODELDIR:=c:/} - TEST_WINDOWS=1 + export TEST_WINDOWS=1 else export DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} export TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} @@ -425,11 +425,20 @@ if [ "$TEST_JETSON" == "0" ]; then # between dependencies. setup_virtualenv + set +e (cd ${TEST} && bash -ex test.sh) - if [ $? -ne 0 ]; then + EXIT_CODE=$? + if [ $EXIT_CODE -ne 0 ]; then echo "Subtest ${TEST} FAILED" - RET=1 + RET=$EXIT_CODE + + # In bls test, it is allowed to fail with a strict memory leak of 480 bytes with exit code '123'. + # Propagate the exit code to make sure it's not overwritten by other tests. + if [[ ${TEST} == "bls" ]] && [[ $EXIT_CODE -ne 1 ]] ; then + BLS_RET=$RET + fi fi + set -e deactivate_virtualenv done @@ -438,11 +447,13 @@ if [ "$TEST_JETSON" == "0" ]; then if [[ ${PYTHON_ENV_VERSION} = "10" ]] && [[ ${TEST_WINDOWS} == 0 ]]; then # In 'env' test we use miniconda for dependency management. No need to run # the test in a virtual environment. + set +e (cd env && bash -ex test.sh) if [ $? -ne 0 ]; then echo "Subtest env FAILED" RET=1 fi + set -e fi fi @@ -459,12 +470,14 @@ for TEST in ${SUBTESTS}; do # between dependencies. setup_virtualenv + set +e (cd ${TEST} && bash -ex test.sh) if [ $? -ne 0 ]; then echo "Subtest ${TEST} FAILED" RET=1 fi + set -e deactivate_virtualenv done @@ -475,4 +488,14 @@ else echo -e "\n***\n*** Test FAILED\n***" fi -exit $RET +# Exit with RET if it is 1, meaning that the test failed. +# Otherwise, exit with BLS_RET if it is set, meaning that the known memory leak is captured. 
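A minimal sketch (not part of the patch) of the Python side that produces the sentinel exit code referenced in the comment above: a probe failure matching the known 480-byte message ends the pytest session with code 123, which the shell logic below then propagates as BLS_RET instead of treating it as a hard failure. The check_probe() helper and its leaked_bytes argument are hypothetical.

import pytest

ALLOWED_FAILURE_EXIT_CODE = 123  # sentinel understood by the calling test.sh

def check_probe(leaked_bytes: int) -> None:
    # Hypothetical helper: decide how a detected shared memory leak is reported.
    if leaked_bytes == 0:
        return
    if leaked_bytes == 480:
        # Known, tolerated leak: stop the session with the sentinel exit code.
        pytest.exit(
            f"Known shared memory leak of {leaked_bytes} bytes detected",
            returncode=ALLOWED_FAILURE_EXIT_CODE,
        )
    raise AssertionError(f"Unexpected shared memory leak of {leaked_bytes} bytes")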
+if [ $RET -eq 1 ]; then + exit $RET +else + if [ -z "$BLS_RET" ]; then + exit $RET + else + exit $BLS_RET + fi +fi diff --git a/qa/L0_backend_python/python_unittest.py b/qa/L0_backend_python/test_infer_shm_leak.py similarity index 75% rename from qa/L0_backend_python/python_unittest.py rename to qa/L0_backend_python/test_infer_shm_leak.py index 4b94996976..966243e86e 100755 --- a/qa/L0_backend_python/python_unittest.py +++ b/qa/L0_backend_python/test_infer_shm_leak.py @@ -33,6 +33,7 @@ import os import unittest +import pytest import shm_util import tritonclient.grpc as grpcclient from tritonclient.utils import * @@ -41,11 +42,13 @@ # we overwrite the IP address with the TRITONSERVER_IPADDR envvar _tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") +# The exit code 123 is used to indicate that the shm leak probe detected a 480 +# bytes leak in the bls sub-test. Any leak other than 480 bytes will cause the +# test to fail with the default exit code 1. +ALLOWED_FAILURE_EXIT_CODE = 123 -class PythonUnittest(unittest.TestCase): - def setUp(self): - self._shm_leak_detector = shm_util.ShmLeakDetector() +class TestInferShmLeak: def _run_unittest(self, model_name): with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client: # No input is required @@ -54,15 +57,17 @@ def _run_unittest(self, model_name): # The model returns 1 if the tests were successfully passed. # Otherwise, it will return 0. - self.assertEqual( - output0, [1], f"python_unittest failed for model {model_name}" - ) - - def test_python_unittest(self): - model_name = os.environ["MODEL_NAME"] - with self._shm_leak_detector.Probe() as shm_probe: - self._run_unittest(model_name) + assert output0 == [1], f"python_unittest failed for model {model_name}" + def test_shm_leak(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + model_name = os.environ.get("MODEL_NAME", "default_model") -if __name__ == "__main__": - unittest.main() + try: + with self._shm_leak_detector.Probe() as shm_probe: + self._run_unittest(model_name) + except AssertionError as e: + if "Known shared memory leak of 480 bytes detected" in str(e): + pytest.exit(str(e), returncode=ALLOWED_FAILURE_EXIT_CODE) + else: + raise e diff --git a/qa/L0_batcher/test.sh b/qa/L0_batcher/test.sh index 827751eb40..7043aab2a5 100755 --- a/qa/L0_batcher/test.sh +++ b/qa/L0_batcher/test.sh @@ -79,7 +79,7 @@ TF_VERSION=${TF_VERSION:=2} # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} @@ -601,7 +601,7 @@ done TEST_CASE=test_multi_batch_preserve_ordering # Skip test for Windows. Trace file concats at 8192 chars on Windows. -if [[ "$(< /proc/sys/kernel/osrelease)" != *microsoft* ]]; then +if [[ ! -v WSL_DISTRO_NAME ]] || [[ ! -v MSYSTEM ]]; then rm -fr ./custom_models && mkdir ./custom_models && \ cp -r ../custom_models/custom_zero_1_float32 ./custom_models/. 
&& \ mkdir -p ./custom_models/custom_zero_1_float32/1 diff --git a/qa/L0_client_build_variants/test.sh b/qa/L0_client_build_variants/test.sh index c31c55e310..9dc1c4c85d 100755 --- a/qa/L0_client_build_variants/test.sh +++ b/qa/L0_client_build_variants/test.sh @@ -58,10 +58,6 @@ TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-i -DTRITON_ENABLE_PYTHON_HTTP=ON \ -DTRITON_ENABLE_PYTHON_GRPC=ON \ -DTRITON_ENABLE_JAVA_HTTP=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=OFF \ @@ -90,10 +86,6 @@ fi -DTRITON_ENABLE_CC_GRPC=ON \ -DTRITON_ENABLE_PYTHON_HTTP=OFF \ -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ @@ -121,10 +113,6 @@ fi -DTRITON_ENABLE_CC_GRPC=OFF \ -DTRITON_ENABLE_PYTHON_HTTP=ON \ -DTRITON_ENABLE_PYTHON_GRPC=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ @@ -141,59 +129,27 @@ else exit 1 fi -# -# Build without Perf Analyzer -# -(cd /workspace/build && \ - rm -fr cc-clients python-clients && \ - cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ - -DTRITON_ENABLE_CC_HTTP=ON \ - -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ - -DTRITON_ENABLE_GPU=ON \ - -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ - -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ - -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) -if [ $? -eq 0 ]; then - echo -e "\n***\n*** No-Perf-Analyzer Passed\n***" -else - echo -e "\n***\n*** No-Perf-Analyzer FAILED\n***" - exit 1 -fi - +# TODO: TPRD-342 These tests should be PA CI test +# cases not Triton test cases +rm -fr /workspace/build +mkdir -p /workspace/build # # Build without C API in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=OFF \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? 
-eq 0 ]; then echo -e "\n***\n*** No-CAPI Passed\n***" else @@ -205,25 +161,20 @@ fi # Build without TensorFlow Serving in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ + rm -fr cc_clients perf_analyzer && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? -eq 0 ]; then echo -e "\n***\n*** No-TF-Serving Passed\n***" else @@ -235,25 +186,20 @@ fi # Build without TorchServe in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ + rm -fr cc_clients perf_analyzer && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? 
-eq 0 ]; then echo -e "\n***\n*** No-TorchServe Passed\n***" else diff --git a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py index 07f9c05a88..51137e8934 100755 --- a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py +++ b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py @@ -31,18 +31,20 @@ sys.path.append("../common") import os +import time import unittest +from functools import partial import infer_util as iu import numpy as np import test_util as tu import tritonclient.grpc as grpcclient import tritonclient.http as httpclient -import tritonshmutils.cuda_shared_memory as cshm +import tritonclient.utils.cuda_shared_memory as cshm from tritonclient.utils import * -class CudaSharedMemoryTest(tu.TestResultCollector): +class CudaSharedMemoryTestBase(tu.TestResultCollector): DEFAULT_SHM_BYTE_SIZE = 64 def setUp(self): @@ -61,76 +63,6 @@ def _setup_client(self): self.url, verbose=True ) - def test_invalid_create_shm(self): - # Raises error since tried to create invalid cuda shared memory region - try: - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - except Exception as ex: - self.assertEqual(str(ex), "unable to create cuda shared memory handle") - - def test_valid_create_set_register(self): - # Create a valid cuda shared memory region, fill data in it and register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - cshm.set_shared_memory_region( - shm_op0_handle, [np.array([1, 2], dtype=np.float32)] - ) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 1) - else: - self.assertEqual(len(shm_status.regions), 1) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_unregister_before_register(self): - # Create a valid cuda shared memory region and unregister before register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.unregister_cuda_shared_memory("dummy_data") - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 0) - else: - self.assertEqual(len(shm_status.regions), 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_unregister_after_register(self): - # Create a valid cuda shared memory region and unregister after register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - self.triton_client.unregister_cuda_shared_memory("dummy_data") - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 0) - else: - self.assertEqual(len(shm_status.regions), 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_reregister_after_register(self): - # Create a valid cuda shared memory region and unregister after register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - try: - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - except Exception as ex: - self.assertIn( - "shared memory region 
'dummy_data' already in manager", str(ex) - ) - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 1) - else: - self.assertEqual(len(shm_status.regions), 1) - cshm.destroy_shared_memory_region(shm_op0_handle) - def _configure_server( self, create_byte_size=DEFAULT_SHM_BYTE_SIZE, @@ -205,6 +137,78 @@ def _cleanup_server(self, shm_handles): for shm_handle in shm_handles: cshm.destroy_shared_memory_region(shm_handle) + +class CudaSharedMemoryTest(CudaSharedMemoryTestBase): + def test_invalid_create_shm(self): + # Raises error since tried to create invalid cuda shared memory region + try: + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + except Exception as ex: + self.assertEqual(str(ex), "unable to create cuda shared memory handle") + + def test_valid_create_set_register(self): + # Create a valid cuda shared memory region, fill data in it and register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + cshm.set_shared_memory_region( + shm_op0_handle, [np.array([1, 2], dtype=np.float32)] + ) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 1) + else: + self.assertEqual(len(shm_status.regions), 1) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_before_register(self): + # Create a valid cuda shared memory region and unregister before register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.unregister_cuda_shared_memory("dummy_data") + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 0) + else: + self.assertEqual(len(shm_status.regions), 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_after_register(self): + # Create a valid cuda shared memory region and unregister after register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + self.triton_client.unregister_cuda_shared_memory("dummy_data") + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 0) + else: + self.assertEqual(len(shm_status.regions), 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_reregister_after_register(self): + # Create a valid cuda shared memory region and unregister after register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + try: + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + except Exception as ex: + self.assertIn( + "shared memory region 'dummy_data' already in manager", str(ex) + ) + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 1) + else: + self.assertEqual(len(shm_status.regions), 1) + cshm.destroy_shared_memory_region(shm_op0_handle) + def test_unregister_after_inference(self): # Unregister after inference error_msg = [] @@ -396,5 +400,169 @@ def 
test_infer_byte_size_out_of_bound(self): self._cleanup_server(shm_handles) +class TestCudaSharedMemoryUnregister(CudaSharedMemoryTestBase): + def _test_unregister_shm_fail(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory() + self.assertIn( + "Failed to unregister the following cuda shared memory regions: input0_data ,input1_data ,output0_data ,output1_data", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("input0_data") + self.assertIn( + "Cannot unregister shared memory region 'input0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("input1_data") + self.assertIn( + "Cannot unregister shared memory region 'input1_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("output0_data") + self.assertIn( + "Cannot unregister shared memory region 'output0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("output1_data") + self.assertIn( + "Cannot unregister shared memory region 'output1_data', it is currently in use.", + str(ex.exception), + ) + + def _test_shm_not_found(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("input0_data") + self.assertIn( + "Unable to find cuda shared memory region: 'input0_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("input1_data") + self.assertIn( + "Unable to find cuda shared memory region: 'input1_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("output0_data") + self.assertIn( + "Unable to find cuda shared memory region: 'output0_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("output1_data") + self.assertIn( + "Unable to find cuda shared memory region: 'output1_data'", + str(ex.exception), + ) + + def test_unregister_shm_during_inference_http(self): + try: + self.triton_client.unregister_cuda_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + httpclient.InferInput("INPUT0", [1, 16], "INT32"), + httpclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0", binary_data=True), + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + async_request = self.triton_client.async_infer( + model_name="simple", inputs=inputs, outputs=outputs + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Blocking call + 
async_request.get_result() + + # Try unregister shm regions after inference + self.triton_client.unregister_cuda_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + def test_unregister_shm_during_inference_grpc(self): + try: + self.triton_client.unregister_cuda_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + grpcclient.InferRequestedOutput("OUTPUT0"), + grpcclient.InferRequestedOutput("OUTPUT1"), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + user_data = [] + + self.triton_client.async_infer( + model_name="simple", + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Wait until the results are available in user_data + time_out = 20 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + time.sleep(2) + + # Try unregister shm regions after inference + self.triton_client.unregister_cuda_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_cuda_shared_memory/test.sh b/qa/L0_cuda_shared_memory/test.sh index 02857b2153..b7126a9295 100755 --- a/qa/L0_cuda_shared_memory/test.sh +++ b/qa/L0_cuda_shared_memory/test.sh @@ -84,6 +84,47 @@ for i in \ done done +mkdir -p python_models/simple/1/ +cp ../python_models/execute_delayed_model/model.py ./python_models/simple/1/ +cp ../python_models/execute_delayed_model/config.pbtxt ./python_models/simple/ +sed -i 's/KIND_CPU/KIND_GPU/g' ./python_models/simple/config.pbtxt + +for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/python_models --log-verbose=1 ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./unregister_shm.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + CLIENT_LOG="./unregister_shm.$client_type.client.log" + set +e + python3 $SHM_TEST TestCudaSharedMemoryUnregister.test_unregister_shm_during_inference_$client_type >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + kill $SERVER_PID + wait $SERVER_PID + if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Test Server shut down non-gracefully\n***" + RET=1 + fi + set -e + done + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index 1f76f4845b..d7bc59f5c7 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -116,7 +116,13 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -175,7 +181,13 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh index 98ad134d8b..22c37dff49 100755 --- a/qa/L0_decoupled/test.sh +++ b/qa/L0_decoupled/test.sh @@ -176,4 +176,4 @@ else echo -e "\n***\n*** Test Failed\n***" fi -exit $RET +exit $RET \ No newline at end of file diff --git a/qa/L0_dlpack_multi_gpu/test.sh b/qa/L0_dlpack_multi_gpu/test.sh index 996f062f42..ae72daa7d0 100755 --- a/qa/L0_dlpack_multi_gpu/test.sh +++ b/qa/L0_dlpack_multi_gpu/test.sh @@ -27,7 +27,7 @@ SERVER=/opt/tritonserver/bin/tritonserver SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" -CLIENT_PY=./python_unittest.py +CLIENT_PY=./test_infer_shm_leak.py CLIENT_LOG="./client.log" EXPECTED_NUM_TESTS="1" TEST_RESULT_FILE='test_results.txt' @@ -52,8 +52,8 @@ rm -fr *.log ./models mkdir -p models/dlpack_test/1/ cp ../python_models/dlpack_test/model.py models/dlpack_test/1/ cp ../python_models/dlpack_test/config.pbtxt models/dlpack_test -cp ../L0_backend_python/python_unittest.py . -sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py +cp ../L0_backend_python/test_infer_shm_leak.py . +sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py run_server if [ "$SERVER_PID" == "0" ]; then diff --git a/qa/L0_grpc/test.sh b/qa/L0_grpc/test.sh index 50cf5a6f91..93d22e75be 100755 --- a/qa/L0_grpc/test.sh +++ b/qa/L0_grpc/test.sh @@ -48,7 +48,7 @@ NGINX_CONF="./nginx.conf" # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. 
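The hunks around here replace the /proc/sys/kernel/osrelease sniff with environment-variable checks; a minimal Python equivalent of that detection, assuming WSL_DISTRO_NAME is exported inside WSL sessions and MSYSTEM inside MSYS2/Git Bash shells:

import os

def on_windows_host() -> bool:
    # Either variable indicates the script is driving a Windows-hosted Triton.
    return "WSL_DISTRO_NAME" in os.environ or "MSYSTEM" in os.environ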
-if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then SDKDIR=${SDKDIR:=C:/sdk} MODELDIR=${MODELDIR:=C:/models} CLIENT_PLUGIN_MODELDIR=${MODELDIR:=C:/client_plugin_models} diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py index 431eeb1720..f7507747e9 100755 --- a/qa/L0_grpc_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_state_cleanup/cleanup_test.py @@ -161,9 +161,17 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -229,9 +237,17 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -608,9 +624,17 @@ def test_non_decoupled_streaming_multi_response(self): url="localhost:8001", verbose=True ) as client: # Establish stream - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16 - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + client.start_stream( + callback=partial(callback, user_data), + stream_timeout=16, + headers=metadata, + ) + else: + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16 + ) # Send a request client.async_stream_infer( model_name=self.repeat_non_decoupled_model_name, diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py index a9a972e02a..3eb0b6ea5f 100755 --- a/qa/L0_http/generate_endpoint_test.py +++ b/qa/L0_http/generate_endpoint_test.py @@ -142,6 +142,21 @@ def test_generate(self): self.assertIn("TEXT", data) self.assertEqual(text, data["TEXT"]) + def test_generate_with_all_inputs(self): + # Setup text-based input + text = "hello world" + inputs = {"PROMPT": text, "STREAM": False, "input_ids": [100, 200]} + + r = self.generate(self._model_name, inputs) + r.raise_for_status() + + self.assertIn("Content-Type", r.headers) + self.assertEqual(r.headers["Content-Type"], "application/json") + + data = r.json() + self.assertIn("TEXT", data) + self.assertEqual(text, data["TEXT"]) + def test_request_id(self): # Setup text based input text = "hello world" @@ -220,18 +235,26 @@ def test_missing_inputs(self): ] for inputs in missing_all_inputs: self.generate_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 0" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 0", ) 
self.generate_stream_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 0" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 0", ) for inputs in missing_one_input: self.generate_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 1" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 1", ) self.generate_stream_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 1" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 1", ) def test_invalid_input_types(self): diff --git a/qa/L0_http/generate_models/mock_llm/config.pbtxt b/qa/L0_http/generate_models/mock_llm/config.pbtxt index 6871661525..74a306052a 100644 --- a/qa/L0_http/generate_models/mock_llm/config.pbtxt +++ b/qa/L0_http/generate_models/mock_llm/config.pbtxt @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -41,6 +41,12 @@ input [ name: "STREAM" data_type: TYPE_BOOL dims: [ 1, 1 ] + }, + { + name: "input_ids" + data_type: TYPE_INT32 + dims: [ 1, -1 ] + optional: true } ] diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh index 321c398995..572c527ba4 100755 --- a/qa/L0_http/test.sh +++ b/qa/L0_http/test.sh @@ -49,7 +49,7 @@ NGINX_CONF="./nginx.conf" # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then SDKDIR=${SDKDIR:=C:/sdk} MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} @@ -662,7 +662,7 @@ fi ## Python Unit Tests TEST_RESULT_FILE='test_results.txt' PYTHON_TEST=generate_endpoint_test.py -EXPECTED_NUM_TESTS=16 +EXPECTED_NUM_TESTS=17 set +e python $PYTHON_TEST >$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then diff --git a/qa/L0_infer/install_and_test.sh b/qa/L0_infer/install_and_test.sh index 28e5dad52e..4c136cf1dd 100755 --- a/qa/L0_infer/install_and_test.sh +++ b/qa/L0_infer/install_and_test.sh @@ -25,14 +25,24 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Determine the operating system to call the correct package manager. 
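The install script below branches on ID_LIKE from /etc/os-release to pick the package manager; the same decision expressed as a small Python sketch (the file path and the debian/yum split mirror the shell logic):

def pick_package_manager(os_release: str = "/etc/os-release") -> str:
    id_like = ""
    with open(os_release) as f:
        for line in f:
            if line.startswith("ID_LIKE="):
                # Strip quotes and keep the first token, mirroring the grep/awk/tr pipeline.
                tokens = line.partition("=")[2].strip().strip('"').split()
                id_like = tokens[0] if tokens else ""
                break
    return "apt-get" if "debian" in id_like else "yum"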
+ID_LIKE=$(grep -Po '(?<=ID_LIKE=).*' /etc/os-release | awk -F= '{print $1}' | tr -d '"' | awk '{print $1}') + # Note: This script is to be used with customized triton containers that need # dependencies to run L0_infer tests -apt-get update && \ - apt-get install -y --no-install-recommends \ - curl \ - jq \ - python3 \ - python3-pip +if [[ "$ID_LIKE" =~ "debian" ]]; then + apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + jq \ + python3 \ + python3-pip +else + yum install -y \ + jq \ + curl +fi + pip3 install --upgrade pip # install client libraries pip3 install tritonclient[all] diff --git a/qa/L0_infer/test.sh b/qa/L0_infer/test.sh index dba4d7dbcc..36f63053e3 100755 --- a/qa/L0_infer/test.sh +++ b/qa/L0_infer/test.sh @@ -87,7 +87,7 @@ DEFAULT_SHM_SIZE_BYTES=$((1024*1024*$DEFAULT_SHM_SIZE_MB)) # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} diff --git a/qa/L0_input_validation/input_validation_test.py b/qa/L0_input_validation/input_validation_test.py index 33360b7a08..8e7f58bb0c 100755 --- a/qa/L0_input_validation/input_validation_test.py +++ b/qa/L0_input_validation/input_validation_test.py @@ -195,7 +195,7 @@ def get_input_array(input_size, np_dtype): triton_client.infer(model_name=model_name, inputs=inputs) err_str = str(e.exception) self.assertIn( - f"expected {input_size} string elements for inference input 'INPUT1', got {input_size-2}", + f"expected {input_size} string elements for inference input 'INPUT1' for model '{model_name}', got {input_size-2}", err_str, ) @@ -208,7 +208,7 @@ def get_input_array(input_size, np_dtype): triton_client.infer(model_name=model_name, inputs=inputs) err_str = str(e.exception) self.assertIn( - f"expected {input_size} string elements for inference input 'INPUT1', got {input_size+2}", + f"unexpected number of string elements {input_size+1} for inference input 'INPUT1' for model '{model_name}', expecting {input_size}", err_str, ) diff --git a/qa/L0_input_validation/test.sh b/qa/L0_input_validation/test.sh index fc70abd969..22e0560959 100755 --- a/qa/L0_input_validation/test.sh +++ b/qa/L0_input_validation/test.sh @@ -68,7 +68,9 @@ set +e python3 -m pytest --junitxml="input_validation.report.xml" $TEST_PY::InputValTest >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** input_validation_test.py FAILED. \n***" + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** input_validation_test.py::InputValTest FAILED. \n***" RET=1 fi set -e @@ -138,7 +140,9 @@ set +e python3 -m pytest --junitxml="input_shape_validation.report.xml" $TEST_PY::InputShapeTest >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** input_validation_test.py FAILED. \n***" + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** input_validation_test.py::InputShapeTest FAILED. 
\n***" RET=1 fi set -e @@ -147,10 +151,13 @@ kill $SERVER_PID wait $SERVER_PID # input_byte_size_test +cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/{savedmodel_zero_1_float32,savedmodel_zero_1_object} ./models + set +e -LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >>$TEST_LOG 2>&1 +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >> $TEST_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** Query Unit Test Failed\n***" + cat $TEST_LOG + echo -e "\n***\n*** input_byte_size_test FAILED\n***" RET=1 fi set -e @@ -158,8 +165,6 @@ set -e if [ $RET -eq 0 ]; then echo -e "\n***\n*** Input Validation Test Passed\n***" else - cat $CLIENT_LOG - cat $SERVER_LOG echo -e "\n***\n*** Input Validation Test FAILED\n***" fi diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py index a2bfc067bc..49fe684ff1 100755 --- a/qa/L0_lifecycle/lifecycle_test.py +++ b/qa/L0_lifecycle/lifecycle_test.py @@ -3493,6 +3493,133 @@ def test_delete_custom_config(self): except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) + def test_load_new_model_version(self): + model_name = "identity_fp32" + client = self._get_client(use_grpc=True) + + # version 1 and 2 are already loaded + # version 3 is in the model directory but not loaded + # version 4 does not exist anywhere + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertFalse(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 0) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 0) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 1) + + # update version 2 model file + Path(os.path.join("models", model_name, "2", "model.py")).touch() + # add version 4 model file + src_path = os.path.join("models", model_name, "3") + dst_path = os.path.join("models", model_name, "4") + shutil.copytree(src_path, dst_path) + # update model config to load version 1 to 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [1, 2] } }", + "version_policy: { specific: { versions: [1, 2, 3, 4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # version 1 is unmodified so it should not be reloaded + # version 2 is modified so it should be reloaded + # version 3 model file existed but not loaded so it should be loaded + # version 4 is a new version so it should be loaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertTrue(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 1) + 
self.assertEqual(server_log.count("[PB model] Loading version 4"), 1) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 2) + + # simulate a dependency change to all versions + Path(os.path.join("models", model_name, "dummy_dependency.py")).touch() + # reload the model + client.load_model(model_name) + + # all 4 versions should be reloaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertTrue(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 3) + + # update model config to only load version 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [1, 2, 3, 4] } }", + "version_policy: { specific: { versions: [4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # only version 4 should be available and no reloads should happen + self.assertFalse(client.is_model_ready(model_name, "1")) + self.assertFalse(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 4) + + # update model config to load version 1 and 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [4] } }", + "version_policy: { specific: { versions: [1, 4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # version 1 should be loaded and version 4 should not be reloaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertFalse(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 5) + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_lifecycle/test.sh b/qa/L0_lifecycle/test.sh index 
9236fdabfb..4efd244c76 100755 --- a/qa/L0_lifecycle/test.sh +++ b/qa/L0_lifecycle/test.sh @@ -2196,6 +2196,41 @@ set -e kill $SERVER_PID wait $SERVER_PID +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_load_new_model_version +rm -rf models +mkdir models +cp -r ../python_models/identity_fp32 models/ && (cd models/identity_fp32 && \ + echo "version_policy: { specific: { versions: [1, 2] } }" >> config.pbtxt && \ + echo " def initialize(self, args):" >> model.py && \ + echo " pb_utils.Logger.log_info(f'[PB model] Loading version {args[\"model_version\"]}')" >> model.py && \ + mkdir 1 && cp model.py 1 && \ + mkdir 2 && cp model.py 2 && \ + mkdir 3 && mv model.py 3) + +export PYTHONDONTWRITEBYTECODE="True" +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --load-model=*" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python $LC_TEST LifeCycleTest.test_load_new_model_version >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID +unset PYTHONDONTWRITEBYTECODE if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" diff --git a/qa/L0_perf_analyzer_capi/test.sh b/qa/L0_perf_analyzer_capi/test.sh index d031e2cacf..3e3f9e4af6 100755 --- a/qa/L0_perf_analyzer_capi/test.sh +++ b/qa/L0_perf_analyzer_capi/test.sh @@ -56,7 +56,7 @@ SHAPETENSORADTAFILE=`pwd`/../common/perf_analyzer_input_data_json/shape_tensor_d ERROR_STRING="error | Request count: 0 | : 0 infer/sec" -STABILITY_THRESHOLD="15" +STABILITY_THRESHOLD="9999" source ../common/util.sh diff --git a/qa/L0_perf_analyzer_doc_links/test.sh b/qa/L0_perf_analyzer_doc_links/test.sh index db80e84974..d0757bca9e 100755 --- a/qa/L0_perf_analyzer_doc_links/test.sh +++ b/qa/L0_perf_analyzer_doc_links/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -35,10 +35,10 @@ python3 -m pip install mkdocs-htmlproofer-plugin==0.10.3 #Download perf_analyzer docs TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} -TRITON_CLIENT_REPO_TAG="${TRITON_CLIENT_REPO_TAG:=main}" -git clone -b ${TRITON_CLIENT_REPO_TAG} ${TRITON_REPO_ORGANIZATION}/client.git -cp `pwd`/client/src/c++/perf_analyzer/README.md . -cp -rf `pwd`/client/src/c++/perf_analyzer/docs . +TRITON_PERF_ANALYZER_REPO_TAG="${TRITON_PERF_ANALYZER_REPO_TAG:=main}" +git clone -b ${TRITON_PERF_ANALYZER_REPO_TAG} ${TRITON_REPO_ORGANIZATION}/perf_analyzer.git +cp `pwd`/perf_analyzer/README.md . +cp -rf `pwd`/perf_analyzer/docs . # Need to remove all links that start with -- or -. Mkdocs converts all -- to - for anchor links. # This breaks all links to cli commands throughout the docs. 
This will iterate over all diff --git a/qa/L0_perf_tensorrt_llm/test.sh b/qa/L0_perf_tensorrt_llm/test.sh index 35d360498d..e74b01e568 100755 --- a/qa/L0_perf_tensorrt_llm/test.sh +++ b/qa/L0_perf_tensorrt_llm/test.sh @@ -34,7 +34,7 @@ TRT_ROOT="/usr/local/tensorrt" MODEL_NAME="gpt2_tensorrt_llm" NAME="tensorrt_llm_benchmarking_test" MODEL_REPOSITORY="$(pwd)/triton_model_repo" -TENSORRTLLM_BACKEND_DIR="/opt/tritonserver/tensorrtllm_backend" +TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend" GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt" TOKENIZER_DIR="$GPT_DIR/gpt2" ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu" @@ -47,40 +47,27 @@ SERVER_TIMEOUT=${SERVER_TIMEOUT:=120} function clone_tensorrt_llm_backend_repo { rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR apt-get update && apt-get install git-lfs -y --no-install-recommends - git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} https://github.com/triton-inference-server/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR + git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive } # Update Open MPI to a version compatible with SLURM. function upgrade_openmpi { - cd /tmp/ local CURRENT_VERSION=$(mpirun --version 2>&1 | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}') if [ -n "$CURRENT_VERSION" ] && dpkg --compare-versions "$CURRENT_VERSION" lt "5.0.1"; then # Uninstall the current version of Open MPI - wget "https://download.open-mpi.org/release/open-mpi/v$(echo "${CURRENT_VERSION}" | awk -F. '{print $1"."$2}')/openmpi-${CURRENT_VERSION}.tar.gz" || { - echo "Failed to download Open MPI ${CURRENT_VERSION}" - exit 1 - } - rm -rf "openmpi-${CURRENT_VERSION}" && tar -xzf "openmpi-${CURRENT_VERSION}.tar.gz" && cd "openmpi-${CURRENT_VERSION}" || { - echo "Failed to extract Open MPI ${CURRENT_VERSION}" - exit 1 - } - unset PMIX_VERSION && ./configure --prefix=/opt/hpcx/ompi/ && make uninstall || { - echo "Failed to uninstall Open MPI ${CURRENT_VERSION}" - exit 1 - } - rm -rf /opt/hpcx/ompi/ /usr/local/mpi/ || { - echo "Failed to remove Open MPI ${CURRENT_VERSION} installation directories" + rm -r /opt/hpcx/ompi/ /usr/local/mpi && rm -rf /usr/lib/$(gcc -print-multiarch)/openmpi || { + echo "Failed to uninstall the existing Open MPI version $CURRENT_VERSION." exit 1 } - cd ../ && rm -r openmpi-${CURRENT_VERSION} else - echo "Installed Open MPI version is not less than 5.0.1. Skipping the upgrade." + echo "The installed Open MPI version ($CURRENT_VERSION) is 5.0.1 or higher. Skipping the upgrade." 
return fi # Install SLURM supported Open MPI version + cd /tmp/ wget "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.1.tar.gz" || { echo "Failed to download Open MPI 5.0.1" exit 1 @@ -108,18 +95,6 @@ function upgrade_openmpi { mpirun --version } -function install_tensorrt_llm { - # Install CMake - bash ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/docker/common/install_cmake.sh - export PATH="/usr/local/cmake/bin:${PATH}" - - TORCH_INSTALL_TYPE="pypi" && - (cd ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm && - bash docker/common/install_pytorch.sh $TORCH_INSTALL_TYPE && - python3 ./scripts/build_wheel.py --trt_root=/usr/local/tensorrt && - pip3 install ./build/tensorrt_llm*.whl) -} - function build_gpt2_base_model { # Download weights from HuggingFace Transformers cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2 @@ -131,24 +106,21 @@ function build_gpt2_base_model { cd ${GPT_DIR} # Convert weights from HF Tranformers to FT format - python3 hf_gpt_convert.py -p 1 -i gpt2 -o ./c-model/gpt2 --tensor-parallelism ${NUM_GPUS} --storage-type float16 + python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" cd ${BASE_DIR} } function build_gpt2_tensorrt_engine { # Build TensorRT engines cd ${GPT_DIR} - python3 build.py --model_dir="./c-model/gpt2/${NUM_GPUS}-gpu/" \ - --world_size="${NUM_GPUS}" \ - --dtype float16 \ - --use_inflight_batching \ - --use_gpt_attention_plugin float16 \ - --paged_kv_cache \ - --use_gemm_plugin float16 \ - --remove_input_padding \ - --hidden_act gelu \ - --parallel_build \ - --output_dir="${ENGINES_DIR}" + trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \ + --gpt_attention_plugin float16 \ + --remove_input_padding enable \ + --paged_kv_cache enable \ + --gemm_plugin float16 \ + --workers "${NUM_GPUS}" \ + --output_dir "${ENGINES_DIR}" + cd ${BASE_DIR} } @@ -172,18 +144,18 @@ function prepare_model_repository { replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" - replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${max_queue_size}' "0" 
"${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" } # Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on @@ -244,13 +216,12 @@ function kill_server { upgrade_openmpi clone_tensorrt_llm_backend_repo -install_tensorrt_llm build_gpt2_base_model build_gpt2_tensorrt_engine prepare_model_repository # Install perf_analyzer -pip3 install tritonclient nvidia-ml-py3 +pip3 install tritonclient ARCH="amd64" STATIC_BATCH=1 diff --git a/qa/L0_perf_vllm/test.sh b/qa/L0_perf_vllm/test.sh index 498f6f8e14..e1ce8cf2ed 100755 --- a/qa/L0_perf_vllm/test.sh +++ b/qa/L0_perf_vllm/test.sh @@ -41,7 +41,7 @@ SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR} export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:=0} EXPORT_FILE=profile-export-vllm-model.json -pip3 install tritonclient nvidia-ml-py3 +pip3 install tritonclient rm -rf $MODEL_REPO $EXPORT_FILE *.tjson *.json *.csv mkdir -p $MODEL_REPO/$MODEL_NAME/1 diff --git a/qa/L0_python_api/test.sh b/qa/L0_python_api/test.sh index 6dc7206fe3..0d87d16771 100755 --- a/qa/L0_python_api/test.sh +++ b/qa/L0_python_api/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -49,6 +49,15 @@ if [ $? -ne 0 ]; then RET=1 fi + +FRONTEND_TEST_LOG="./python_kserve.log" +python -m pytest --junitxml=test_kserve.xml test_kserve.py > $FRONTEND_TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $FRONTEND_TEST_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + set -e if [ $RET -eq 0 ]; then diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py new file mode 100644 index 0000000000..9e8b82eb43 --- /dev/null +++ b/qa/L0_python_api/test_kserve.py @@ -0,0 +1,298 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import time +from functools import partial + +import numpy as np +import pytest +import testing_utils as utils +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +import tritonserver +from tritonclient.utils import InferenceServerException +from tritonfrontend import KServeGrpc, KServeHttp + + +class TestHttpOptions: + def test_correct_http_parameters(self): + KServeHttp.Options( + address="0.0.0.1", port=8080, reuse_port=True, thread_count=16 + ) + + def test_wrong_http_parameters(self): + # Out of range + with pytest.raises(Exception): + KServeHttp.Options(port=-15) + with pytest.raises(Exception): + KServeHttp.Options(thread_count=-5) + + # Wrong data type + with pytest.raises(Exception): + KServeHttp.Options(header_forward_pattern=10) + + +class TestGrpcOptions: + def test_correct_grpc_parameters(self): + KServeGrpc.Options( + infer_compression_level=KServeGrpc.Grpc_compression_level.HIGH, + reuse_port=True, + infer_allocation_pool_size=12, + http2_max_pings_without_data=10, + ) + + def test_wrong_grpc_parameters(self): + # Out of Range + with pytest.raises(Exception): + KServeGrpc.Options(port=-5) + with pytest.raises(Exception): + KServeGrpc.Options(keepalive_timeout_ms=-20_000) + + # Wrong data type + with pytest.raises(Exception): + KServeGrpc.Options(infer_allocation_pool_size="big pool") + with pytest.raises(Exception): + KServeGrpc.Options(server_key=10) + + +HTTP_ARGS = (KServeHttp, httpclient, "localhost:8000") # Default HTTP args +GRPC_ARGS = (KServeGrpc, grpcclient, "localhost:8001") # Default GRPC args + + +class TestKServe: + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) + def test_server_ready(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + client = utils.setup_client(client_type, url=url) + + assert client.is_server_ready() + + utils.teardown_client(client) + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_service_double_start(self, frontend): + server = utils.setup_server() + # setup_service() performs service.start() + service = utils.setup_service(server, frontend) + + with pytest.raises( + tritonserver.AlreadyExistsError, match="server is already running." + ): + service.start() + + utils.teardown_server(server) + utils.teardown_service(service) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_invalid_options(self, frontend): + server = utils.setup_server() + # Current flow is KServeHttp.Options or KServeGrpc.Options have to be + # provided to ensure type and range validation occurs. + with pytest.raises( + tritonserver.InvalidArgumentError, + match="Incorrect type for options. 
options argument must be of type", + ): + frontend(server, {"port": 8001}) + + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_server_service_order(self, frontend): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + utils.teardown_server(server) + utils.teardown_service(service) + + @pytest.mark.parametrize("frontend, client_type", [HTTP_ARGS[:2], GRPC_ARGS[:2]]) + def test_service_custom_port(self, frontend, client_type): + server = utils.setup_server() + options = frontend.Options(port=8005) + service = utils.setup_service(server, frontend, options) + client = utils.setup_client(client_type, url="localhost:8005") + + # Confirms that service starts at port 8005 + client.is_server_ready() + + utils.teardown_client(client) + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) + def test_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + # TODO: use common/test_infer + assert utils.send_and_test_inference_identity(client_type, url=url) + + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) + def test_streaming_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + assert utils.send_and_test_stream_inference(client_type, url) + + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) + def test_http_generate_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + assert utils.send_and_test_generate_inference() + + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) + def test_http_req_during_shutdown(self, frontend, client_type, url): + server = utils.setup_server() + http_service = utils.setup_service(server, frontend) + http_client = httpclient.InferenceServerClient(url="localhost:8000") + model_name = "delayed_identity" + delay = 2 # seconds + input_data0 = np.array([[delay]], dtype=np.float32) + + input0 = httpclient.InferInput("INPUT0", input_data0.shape, "FP32") + input0.set_data_from_numpy(input_data0) + + inputs = [input0] + outputs = [httpclient.InferRequestedOutput("OUTPUT0")] + + async_request = http_client.async_infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + # http_service.stop() does not use graceful shutdown + utils.teardown_service(http_service) + + # So, inference request will fail as http endpoints have been stopped. + with pytest.raises( + InferenceServerException, match="failed to obtain inference response" + ): + async_request.get_result(block=True, timeout=delay) + + # http_client.close() calls join() to terminate pool of greenlets + # However, due to an unsuccessful get_result(), async_request is still + # an active thread. Hence, join stalls until greenlet timeouts. + # Does not throw an exception, but displays error in logs. + utils.teardown_client(http_client) + + # delayed_identity will still be an active model + # Hence, server.stop() causes InternalError: Timeout. + with pytest.raises( + tritonserver.InternalError, + match="Exit timeout expired. 
Exiting immediately.", + ): + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) + def test_grpc_req_during_shutdown(self, frontend, client_type, url): + server = utils.setup_server() + grpc_service = utils.setup_service(server, frontend) + grpc_client = grpcclient.InferenceServerClient(url=url) + user_data = [] + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + model_name = "delayed_identity" + delay = 2 # seconds + + input_data0 = np.array([[delay]], dtype=np.float32) + input0 = client_type.InferInput("INPUT0", input_data0.shape, "FP32") + input0.set_data_from_numpy(input_data0) + + inputs = [input0] + outputs = [client_type.InferRequestedOutput("OUTPUT0")] + + grpc_client.async_infer( + model_name=model_name, + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + utils.teardown_service(grpc_service) + + time_out = delay + 1 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + + # Depending on when gRPC frontend shut down StatusCode can vary + acceptable_failure_msgs = [ + "[StatusCode.CANCELLED] CANCELLED", + "[StatusCode.UNAVAILABLE] failed to connect to all addresses", + ] + + assert ( + len(user_data) == 1 + and isinstance(user_data[0], InferenceServerException) + and any( + failure_msg in str(user_data[0]) + for failure_msg in acceptable_failure_msgs + ) + ) + + utils.teardown_client(grpc_client) + utils.teardown_server(server) + + # KNOWN ISSUE: CAUSES SEGFAULT + # Created [DLIS-7231] to address at future date + # Once the server has been stopped, the underlying TRITONSERVER_Server instance + # is deleted. However, the frontend does not know the server instance + # is no longer valid. + # def test_inference_after_server_stop(self): + # server = utils.setup_server() + # http_service = utils.setup_service(server, KServeHttp) + # http_client = setup_client(httpclient, url="localhost:8000") + + # teardown_server(server) # Server has been stopped + + # model_name = "identity" + # input_data = np.array([["testing"]], dtype=object) + # # Create input and output objects + # inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + # outputs = [httpclient.InferRequestedOutput("OUTPUT0")] + + # # Set the data for the input tensor + # inputs[0].set_data_from_numpy(input_data) + + # results = http_client.infer(model_name, inputs=inputs, outputs=outputs) + + # utils.teardown_client(http_client) + # utils.teardown_service(http_service) diff --git a/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py b/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py new file mode 100644 index 0000000000..b6095cec8f --- /dev/null +++ b/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py @@ -0,0 +1,51 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + Mock Model that uses the input data to determine how long to wait + before returning identity data + """ + assert len(requests) == 1 + delay = 0 + request = requests[0] + responses = [] + + delay_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + delay_as_numpy = delay_tensor.as_numpy() + delay = float(delay_as_numpy[0][0]) + + out_tensor = pb_utils.Tensor("OUTPUT0", delay_as_numpy) + responses.append(pb_utils.InferenceResponse([out_tensor])) + + time.sleep(delay) + return responses diff --git a/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt b/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt new file mode 100644 index 0000000000..9ac8f1aaff --- /dev/null +++ b/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
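[Editor's note] The delayed_identity model added above interprets its single FP32 input as a delay in seconds, sleeps for that long, then echoes the value back; its config follows below. A hedged client-side sketch of that behavior, assuming the KServeHttp frontend from these tests is serving on the default port:

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

delay = 2.0  # seconds the model sleeps before answering
input0 = httpclient.InferInput("INPUT0", [1, 1], "FP32")
input0.set_data_from_numpy(np.array([[delay]], dtype=np.float32))

# Blocks for roughly `delay` seconds, then returns the same value as OUTPUT0.
result = client.infer(
    model_name="delayed_identity",
    inputs=[input0],
    outputs=[httpclient.InferRequestedOutput("OUTPUT0")],
)
assert result.as_numpy("OUTPUT0")[0][0] == np.float32(delay)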
+ +name: "delayed_identity" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] \ No newline at end of file diff --git a/qa/L0_python_api/test_model_repository/identity/1/model.py b/qa/L0_python_api/test_model_repository/identity/1/model.py new file mode 100644 index 0000000000..629b6469c9 --- /dev/null +++ b/qa/L0_python_api/test_model_repository/identity/1/model.py @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model loops through different dtypes to make sure that + serialize_byte_tensor works correctly in the Python backend. + """ + + def initialize(self, args): + self._index = 0 + self._dtypes = [np.bytes_, np.object_] + + def execute(self, requests): + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", in_0.as_numpy().astype(self._dtypes[self._index]) + ) + self._index += 1 + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + return responses diff --git a/qa/L0_python_api/test_model_repository/identity/config.pbtxt b/qa/L0_python_api/test_model_repository/identity/config.pbtxt new file mode 100644 index 0000000000..3f22e14468 --- /dev/null +++ b/qa/L0_python_api/test_model_repository/identity/config.pbtxt @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity" +backend: "python" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] \ No newline at end of file diff --git a/qa/L0_python_api/testing_utils.py b/qa/L0_python_api/testing_utils.py new file mode 100644 index 0000000000..48cb3ccc37 --- /dev/null +++ b/qa/L0_python_api/testing_utils.py @@ -0,0 +1,153 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
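[Editor's note] testing_utils.py, whose body follows, centralizes server, frontend, and client setup so each test stays short. A sketch of how a test composes these helpers, mirroring TestKServe.test_inference with the gRPC arguments (default gRPC port assumed):

import testing_utils as utils
import tritonclient.grpc as grpcclient
from tritonfrontend import KServeGrpc

server = utils.setup_server()                      # started tritonserver.Server
service = utils.setup_service(server, KServeGrpc)  # gRPC frontend on the default port
try:
    assert utils.send_and_test_inference_identity(grpcclient, url="localhost:8001")
finally:
    utils.teardown_service(service)
    utils.teardown_server(server)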
+ +import os +import queue +from functools import partial +from typing import Union + +import numpy as np +import requests +import tritonserver +from tritonclient.utils import InferenceServerException +from tritonfrontend import KServeGrpc, KServeHttp + +# TODO: Re-Format documentation to fit: +# https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings + + +def setup_server(model_repository="test_model_repository") -> tritonserver.Server: + module_directory = os.path.split(os.path.abspath(__file__))[0] + model_path = os.path.abspath(os.path.join(module_directory, model_repository)) + + # Starting Server Instance + server_options = tritonserver.Options( + server_id="TestServer", + model_repository=model_path, + log_error=True, + log_warn=True, + log_info=True, + ) + + return tritonserver.Server(server_options).start(wait_until_ready=True) + + +def teardown_server(server: tritonserver.Server) -> None: + server.stop() + + +def setup_service( + server: tritonserver.Server, + frontend: Union[KServeHttp, KServeGrpc], + options=None, +) -> Union[KServeHttp, KServeGrpc]: + service = frontend(server=server, options=options) + service.start() + return service + + +def teardown_service(service: Union[KServeHttp, KServeGrpc]) -> None: + service.stop() + + +def setup_client(frontend_client, url: str): + return frontend_client.InferenceServerClient(url=url) + + +def teardown_client(client) -> None: + client.close() + + +# Sends an inference to test_model_repository/identity model and verifies input == output. +def send_and_test_inference_identity(frontend_client, url: str) -> bool: + model_name = "identity" + client = setup_client(frontend_client, url) + input_data = np.array(["testing"], dtype=object) + + # Create input and output objects + inputs = [frontend_client.InferInput("INPUT0", input_data.shape, "BYTES")] + outputs = [frontend_client.InferRequestedOutput("OUTPUT0")] + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + + # Perform inference request + results = client.infer(model_name=model_name, inputs=inputs, outputs=outputs) + + output_data = results.as_numpy("OUTPUT0") # Gather output data + + teardown_client(client) + return input_data[0] == output_data[0].decode() + + +# Sends multiple streaming requests to "delayed_identity" model with negligible delays, +# and verifies the inputs matches outputs and the ordering is preserved. 
+def send_and_test_stream_inference(frontend_client, url: str) -> bool: + num_requests = 100 + requests = [] + for i in range(num_requests): + input0_np = np.array([[float(i) / 1000]], dtype=np.float32) + inputs = [frontend_client.InferInput("INPUT0", input0_np.shape, "FP32")] + inputs[0].set_data_from_numpy(input0_np) + requests.append(inputs) + + responses = [] + + def callback(responses, result, error): + responses.append({"result": result, "error": error}) + + client = frontend_client.InferenceServerClient(url=url) + client.start_stream(partial(callback, responses)) + for inputs in requests: + client.async_stream_infer("delayed_identity", inputs) + client.stop_stream() + teardown_client(client) + + assert len(responses) == num_requests + for i in range(len(responses)): + assert responses[i]["error"] is None + output0_np = responses[i]["result"].as_numpy(name="OUTPUT0") + assert np.allclose(output0_np, [[float(i) / 1000]]) + + return True # test passed + + +def send_and_test_generate_inference() -> bool: + model_name = "identity" + url = f"http://localhost:8000/v2/models/{model_name}/generate" + input_text = "testing" + data = { + "INPUT0": input_text, + } + + response = requests.post(url, json=data, stream=True) + if response.status_code == 200: + result = response.json() + output_text = result.get("OUTPUT0", "") + + if output_text == input_text: + return True + + return False diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh index d91b433966..ac34458b4e 100755 --- a/qa/L0_sequence_batcher/test.sh +++ b/qa/L0_sequence_batcher/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -93,7 +93,7 @@ TF_VERSION=${TF_VERSION:=2} # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. WINDOWS=0 -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} @@ -183,6 +183,16 @@ export USE_SINGLE_BUFFER # models4 - four instances with batch-size 1 rm -fr *.log models{0,1,2,4} queue_delay_models && mkdir models{0,1,2,4} queue_delay_models +# Search BACKENDS to determine if a backend should be tested +function should_test_backend() { + local target_backend=$1 + if [[ $(echo "${BACKENDS[@]}" | grep -c "${target_backend}") -ne 0 ]]; then + echo "true" + return + fi + echo "false" +} + # Get the datatype to use based on the backend function get_datatype () { local dtype="int32 bool" @@ -827,8 +837,13 @@ fi ### Start Preserve Ordering Tests ### -# Test only supported on windows currently due to use of python backend models -if [ ${WINDOWS} -ne 1 ]; then +# FIXME: Test only supported on windows currently due to use of python backend models. +# Now that Windows supports the PYBE, we should check that this tests works once Windows +# CI is stable. + +# These subtests use python models. They should not be executed if 'python' is not one +# of the backends under test. 
+if [[ $(should_test_backend "python") == "true" && !( -v WSL_DISTRO_NAME || -v MSYSTEM )]]; then # Test preserve ordering true/false and decoupled/non-decoupled TEST_CASE=SequenceBatcherPreserveOrderingTest MODEL_PATH=preserve_ordering_models diff --git a/qa/L0_shared_memory/shared_memory_test.py b/qa/L0_shared_memory/shared_memory_test.py index c38ecb4814..871fca9b2a 100755 --- a/qa/L0_shared_memory/shared_memory_test.py +++ b/qa/L0_shared_memory/shared_memory_test.py @@ -31,7 +31,9 @@ sys.path.append("../common") import os +import time import unittest +from functools import partial import infer_util as iu import numpy as np @@ -43,7 +45,7 @@ from tritonclient import utils -class SharedMemoryTest(tu.TestResultCollector): +class SystemSharedMemoryTestBase(tu.TestResultCollector): DEFAULT_SHM_BYTE_SIZE = 64 def setUp(self): @@ -62,6 +64,68 @@ def _setup_client(self): self.url, verbose=True ) + def _configure_server( + self, + create_byte_size=DEFAULT_SHM_BYTE_SIZE, + register_byte_size=DEFAULT_SHM_BYTE_SIZE, + register_offset=0, + ): + """Creates and registers shared memory regions for testing. + + Parameters + ---------- + create_byte_size: int + Size of each system shared memory region to create. + NOTE: This should be sufficiently large to hold the inputs/outputs + stored in shared memory. + + register_byte_size: int + Size of each system shared memory region to register with server. + NOTE: The (offset + register_byte_size) should be less than or equal + to the create_byte_size. Otherwise an exception will be raised for + an invalid set of registration args. + + register_offset: int + Offset into the shared memory object to start the registered region. + + """ + shm_ip0_handle = shm.create_shared_memory_region( + "input0_data", "/input0_data", create_byte_size + ) + shm_ip1_handle = shm.create_shared_memory_region( + "input1_data", "/input1_data", create_byte_size + ) + shm_op0_handle = shm.create_shared_memory_region( + "output0_data", "/output0_data", create_byte_size + ) + shm_op1_handle = shm.create_shared_memory_region( + "output1_data", "/output1_data", create_byte_size + ) + # Implicit assumption that input and output byte_sizes are 64 bytes for now + input0_data = np.arange(start=0, stop=16, dtype=np.int32) + input1_data = np.ones(shape=16, dtype=np.int32) + shm.set_shared_memory_region(shm_ip0_handle, [input0_data]) + shm.set_shared_memory_region(shm_ip1_handle, [input1_data]) + self.triton_client.register_system_shared_memory( + "input0_data", "/input0_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "input1_data", "/input1_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "output0_data", "/output0_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "output1_data", "/output1_data", register_byte_size, offset=register_offset + ) + return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] + + def _cleanup_server(self, shm_handles): + for shm_handle in shm_handles: + shm.destroy_shared_memory_region(shm_handle) + + +class SharedMemoryTest(SystemSharedMemoryTestBase): def test_invalid_create_shm(self): # Raises error since tried to create invalid system shared memory region try: @@ -128,66 +192,6 @@ def test_reregister_after_register(self): self.assertTrue(len(shm_status.regions) == 1) shm.destroy_shared_memory_region(shm_op0_handle) - def _configure_server( - self, - 
create_byte_size=DEFAULT_SHM_BYTE_SIZE, - register_byte_size=DEFAULT_SHM_BYTE_SIZE, - register_offset=0, - ): - """Creates and registers shared memory regions for testing. - - Parameters - ---------- - create_byte_size: int - Size of each system shared memory region to create. - NOTE: This should be sufficiently large to hold the inputs/outputs - stored in shared memory. - - register_byte_size: int - Size of each system shared memory region to register with server. - NOTE: The (offset + register_byte_size) should be less than or equal - to the create_byte_size. Otherwise an exception will be raised for - an invalid set of registration args. - - register_offset: int - Offset into the shared memory object to start the registered region. - - """ - shm_ip0_handle = shm.create_shared_memory_region( - "input0_data", "/input0_data", create_byte_size - ) - shm_ip1_handle = shm.create_shared_memory_region( - "input1_data", "/input1_data", create_byte_size - ) - shm_op0_handle = shm.create_shared_memory_region( - "output0_data", "/output0_data", create_byte_size - ) - shm_op1_handle = shm.create_shared_memory_region( - "output1_data", "/output1_data", create_byte_size - ) - # Implicit assumption that input and output byte_sizes are 64 bytes for now - input0_data = np.arange(start=0, stop=16, dtype=np.int32) - input1_data = np.ones(shape=16, dtype=np.int32) - shm.set_shared_memory_region(shm_ip0_handle, [input0_data]) - shm.set_shared_memory_region(shm_ip1_handle, [input1_data]) - self.triton_client.register_system_shared_memory( - "input0_data", "/input0_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "input1_data", "/input1_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "output0_data", "/output0_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "output1_data", "/output1_data", register_byte_size, offset=register_offset - ) - return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] - - def _cleanup_server(self, shm_handles): - for shm_handle in shm_handles: - shm.destroy_shared_memory_region(shm_handle) - def test_unregister_after_inference(self): # Unregister after inference error_msg = [] @@ -443,5 +447,169 @@ def test_python_client_leak(self): ) +class TestSharedMemoryUnregister(SystemSharedMemoryTestBase): + def _test_unregister_shm_fail(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory() + self.assertIn( + "Failed to unregister the following system shared memory regions: input0_data ,input1_data ,output0_data ,output1_data", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("input0_data") + self.assertIn( + "Cannot unregister shared memory region 'input0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("input1_data") + self.assertIn( + "Cannot unregister shared memory region 'input1_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("output0_data") + self.assertIn( + "Cannot unregister shared memory region 'output0_data', it 
is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("output1_data") + self.assertIn( + "Cannot unregister shared memory region 'output1_data', it is currently in use.", + str(ex.exception), + ) + + def _test_shm_not_found(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("input0_data") + self.assertIn( + "Unable to find system shared memory region: 'input0_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("input1_data") + self.assertIn( + "Unable to find system shared memory region: 'input1_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("output0_data") + self.assertIn( + "Unable to find system shared memory region: 'output0_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("output1_data") + self.assertIn( + "Unable to find system shared memory region: 'output1_data'", + str(ex.exception), + ) + + def test_unregister_shm_during_inference_http(self): + try: + self.triton_client.unregister_system_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + httpclient.InferInput("INPUT0", [1, 16], "INT32"), + httpclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0", binary_data=True), + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + async_request = self.triton_client.async_infer( + model_name="simple", inputs=inputs, outputs=outputs + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Blocking call + async_request.get_result() + + # Try unregister shm regions after inference + self.triton_client.unregister_system_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + def test_unregister_shm_during_inference_grpc(self): + try: + self.triton_client.unregister_system_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + grpcclient.InferRequestedOutput("OUTPUT0"), + grpcclient.InferRequestedOutput("OUTPUT1"), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + user_data = [] + + self.triton_client.async_infer( + model_name="simple", + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + # Ensure 
inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Wait until the results are available in user_data + time_out = 20 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + time.sleep(2) + + # Try unregister shm regions after inference + self.triton_client.unregister_system_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_shared_memory/test.sh b/qa/L0_shared_memory/test.sh index ba6a2fa8f2..e711de9cff 100755 --- a/qa/L0_shared_memory/test.sh +++ b/qa/L0_shared_memory/test.sh @@ -95,6 +95,46 @@ for i in \ done done +mkdir -p python_models/simple/1/ +cp ../python_models/execute_delayed_model/model.py ./python_models/simple/1/ +cp ../python_models/execute_delayed_model/config.pbtxt ./python_models/simple/ + +for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/python_models --log-verbose=1 ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./unregister_shm.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + CLIENT_LOG="./unregister_shm.$client_type.client.log" + set +e + python3 $SHM_TEST TestSharedMemoryUnregister.test_unregister_shm_during_inference_$client_type >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + kill $SERVER_PID + wait $SERVER_PID + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Server shut down non-gracefully\n***" + RET=1 + fi + set -e + done + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else diff --git a/qa/L0_trt_plugin/test.sh b/qa/L0_trt_plugin/test.sh index 075dd54eab..a9d04331f0 100755 --- a/qa/L0_trt_plugin/test.sh +++ b/qa/L0_trt_plugin/test.sh @@ -47,7 +47,7 @@ PLUGIN_TEST=trt_plugin_test.py # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} MODELDIR=${MODELDIR:=C:/models} CUSTOMPLUGIN=${CUSTOMPLUGIN:=$MODELDIR/HardmaxPlugin.dll} @@ -135,7 +135,7 @@ SERVER_LD_PRELOAD=$CUSTOMPLUGIN SERVER_ARGS=$SERVER_ARGS_BASE SERVER_LOG="./inference_server_$LOG_IDX.log" -if [[ "$(< /proc/sys/kernel/osrelease)" != *microsoft* ]]; then +if [[ ! -v WSL_DISTRO_NAME ]] || [[ ! 
-v MSYSTEM ]]; then run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" diff --git a/qa/L0_trt_shape_tensors/test.sh b/qa/L0_trt_shape_tensors/test.sh index f08ed339b0..548ebb55af 100755 --- a/qa/L0_trt_shape_tensors/test.sh +++ b/qa/L0_trt_shape_tensors/test.sh @@ -45,7 +45,7 @@ CLIENT_LOG="./client.log" SHAPE_TENSOR_TEST=trt_shape_tensor_test.py SERVER=/opt/tritonserver/bin/tritonserver -SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" SERVER_LOG="./inference_server.log" source ../common/util.sh diff --git a/qa/L0_warmup/test.sh b/qa/L0_warmup/test.sh index aeed873b25..a535aed25b 100755 --- a/qa/L0_warmup/test.sh +++ b/qa/L0_warmup/test.sh @@ -42,7 +42,7 @@ export CUDA_VISIBLE_DEVICES=0 CLIENT=../clients/image_client CLIENT_LOG="./client.log" -CLIENT_PY=./python_unittest.py +CLIENT_PY=./test_infer_shm_leak.py EXPECTED_NUM_TESTS="1" TEST_RESULT_FILE='test_results.txt' @@ -449,8 +449,8 @@ mkdir -p models/bls_onnx_warmup/1/ cp ../python_models/bls_onnx_warmup/model.py models/bls_onnx_warmup/1/ cp ../python_models/bls_onnx_warmup/config.pbtxt models/bls_onnx_warmup/. -cp ../L0_backend_python/python_unittest.py . -sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py +cp ../L0_backend_python/test_infer_shm_leak.py . +sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py run_server if [ "$SERVER_PID" == "0" ]; then diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 99a6175a08..21e9fe53ff 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -34,7 +34,7 @@ # Make all generated files accessible outside of container umask 0000 # Set the version of the models -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.09} # Set the CUDA device to use CUDA_DEVICE=${RUNNER_ID:=0} # Set TensorRT image @@ -48,9 +48,7 @@ HOST_MODEL_DIR=${HOST_MODEL_DIR:="${HOST_BUILD_DIR}/${TRITON_VERSION}"} HOST_SOURCE_DIR=$HOST_BUILD_DIR/gen_srcdir # Set CI specific parameters -DOCKER_GPU_ARGS="${DOCKER_GPU_ARGS:="--gpus device=$CUDA_DEVICE"}" -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) - +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} # Set model output directories diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops index 4ae0f006b3..286052914b 100755 --- a/qa/common/gen_qa_custom_ops +++ b/qa/common/gen_qa_custom_ops @@ -37,14 +37,14 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.09} NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION} TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$NVIDIA_UPSTREAM_VERSION-tf2-py3} PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3} CUDA_DEVICE=${NV_GPU:=0} -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) || DOCKER_GPU_ARGS="--gpus device=$CUDA_DEVICE" +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} ### HOST_BUILD_DIR=${HOST_BUILD_DIR:=/tmp} diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index cab497aa86..f26ba863ce 100755 --- a/qa/common/gen_qa_model_repository +++ 
b/qa/common/gen_qa_model_repository @@ -48,7 +48,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.09} # ONNX. Use ONNX_OPSET 0 to use the default for ONNX version ONNX_VERSION=1.13.0 @@ -63,7 +63,8 @@ TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$TRITON_VERSION-t TENSORRT_IMAGE=${TENSORRT_IMAGE:=nvcr.io/nvidia/tensorrt:$TRITON_VERSION-py3} CUDA_DEVICE=${NV_GPU:=0} -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) || DOCKER_GPU_ARGS="--gpus device=$CUDA_DEVICE" +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} +MODEL_TYPE=${MODEL_TYPE:-""} ### HOST_BUILD_DIR=${HOST_BUILD_DIR:=/tmp} @@ -360,8 +361,10 @@ python3 $SRCDIR/gen_qa_implicit_models.py --libtorch --variable --models_dir=$VA chmod -R 777 $VARIMPLICITSEQDESTDIR python3 $SRCDIR/gen_qa_dyna_sequence_models.py --libtorch --models_dir=$DYNASEQDESTDIR chmod -R 777 $DYNASEQDESTDIR -python3 $SRCDIR/gen_qa_torchtrt_models.py --models_dir=$TORCHTRTDESTDIR -chmod -R 777 $TORCHTRTDESTDIR +if [ -z "$MODEL_TYPE" ] || [ "$MODEL_TYPE" != "igpu" ]; then + python3 $SRCDIR/gen_qa_torchtrt_models.py --models_dir=$TORCHTRTDESTDIR + chmod -R 777 $TORCHTRTDESTDIR +fi python3 $SRCDIR/gen_qa_ragged_models.py --libtorch --models_dir=$RAGGEDDESTDIR chmod -R 777 $RAGGEDDESTDIR # Export torchvision image models to ONNX diff --git a/qa/common/shm_util.py b/qa/common/shm_util.py index 16e5ce4e45..0e533bcdbb 100755 --- a/qa/common/shm_util.py +++ b/qa/common/shm_util.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -441,6 +441,9 @@ def __exit__(self, type, value, traceback): print( f"Shared memory leak detected [{shm_region}]: {curr_shm_free_size} (curr free) < {prev_shm_free_size} (prev free)." ) + # FIXME DLIS-7122: Known shared memory leak of 480 bytes in BLS test. + if curr_shm_free_size == 1006576 and prev_shm_free_size == 1007056: + assert False, f"Known shared memory leak of 480 bytes detected." assert not shm_leak_detected, f"Shared memory leak detected." def _get_shm_free_sizes(self, delay_sec=0): diff --git a/qa/common/util.sh b/qa/common/util.sh index 3297dd2914..3874916573 100755 --- a/qa/common/util.sh +++ b/qa/common/util.sh @@ -257,7 +257,7 @@ function run_server_nowait () { return fi - if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then + if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then # LD_PRELOAD not yet supported on windows if [ -z "$SERVER_LD_PRELOAD" ]; then echo "=== Running $SERVER $SERVER_ARGS" @@ -329,7 +329,7 @@ function kill_server () { # causes the entire WSL shell to just exit. So instead we must use # taskkill.exe which can only forcefully kill tritonserver which # means that it does not gracefully exit. - if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then + if [[ -v WSL_DISTRO_NAME ]]; then # Disable -x as it makes output below hard to read oldstate="$(set +o)"; [[ -o errexit ]] && oldstate="$oldstate; set -e" set +x @@ -353,6 +353,8 @@ function kill_server () { fi set +vx; eval "$oldstate" + elif [[ -v MSYSTEM ]] ; then + taskkill //F //IM tritonserver.exe else # Non-windows... 
kill $SERVER_PID @@ -512,17 +514,23 @@ remove_array_outliers() { function setup_virtualenv() { # Create and activate virtual environment - virtualenv --system-site-packages venv - source venv/bin/activate - pip install pytest + if [[ -v MSYSTEM ]]; then + pip3 install pytest + else + virtualenv --system-site-packages venv + source venv/bin/activate + pip install pytest + fi if [[ ${TEST_WINDOWS} == 1 ]]; then - pip3 install "numpy<2" tritonclient[all] + pip3 install "numpy<2" tritonclient[all] fi } function deactivate_virtualenv() { # Deactivate virtual environment and clean up + if [[ ! -v MSYSTEM ]]; then deactivate rm -fr venv + fi } diff --git a/qa/python_models/custom_metrics/model.py b/qa/python_models/custom_metrics/model.py index 31f105a1dd..7c78b46894 100644 --- a/qa/python_models/custom_metrics/model.py +++ b/qa/python_models/custom_metrics/model.py @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -74,6 +74,96 @@ def _metric_api_helper(self, metric, kind): self.assertEqual(metric.value(), value) logger.log_info("Set metric to : {}".format(metric.value())) + # Test observe value + observe = 0.05 + # Counter and gauge do not support observe + with self.assertRaises(pb_utils.TritonModelException): + metric.observe(observe) + + def _histogram_api_helper(self, metric, name, labels): + def histogram_str_builder(name, type, labels, value, le=None): + if type == "count" or type == "sum": + return f"{name}_{type}{{{labels}}} {value}" + elif type == "bucket": + return f'{name}_bucket{{{labels},le="{le}"}} {value}' + else: + raise + + # Adding logger to test if custom metrics and logging work together + # as they use the same message queue. 
+ logger = pb_utils.Logger + + # All values should be 0.0 before the test + metrics = self._get_metrics() + self.assertIn(histogram_str_builder(name, "count", labels, "0"), metrics) + self.assertIn(histogram_str_builder(name, "sum", labels, "0"), metrics) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="0.1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="2.5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="10"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="+Inf"), metrics + ) + + # Histogram does not support value + with self.assertRaises(pb_utils.TritonModelException): + metric.value() + + # Test increment value + increment = 2023.0 + # Histogram does not support increment + with self.assertRaises(pb_utils.TritonModelException): + metric.increment(increment) + + # Test set value + value = 999.9 + # Histogram does not support set + with self.assertRaises(pb_utils.TritonModelException): + metric.set(value) + + # Test observe value + data = [0.05, 1.5, 6.0] + for datum in data: + metric.observe(datum) + logger.log_info("Observe histogram metric with value : {}".format(datum)) + + metrics = self._get_metrics() + self.assertIn( + histogram_str_builder(name, "count", labels, str(len(data))), metrics + ) + self.assertIn( + histogram_str_builder(name, "sum", labels, str(sum(data))), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "1", le="0.1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "1", le="1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "2", le="2.5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "2", le="5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "3", le="10"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "3", le="+Inf"), metrics + ) + def _dup_metric_helper(self, labels={}): # Adding logger to test if custom metrics and logging work together # as they use the same message queue. 
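[Editor's note] The _histogram_api_helper added above checks the exact Prometheus text emitted for every bucket. For orientation, a condensed sketch of the histogram metric API itself as the new tests below use it; this code only runs inside a Python backend model, where pb_utils is available.

import triton_python_backend_utils as pb_utils

family = pb_utils.MetricFamily(
    name="test_histogram_e2e",
    description="test metric histogram kind end to end",
    kind=pb_utils.MetricFamily.HISTOGRAM,
)

# Buckets must be unique and ascending; an empty list is also accepted.
metric = family.Metric(
    labels={"example1": "histogram_label1", "example2": "histogram_label2"},
    buckets=[0.1, 1.0, 2.5, 5.0, 10.0],
)

# observe() is the only mutator a histogram supports; value(), increment(),
# and set() raise TritonModelException for this metric kind.
for value in (0.05, 1.5, 6.0):
    metric.observe(value)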
@@ -128,14 +218,62 @@ def test_gauge_e2e(self): description="test metric gauge kind end to end", kind=pb_utils.MetricFamily.GAUGE, ) - labels = {"example1": "counter_label1", "example2": "counter_label2"} + labels = {"example1": "gauge_label1", "example2": "gauge_label2"} metric = metric_family.Metric(labels=labels) self._metric_api_helper(metric, "gauge") - pattern = 'test_gauge_e2e{example1="counter_label1",example2="counter_label2"}' + pattern = 'test_gauge_e2e{example1="gauge_label1",example2="gauge_label2"}' metrics = self._get_metrics() self.assertIn(pattern, metrics) + def test_histogram_e2e(self): + name = "test_histogram_e2e" + metric_family = pb_utils.MetricFamily( + name=name, + description="test metric histogram kind end to end", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + + labels = {"example1": "histogram_label1", "example2": "histogram_label2"} + buckets = [0.1, 1.0, 2.5, 5.0, 10.0] + metric = metric_family.Metric(labels=labels, buckets=buckets) + + labels_str = 'example1="histogram_label1",example2="histogram_label2"' + self._histogram_api_helper(metric, name, labels_str) + + metrics = self._get_metrics() + count_pattern = f"{name}_count{{{labels_str}}}" + sum_pattern = f"{name}_sum{{{labels_str}}}" + bucket_pattern = f"{name}_bucket{{{labels_str}" + self.assertEqual(metrics.count(count_pattern), 1) + self.assertEqual(metrics.count(sum_pattern), 1) + self.assertEqual(metrics.count(bucket_pattern), len(buckets) + 1) + + def test_histogram_args(self): + name = "test_histogram_args" + metric_family = pb_utils.MetricFamily( + name=name, + description="test metric histogram args", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + + # Test "None" value buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}) + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=None) + + # Test non-ascending order buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=[2.5, 0.1, 1.0, 10.0, 5.0]) + + # Test duplicate value buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=[1, 1, 2, 5, 5]) + + # Test empty list bucket + metric_family.Metric(labels={}, buckets=[]) + def test_dup_metric_family_diff_kind(self): # Test that a duplicate metric family can't be added with a conflicting type/kind metric_family1 = pb_utils.MetricFamily( diff --git a/qa/python_models/execute_delayed_model/config.pbtxt b/qa/python_models/execute_delayed_model/config.pbtxt new file mode 100644 index 0000000000..0a4ee59d3e --- /dev/null +++ b/qa/python_models/execute_delayed_model/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "simple" +backend: "python" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] + +instance_group [ { kind: KIND_CPU }] diff --git a/qa/python_models/execute_delayed_model/model.py b/qa/python_models/execute_delayed_model/model.py new file mode 100644 index 0000000000..055b321a93 --- /dev/null +++ b/qa/python_models/execute_delayed_model/model.py @@ -0,0 +1,72 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
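[Editor's note] The model body follows below: a standard add/sub model whose execute() first sleeps for 15 seconds, which gives the shared-memory unregister tests a window with a request still in flight. A hedged sketch of the same "simple" model driven with plain (non-shared-memory) tensors, assuming the default HTTP endpoint:

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")

in0 = np.arange(16, dtype=np.int32).reshape(1, 16)
in1 = np.ones((1, 16), dtype=np.int32)

inputs = [
    httpclient.InferInput("INPUT0", [1, 16], "INT32"),
    httpclient.InferInput("INPUT1", [1, 16], "INT32"),
]
inputs[0].set_data_from_numpy(in0)
inputs[1].set_data_from_numpy(in1)

# The call blocks for roughly 15 seconds while execute() sleeps, then returns
# OUTPUT0 = INPUT0 + INPUT1 and OUTPUT1 = INPUT0 - INPUT1.
result = client.infer(model_name="simple", inputs=inputs)
assert (result.as_numpy("OUTPUT0") == in0 + in1).all()
assert (result.as_numpy("OUTPUT1") == in0 - in1).all()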
+ +import json +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + responses = [] + + time.sleep(15) + + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1] + ) + responses.append(inference_response) + + return responses + + def finalize(self): + print("Cleaning up...") diff --git a/qa/python_models/execute_grpc_error/config.pbtxt b/qa/python_models/execute_grpc_error/config.pbtxt new file mode 100644 index 0000000000..70e247148a --- /dev/null +++ b/qa/python_models/execute_grpc_error/config.pbtxt @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
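The execute_grpc_error model defined below alternates successful and failed responses so the new triton_grpc_error streaming mode can be exercised. A hedged client-side sketch of that mode, assuming the tritonclient.grpc streaming API and using the "triton_grpc_error" header name handled later in this change; the URL, input shape, and request count are illustrative.

import numpy as np
import tritonclient.grpc as grpcclient


def callback(result, error):
    # With triton_grpc_error enabled, the first model failure is expected to
    # surface here as a gRPC status error that also closes the stream.
    print("error:" if error else "result:", error or result.get_response().id)


client = grpcclient.InferenceServerClient("localhost:8001")
client.start_stream(callback=callback, headers={"triton_grpc_error": "true"})

inputs = [grpcclient.InferInput("IN", [1, 4], "FP32")]
inputs[0].set_data_from_numpy(np.zeros((1, 4), dtype=np.float32))
for i in range(2):  # the model fails every second request
    client.async_stream_infer("execute_grpc_error", inputs, request_id=str(i))
client.stop_stream()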
+ +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/execute_grpc_error/model.py b/qa/python_models/execute_grpc_error/model.py new file mode 100644 index 0000000000..d5087a49ec --- /dev/null +++ b/qa/python_models/execute_grpc_error/model.py @@ -0,0 +1,52 @@ +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def __init__(self): + # Maintain total inference count, so as to return error on 2nd request, all of this to simulate model failure + self.inf_count = 1 + + def execute(self, requests): + """This function is called on inference request.""" + responses = [] + + # Generate the error for the second request + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") + out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy()) + if self.inf_count % 2: + # Every odd request is success + responses.append(pb_utils.InferenceResponse([out_tensor])) + else: + # Every even request is failure + error = pb_utils.TritonError("An error occurred during execution") + responses.append(pb_utils.InferenceResponse([out_tensor], error)) + self.inf_count += 1 + + return responses diff --git a/qa/python_models/response_sender_complete_final/config.pbtxt b/qa/python_models/response_sender_complete_final/config.pbtxt new file mode 100644 index 0000000000..f08ed6da5b --- /dev/null +++ b/qa/python_models/response_sender_complete_final/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +max_batch_size: 8 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [{ kind: KIND_CPU }] +model_transaction_policy { decoupled: True } diff --git a/qa/python_models/response_sender_complete_final/model.py b/qa/python_models/response_sender_complete_final/model.py new file mode 100644 index 0000000000..e17f0b04f6 --- /dev/null +++ b/qa/python_models/response_sender_complete_final/model.py @@ -0,0 +1,63 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + # Expect exactly one request per execute() call. + if len(requests) != 1: + pb_utils.Logger.log_error(f"Unexpected request length: {len(requests)}") + raise Exception("Test FAILED") + + # Send a response with complete final flag, and then send another response and + # and assert an exception is raised, for all requests. + for request in requests: + in_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", in_tensor.as_numpy()) + response = pb_utils.InferenceResponse([out_tensor]) + response_sender = request.get_response_sender() + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + test_passed = False + try: + response_sender.send(response) + except Exception as e: + pb_utils.Logger.log_info(f"Raised exception: {e}") + if ( + str(e) + == "Unable to send response. Response sender has been closed." + ): + test_passed = True + finally: + if not test_passed: + pb_utils.Logger.log_error("Expected exception not raised") + raise Exception("Test FAILED") + pb_utils.Logger.log_info("Test Passed") + return None diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 783275d8d7..9488fc6233 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -138,6 +138,15 @@ else() ) endif() +set(LIB_DIR "lib") +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set (LIB_DIR "lib64") + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) +set(TRITON_CORE_HEADERS_ONLY OFF) + set_target_properties( main PROPERTIES @@ -145,7 +154,7 @@ set_target_properties( SKIP_BUILD_RPATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE - INSTALL_RPATH "$\{ORIGIN\}/../lib" + INSTALL_RPATH "$\{ORIGIN\}/../${LIB_DIR}" ) target_link_libraries( @@ -773,7 +782,14 @@ if (NOT WIN32) endif() # TRITON_ENABLE_GPU endif() # NOT WIN32 +# DLIS-7292: Extend tritonfrontend to build for Windows +if (NOT WIN32) + # tritonfrontend python package + add_subdirectory(python) +endif (NOT WIN32) + # Currently unit tests do not build for windows... if ( NOT WIN32) add_subdirectory(test test) endif() # NOT WIN32 + diff --git a/src/common.h b/src/common.h index aa160f394f..011546d637 100644 --- a/src/common.h +++ b/src/common.h @@ -27,7 +27,11 @@ #include #include +#include #include +#include +#include +#include #include #include "triton/core/tritonserver.h" @@ -184,4 +188,60 @@ Join(const T& container, const std::string& delim) return ss.str(); } + +// Used by Python Bindings to accept arguments to initialize Frontends. 
+// Known pybind11 issue: bool has to come before int for std::variant
+using VariantType = std::variant<bool, int, std::string>;
+using UnorderedMapType = std::unordered_map<std::string, VariantType>;
+
+
+template <typename T>
+TRITONSERVER_Error*
+GetValue(const UnorderedMapType& options, const std::string& key, T* arg)
+{
+  auto curr = options.find(key);
+  bool is_present = (curr != options.end());
+  std::string msg;
+
+  if (!is_present) {
+    msg = "Key: " + key + " not found in options provided.";
+    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, msg.c_str());
+  }
+
+  bool correct_type = std::holds_alternative<T>(curr->second);
+  if (!correct_type) {
+    std::string expected;
+    std::string found;
+    VariantType value = *arg;
+    if (std::holds_alternative<int>(value)) {
+      expected = "int";
+    } else if (std::holds_alternative<bool>(value)) {
+      expected = "bool";
+    } else if (std::holds_alternative<std::string>(value)) {
+      expected = "string";
+    }
+
+    switch (curr->second.index()) {
+      case 0:
+        found = "bool";
+        break;
+      case 1:
+        found = "int";
+        break;
+      case 2:
+        found = "string";
+        break;
+    }
+
+    msg = "Key: " + key + " found, but incorrect type. Expected " + expected +
+          " Found: " + found;
+
+    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, msg.c_str());
+  }
+
+  *arg = std::get<T>(curr->second);
+  return nullptr;
+}
+
+
 }}  // namespace triton::server
diff --git a/src/grpc/grpc_server.cc b/src/grpc/grpc_server.cc
index c0a92ebd33..74ec443ae6 100644
--- a/src/grpc/grpc_server.cc
+++ b/src/grpc/grpc_server.cc
@@ -2435,6 +2435,101 @@ Server::Create(
   return nullptr;  // success
 }
 
+TRITONSERVER_Error*
+Server::Create(
+    std::shared_ptr<TRITONSERVER_Server>& server, UnorderedMapType& options,
+    triton::server::TraceManager* trace_manager,
+    const std::shared_ptr<SharedMemoryManager>& shm_manager,
+    const RestrictedFeatures& restricted_features,
+    std::unique_ptr<Server>* service)
+{
+  Options grpc_options;
+
+  RETURN_IF_ERR(GetOptions(grpc_options, options));
+
+  return Create(server, trace_manager, shm_manager, grpc_options, service);
+}
+
+TRITONSERVER_Error*
+Server::GetOptions(Options& options, UnorderedMapType& options_map)
+{
+  SocketOptions socket_selection;
+  SslOptions ssl_selection;
+  KeepAliveOptions keep_alive_selection;
+
+  RETURN_IF_ERR(GetSocketOptions(options.socket_, options_map));
+  RETURN_IF_ERR(GetSslOptions(options.ssl_, options_map));
+  RETURN_IF_ERR(GetKeepAliveOptions(options.keep_alive_, options_map));
+
+  int infer_compression_level_key;
+
+  RETURN_IF_ERR(GetValue(
+      options_map, "infer_compression_level", &infer_compression_level_key));
+
+  options.infer_compression_level_ =
+      static_cast<grpc_compression_level>(infer_compression_level_key);
+
+  RETURN_IF_ERR(GetValue(
+      options_map, "infer_allocation_pool_size",
+      &options.infer_allocation_pool_size_));
+  RETURN_IF_ERR(GetValue(
+      options_map, "forward_header_pattern", &options.forward_header_pattern_));
+
+  return nullptr;
+}
+
+TRITONSERVER_Error*
+Server::GetSocketOptions(SocketOptions& options, UnorderedMapType& options_map)
+{
+  RETURN_IF_ERR(GetValue(options_map, "address", &options.address_));
+  RETURN_IF_ERR(GetValue(options_map, "port", &options.port_));
+  RETURN_IF_ERR(GetValue(options_map, "reuse_port", &options.reuse_port_));
+
+  return nullptr;
+}
+
+TRITONSERVER_Error*
+Server::GetSslOptions(SslOptions& options, UnorderedMapType& options_map)
+{
+  RETURN_IF_ERR(GetValue(options_map, "use_ssl", &options.use_ssl_));
+  RETURN_IF_ERR(GetValue(options_map, "server_cert", &options.server_cert_));
+  RETURN_IF_ERR(GetValue(options_map, "server_key", &options.server_key_));
+  RETURN_IF_ERR(GetValue(options_map, "root_cert", &options.root_cert_));
+
RETURN_IF_ERR( + GetValue(options_map, "use_mutual_auth", &options.use_mutual_auth_)); + + return nullptr; +} + +TRITONSERVER_Error* +Server::GetKeepAliveOptions( + KeepAliveOptions& options, UnorderedMapType& options_map) +{ + RETURN_IF_ERR( + GetValue(options_map, "keepalive_time_ms", &options.keepalive_time_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "keepalive_timeout_ms", &options.keepalive_timeout_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "keepalive_permit_without_calls", + &options.keepalive_permit_without_calls_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_max_pings_without_data", + &options.http2_max_pings_without_data_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_min_recv_ping_interval_without_data_ms", + &options.http2_min_recv_ping_interval_without_data_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_max_ping_strikes", &options.http2_max_ping_strikes_)); + RETURN_IF_ERR(GetValue( + options_map, "max_connection_age_ms", &options.max_connection_age_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "max_connection_age_grace_ms", + &options.max_connection_age_grace_ms_)); + + return nullptr; +} + + TRITONSERVER_Error* Server::Start() { diff --git a/src/grpc/grpc_server.h b/src/grpc/grpc_server.h index 8a38cdd4fe..89d8dc7388 100644 --- a/src/grpc/grpc_server.h +++ b/src/grpc/grpc_server.h @@ -29,6 +29,7 @@ #include +#include "../common.h" #include "../restricted_features.h" #include "../shared_memory_manager.h" #include "../tracer.h" @@ -100,6 +101,13 @@ class Server { const std::shared_ptr& shm_manager, const Options& server_options, std::unique_ptr* server); + static TRITONSERVER_Error* Create( + std::shared_ptr& server, UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service); + ~Server(); TRITONSERVER_Error* Start(); @@ -112,6 +120,16 @@ class Server { const std::shared_ptr& shm_manager, const Options& server_options); + static TRITONSERVER_Error* GetSocketOptions( + SocketOptions& options, UnorderedMapType& options_map); + static TRITONSERVER_Error* GetSslOptions( + SslOptions& options, UnorderedMapType& options_map); + static TRITONSERVER_Error* GetKeepAliveOptions( + KeepAliveOptions& options, UnorderedMapType& options_map); + + static TRITONSERVER_Error* GetOptions( + Options& options, UnorderedMapType& options_map); + std::shared_ptr tritonserver_; TraceManager* trace_manager_; std::shared_ptr shm_manager_; diff --git a/src/grpc/grpc_utils.h b/src/grpc/grpc_utils.h index 898e4acb4f..032dec3ad9 100644 --- a/src/grpc/grpc_utils.h +++ b/src/grpc/grpc_utils.h @@ -76,6 +76,46 @@ typedef enum { PARTIAL_COMPLETION } Steps; +typedef enum { + // No error from CORE seen yet + NONE, + // Error from CORE encountered, waiting to be picked up by completion queue to + // initiate cancellation + ERROR_ENCOUNTERED, + // Error from CORE encountered, stream closed + // This state is added to avoid double cancellation + ERROR_HANDLING_COMPLETE +} TritonGRPCErrorSteps; + +class gRPCErrorTracker { + public: + // True if set by user via header + // Can be accessed without a lock, as set only once in startstream + std::atomic triton_grpc_error_; + + // Indicates the state of triton_grpc_error, only relevant if special + // triton_grpc_error feature set to true by client + TritonGRPCErrorSteps grpc_stream_error_state_; + + // Constructor + gRPCErrorTracker() + : triton_grpc_error_(false), + grpc_stream_error_state_(TritonGRPCErrorSteps::NONE) 
+ { + } + // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, + // indicating we have closed the stream and initiated the cancel flow + void MarkGRPCErrorHandlingComplete(); + + // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. + bool CheckAndUpdateGRPCError(); + + // Marks error after it has been responded to + void MarkGRPCErrorEncountered(); + + // Checks if error already responded to in triton_grpc_error mode + bool GRPCErrorEncountered(); +}; // Debugging helper std::ostream& operator<<(std::ostream& out, const Steps& step); @@ -183,5 +223,4 @@ TRITONSERVER_Error* ParseClassificationParams( void ReadFile(const std::string& filename, std::string& data); - }}} // namespace triton::server::grpc diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index 35659f4900..c4ba9338cb 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -158,18 +158,6 @@ InferResponseFree( return nullptr; // Success } -TRITONSERVER_Error* InferGRPCToInputHelper( - const std::string& input_name, const std::string& model_name, - const TRITONSERVER_DataType tensor_dt, const TRITONSERVER_DataType input_dt, - const size_t binary_data_byte_size); - -TRITONSERVER_Error* InferGRPCToInput( - const std::shared_ptr& tritonserver, - const std::shared_ptr& shm_manager, - const inference::ModelInferRequest& request, - std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request); - TRITONSERVER_Error* InferGRPCToInputHelper( const std::string& input_name, const std::string& model_name, @@ -391,7 +379,9 @@ InferGRPCToInput( const std::shared_ptr& shm_manager, const inference::ModelInferRequest& request, std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request) + TRITONSERVER_InferenceRequest* inference_request, + std::vector>* + shm_regions_info) { // Verify that the batch-byte-size of each input matches the size of // the provided tensor data (provided raw or from shared memory) @@ -432,9 +422,14 @@ InferGRPCToInput( .c_str()); } void* tmp; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager->GetMemoryInfo( - region_name, offset, byte_size, &tmp, &memory_type, &memory_type_id)); + region_name, offset, byte_size, &tmp, &memory_type, &memory_type_id, + &shm_info)); base = tmp; + shm_regions_info->emplace_back(shm_info); + if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU RETURN_IF_ERR(shm_manager->GetCUDAHandle( @@ -911,18 +906,32 @@ ModelInferHandler::Execute(InferHandler::State* state) // tensors are present in the request. std::list serialized_data; + // Maintain shared pointers(read-only reference) to the shared memory block's + // information for the shared memory regions used by the request. These + // pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. This vector must be cleared in the + // `InferResponseComplete` callback (after inference) to decrease the count + // and permit unregistration. The vector will be included in + // `response_release_payload` for the callback. 
+ std::vector> + shm_regions_info; + if (err == nullptr) { err = InferGRPCToInput( - tritonserver_, shm_manager_, request, &serialized_data, irequest); + tritonserver_, shm_manager_, request, &serialized_data, irequest, + &shm_regions_info); } if (err == nullptr) { err = InferAllocatorPayload( tritonserver_, shm_manager_, request, std::move(serialized_data), - response_queue, &state->alloc_payload_); + response_queue, &state->alloc_payload_, &shm_regions_info); } auto request_release_payload = std::make_unique(state->inference_request_); + auto response_release_payload = std::make_unique( + state, std::move(shm_regions_info)); + if (err == nullptr) { err = TRITONSERVER_InferenceRequestSetReleaseCallback( irequest, InferRequestComplete, @@ -932,7 +941,8 @@ ModelInferHandler::Execute(InferHandler::State* state) err = TRITONSERVER_InferenceRequestSetResponseCallback( irequest, allocator_, &state->alloc_payload_ /* response_allocator_userp */, - InferResponseComplete, reinterpret_cast(state)); + InferResponseComplete, + response_release_payload.get() /* response_userp */); } // Get request ID for logging in case of error. const char* request_id = ""; @@ -948,12 +958,14 @@ ModelInferHandler::Execute(InferHandler::State* state) if (err == nullptr) { TRITONSERVER_InferenceTrace* triton_trace = nullptr; #ifdef TRITON_ENABLE_TRACING - GrpcServerCarrier carrier(state->context_->ctx_.get()); - auto start_options = - trace_manager_->GetTraceStartOptions(carrier, request.model_name()); - state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); - if (state->trace_ != nullptr) { - triton_trace = state->trace_->trace_; + if (trace_manager_) { + GrpcServerCarrier carrier(state->context_->ctx_.get()); + auto start_options = + trace_manager_->GetTraceStartOptions(carrier, request.model_name()); + state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); + if (state->trace_ != nullptr) { + triton_trace = state->trace_->trace_; + } } #endif // TRITON_ENABLE_TRACING @@ -968,8 +980,9 @@ ModelInferHandler::Execute(InferHandler::State* state) // to handle gRPC stream cancellation. if (err == nullptr) { state->context_->InsertInflightState(state); - // The payload will be cleaned in request release callback. + // The payload will be cleaned in release callback. request_release_payload.release(); + response_release_payload.release(); } else { // If error go immediately to COMPLETE. LOG_VERBOSE(1) << "[request id: " << request_id << "] " @@ -982,8 +995,10 @@ ModelInferHandler::Execute(InferHandler::State* state) inference::ModelInferResponse error_response; #ifdef TRITON_ENABLE_TRACING - state->trace_timestamps_.emplace_back( - std::make_pair("GRPC_SEND_START", TraceManager::CaptureTimestamp())); + if (trace_manager_) { + state->trace_timestamps_.emplace_back( + std::make_pair("GRPC_SEND_START", TraceManager::CaptureTimestamp())); + } #endif // TRITON_ENABLE_TRACING state->step_ = COMPLETE; @@ -996,7 +1011,9 @@ ModelInferHandler::InferResponseComplete( TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags, void* userp) { - State* state = reinterpret_cast(userp); + ResponseReleasePayload* response_release_payload( + static_cast(userp)); + auto state = response_release_payload->state_; // There are multiple handlers registered in the gRPC service // Hence, we would need to properly synchronize this thread @@ -1038,6 +1055,7 @@ ModelInferHandler::InferResponseComplete( // in the next cycle. 
state->context_->PutTaskBackToQueue(state); + delete response_release_payload; return; } @@ -1100,6 +1118,8 @@ ModelInferHandler::InferResponseComplete( if (response_created) { delete response; } + + delete response_release_payload; } }}} // namespace triton::server::grpc diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 0e1091feb8..87536dd173 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -299,7 +299,9 @@ InferAllocatorPayload( const inference::ModelInferRequest& request, std::list&& serialized_data, std::shared_ptr> response_queue, - AllocPayload* alloc_payload) + AllocPayload* alloc_payload, + std::vector>* + shm_regions_info) { alloc_payload->response_queue_ = response_queue; alloc_payload->shm_map_.clear(); @@ -335,9 +337,12 @@ InferAllocatorPayload( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager->GetMemoryInfo( - region_name, offset, byte_size, &base, &memory_type, - &memory_type_id)); + region_name, offset, byte_size, &base, &memory_type, &memory_type_id, + &shm_info)); + shm_regions_info->emplace_back(shm_info); if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU @@ -373,7 +378,9 @@ TRITONSERVER_Error* InferGRPCToInput( const std::shared_ptr& shm_manager, const inference::ModelInferRequest& request, std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request); + TRITONSERVER_InferenceRequest* inference_request, + std::vector>* + shm_regions_info); TRITONSERVER_Error* ResponseAllocatorHelper( TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name, @@ -646,6 +653,7 @@ class InferHandlerState { { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); + gRPCErrorTracker_ = std::make_unique(); } void SetCompressionLevel(grpc_compression_level compression_level) @@ -666,9 +674,12 @@ class InferHandlerState { bool IsCancelled() { - return received_notification_ ? ctx_->IsCancelled() : false; + std::lock_guard lock(mu_); + return received_notification_ + ? 
(ctx_->IsCancelled() || + gRPCErrorTracker_->CheckAndUpdateGRPCError()) + : false; } - // Increments the ongoing request counter void IncrementRequestCounter() { ongoing_requests_++; } @@ -710,6 +721,37 @@ class InferHandlerState { return false; } + // Extracts headers from GRPC request and updates state + void ExtractStateFromHeaders(InferHandlerStateType* state) + { + const auto& metadata = state->context_->ctx_->client_metadata(); + std::string triton_grpc_error_key = "triton_grpc_error"; + + auto it = metadata.find( + {triton_grpc_error_key.data(), triton_grpc_error_key.size()}); + + if (it != metadata.end()) { + if (it->second == "true") { + LOG_VERBOSE(2) + << "GRPC: triton_grpc_error mode detected in new grpc stream"; + state->context_->gRPCErrorTracker_->triton_grpc_error_ = true; + } + } + } + + void WriteGRPCErrorResponse(InferHandlerStateType* state) + { + std::lock_guard lock(state->context_->mu_); + // Check if Error not responded previously + // Avoid closing connection twice on multiple errors from core + if (!state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { + state->step_ = Steps::COMPLETE; + state->context_->responder_->Finish(state->status_, state); + // Mark error for this stream + state->context_->gRPCErrorTracker_->MarkGRPCErrorEncountered(); + } + } + const std::string DebugString(InferHandlerStateType* state) { std::string debug_string(""); @@ -793,6 +835,7 @@ class InferHandlerState { bool HandleCancellation( InferHandlerStateType* state, bool rpc_ok, const std::string& name) { + // Check to avoid early exit in case of triton_grpc_error if (!IsCancelled()) { LOG_ERROR << "[INTERNAL] HandleCancellation called even when the context was " @@ -816,7 +859,6 @@ class InferHandlerState { IssueRequestCancellation(); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; - // The state returns true because the CancelExecution // call above would have raised alarm objects on all // pending inflight states objects. This state will @@ -999,6 +1041,8 @@ class InferHandlerState { // Tracks whether the async notification has been delivered by // completion queue. bool received_notification_; + + std::unique_ptr gRPCErrorTracker_; }; // This constructor is used to build a wrapper state object @@ -1090,7 +1134,6 @@ class InferHandlerState { void MarkAsAsyncNotifyState() { async_notify_state_ = true; } bool IsAsyncNotifyState() { return async_notify_state_; } - // Needed in the response handle for classification outputs. TRITONSERVER_Server* tritonserver_; @@ -1227,6 +1270,23 @@ class InferHandler : public HandlerBase { delete state; } + // Simple structure that carries the payload needed for + // response release callback. 
+ struct ResponseReleasePayload final { + State* state_; + std::vector> + shm_regions_info_; + + ResponseReleasePayload( + State* state, + std::vector< + std::shared_ptr>&& + shm_regions_info) + : state_(state), shm_regions_info_(std::move(shm_regions_info)) + { + } + }; + virtual void StartNewRequest() = 0; virtual bool Process(State* state, bool rpc_ok) = 0; bool ExecutePrecondition(InferHandler::State* state); diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 585f88d536..e912e1512c 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -189,7 +189,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) state->context_->responder_->Finish(status, state); return !finished; } - + state->context_->ExtractStateFromHeaders(state); } else if (state->step_ == Steps::READ) { TRITONSERVER_Error* err = nullptr; const inference::ModelInferRequest& request = state->request_; @@ -282,18 +282,32 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // tensors are present in the request. std::list serialized_data; + // Maintain shared pointers(read-only reference) to the shared memory + // block's information for the shared memory regions used by the request. + // These pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. This vector must be cleared in the + // `StreamInferResponseComplete` callback (after inference) to decrease the + // count and permit unregistration. The vector will be included in + // `response_release_payload` for the callback. + std::vector> + shm_regions_info; + if (err == nullptr) { err = InferGRPCToInput( - tritonserver_, shm_manager_, request, &serialized_data, irequest); + tritonserver_, shm_manager_, request, &serialized_data, irequest, + &shm_regions_info); } if (err == nullptr) { err = InferAllocatorPayload( tritonserver_, shm_manager_, request, std::move(serialized_data), - response_queue_, &state->alloc_payload_); + response_queue_, &state->alloc_payload_, &shm_regions_info); } auto request_release_payload = std::make_unique(state->inference_request_); + auto response_release_payload = std::make_unique( + state, std::move(shm_regions_info)); + if (err == nullptr) { err = TRITONSERVER_InferenceRequestSetReleaseCallback( irequest, InferRequestComplete, @@ -303,18 +317,21 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) err = TRITONSERVER_InferenceRequestSetResponseCallback( irequest, allocator_, &state->alloc_payload_ /* response_allocator_userp */, - StreamInferResponseComplete, reinterpret_cast(state)); + StreamInferResponseComplete, + response_release_payload.get() /* response_userp */); } if (err == nullptr) { TRITONSERVER_InferenceTrace* triton_trace = nullptr; #ifdef TRITON_ENABLE_TRACING - GrpcServerCarrier carrier(state->context_->ctx_.get()); - auto start_options = - trace_manager_->GetTraceStartOptions(carrier, request.model_name()); - state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); - if (state->trace_ != nullptr) { - triton_trace = state->trace_->trace_; + if (trace_manager_ != nullptr) { + GrpcServerCarrier carrier(state->context_->ctx_.get()); + auto start_options = + trace_manager_->GetTraceStartOptions(carrier, request.model_name()); + state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); + if (state->trace_ != nullptr) { + triton_trace = state->trace_->trace_; + } } #endif // TRITON_ENABLE_TRACING @@ -330,8 +347,9 @@ 
ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // irequest to handle gRPC stream cancellation. if (err == nullptr) { state->context_->InsertInflightState(state); - // The payload will be cleaned in request release callback. + // The payload will be cleaned in release callback. request_release_payload.release(); + response_release_payload.release(); } else { // If there was an error then enqueue the error response and show // it to be ready for writing. @@ -355,7 +373,6 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) GrpcStatusUtil::Create(&status, err); TRITONSERVER_ErrorDelete(err); response->set_error_message(status.error_message()); - response->mutable_infer_response()->Clear(); // repopulate the id so that client knows which request failed. response->mutable_infer_response()->set_id(request.id()); @@ -522,15 +539,18 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) } else if (state->step_ == Steps::WRITEREADY) { // Finish the state if all the transactions associated with // the state have completed. - if (state->IsComplete()) { - state->context_->DecrementRequestCounter(); - finished = Finish(state); - } else { - LOG_ERROR << "Should not print this! Decoupled should NOT write via " - "WRITEREADY!"; - // Remove the state from the completion queue - std::lock_guard lock(state->step_mtx_); - state->step_ = Steps::ISSUED; + std::lock_guard lk1(state->context_->mu_); + { + if (state->IsComplete()) { + state->context_->DecrementRequestCounter(); + finished = Finish(state); + } else { + LOG_ERROR << "Should not print this! Decoupled should NOT write via " + "WRITEREADY!"; + // Remove the state from the completion queue + std::lock_guard lock(state->step_mtx_); + state->step_ = Steps::ISSUED; + } } } } @@ -595,8 +615,17 @@ ModelStreamInferHandler::StreamInferResponseComplete( TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags, void* userp) { - State* state = reinterpret_cast(userp); - + ResponseReleasePayload* response_release_payload( + static_cast(userp)); + auto state = response_release_payload->state_; + + // Ignore Response from CORE in case GRPC Strict as we dont care about + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { + std::lock_guard lock(state->context_->mu_); + if (state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { + return; + } + } // Increment the callback index uint32_t response_index = state->cb_count_++; @@ -643,6 +672,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( if (is_complete) { state->step_ = Steps::CANCELLED; state->context_->PutTaskBackToQueue(state); + delete response_release_payload; } state->complete_ = is_complete; @@ -671,14 +701,28 @@ ModelStreamInferHandler::StreamInferResponseComplete( } else { LOG_ERROR << "expected the response allocator to have added the response"; } - if (err != nullptr) { failed = true; ::grpc::Status status; + // Converts CORE errors to GRPC error codes GrpcStatusUtil::Create(&status, err); response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { + state->status_ = status; + // Finish only once, if backend ignores cancellation + LOG_VERBOSE(1) << "GRPC streaming error detected with status: " + << status.error_code() << "Closing stream connection." 
+ << std::endl; + state->context_->WriteGRPCErrorResponse(state); + TRITONSERVER_ErrorDelete(err); + LOG_TRITONSERVER_ERROR( + TRITONSERVER_InferenceResponseDelete(iresponse), + "deleting GRPC inference response"); + delete response_release_payload; + return; + } } TRITONSERVER_ErrorDelete(err); @@ -756,6 +800,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( if (is_complete) { state->step_ = Steps::CANCELLED; state->context_->PutTaskBackToQueue(state); + delete response_release_payload; } state->complete_ = is_complete; @@ -800,6 +845,48 @@ ModelStreamInferHandler::StreamInferResponseComplete( } state->complete_ = is_complete; } + + if (is_complete) { + delete response_release_payload; + } +} + +// Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, +// indicating we have closed the stream and initiated the cancel flow +void +gRPCErrorTracker::MarkGRPCErrorHandlingComplete() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_HANDLING_COMPLETE; +} + +// Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. +bool +gRPCErrorTracker::CheckAndUpdateGRPCError() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::ERROR_ENCOUNTERED) { + // Change the state to ERROR_HANDLING_COMPLETE as we have called + // HandleCancellation + MarkGRPCErrorHandlingComplete(); + return true; + } + return false; +} + +// Marks error after it has been responded to +void +gRPCErrorTracker::MarkGRPCErrorEncountered() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_ENCOUNTERED; +} + +// Checks if error already responded to in triton_grpc_error mode +bool +gRPCErrorTracker::GRPCErrorEncountered() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::NONE) { + return false; + } + return true; } }}} // namespace triton::server::grpc diff --git a/src/http_server.cc b/src/http_server.cc index 68b22ae649..99aed411b5 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -1181,6 +1181,7 @@ HTTPAPIServer::HTTPAPIServer( HTTPAPIServer::~HTTPAPIServer() { + LOG_VERBOSE(1) << "~HTTPAPIServer()"; if (server_metadata_err_ != nullptr) { TRITONSERVER_ErrorDelete(server_metadata_err_); } @@ -1809,6 +1810,10 @@ HTTPAPIServer::HandleTrace(evhtp_request_t* req, const std::string& model_name) } #ifdef TRITON_ENABLE_TRACING + if (trace_manager_ == nullptr) { + return; + } + TRITONSERVER_InferenceTraceLevel level = TRITONSERVER_TRACE_LEVEL_DISABLED; uint32_t rate; int32_t count; @@ -2680,9 +2685,13 @@ HTTPAPIServer::ParseJsonTritonIO( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager_->GetMemoryInfo( shm_region, shm_offset, byte_size, &base, &memory_type, - &memory_type_id)); + &memory_type_id, &shm_info)); + infer_req->AddShmRegionInfo(shm_info); + if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU cudaIpcMemHandle_t* cuda_handle; @@ -2795,9 +2804,12 @@ HTTPAPIServer::ParseJsonTritonIO( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager_->GetMemoryInfo( - shm_region, offset, byte_size, &base, &memory_type, - &memory_type_id)); + shm_region, offset, byte_size, &base, &memory_type, &memory_type_id, + &shm_info)); + infer_req->AddShmRegionInfo(shm_info); if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU @@ -3225,8 +3237,11 @@ HTTPAPIServer::HandleGenerate( // If tracing is enabled see if this request should be traced. 
TRITONSERVER_InferenceTrace* triton_trace = nullptr; - std::shared_ptr trace = - StartTrace(req, model_name, &triton_trace); + std::shared_ptr trace; + if (trace_manager_) { + // If tracing is enabled see if this request should be traced. + trace = StartTrace(req, model_name, &triton_trace); + } std::map input_metadata; triton::common::TritonJson::Value meta_data_root; @@ -3549,6 +3564,8 @@ HTTPAPIServer::GenerateRequestClass::ExactMappingInput( } } + // get original element count back + element_cnt = tensor_data.IsArray() ? tensor_data.ArraySize() : 1; serialized_data_.emplace_back(); std::vector& serialized = serialized_data_.back(); serialized.resize(byte_size); @@ -3586,10 +3603,12 @@ HTTPAPIServer::HandleInfer( RETURN_AND_RESPOND_IF_ERR( req, CheckTransactionPolicy(req, model_name, requested_model_version)); - // If tracing is enabled see if this request should be traced. TRITONSERVER_InferenceTrace* triton_trace = nullptr; - std::shared_ptr trace = - StartTrace(req, model_name, &triton_trace); + std::shared_ptr trace; + if (trace_manager_) { + // If tracing is enabled see if this request should be traced. + trace = StartTrace(req, model_name, &triton_trace); + } // Decompress request body if it is compressed in supported type evbuffer* decompressed_buffer = nullptr; @@ -4696,6 +4715,35 @@ HTTPAPIServer::Create( return nullptr; } + +TRITONSERVER_Error* +HTTPAPIServer::Create( + std::shared_ptr& server, + const UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service) +{ + int port; + bool reuse_port; + std::string address; + std::string header_forward_pattern; + int thread_count; + + RETURN_IF_ERR(GetValue(options, "port", &port)); + RETURN_IF_ERR(GetValue(options, "reuse_port", &reuse_port)); + RETURN_IF_ERR(GetValue(options, "address", &address)); + RETURN_IF_ERR( + GetValue(options, "header_forward_pattern", &header_forward_pattern)); + RETURN_IF_ERR(GetValue(options, "thread_count", &thread_count)); + + return Create( + server, trace_manager, shm_manager, port, reuse_port, address, + header_forward_pattern, thread_count, restricted_features, service); +} + + bool HTTPAPIServer::RespondIfRestricted( evhtp_request_t* req, const Restriction& restriction) diff --git a/src/http_server.h b/src/http_server.h index 077324cba3..3949f97e27 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -196,6 +196,14 @@ class HTTPAPIServer : public HTTPServer { const RestrictedFeatures& restricted_apis, std::unique_ptr* http_server); + static TRITONSERVER_Error* Create( + std::shared_ptr& server, + const UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service); + virtual ~HTTPAPIServer(); // @@ -303,6 +311,13 @@ class HTTPAPIServer : public HTTPServer { static void ReplyCallback(evthr_t* thr, void* arg, void* shared); + void AddShmRegionInfo( + const std::shared_ptr& + shm_info) + { + shm_regions_info_.push_back(shm_info); + } + protected: TRITONSERVER_Server* server_{nullptr}; evhtp_request_t* req_{nullptr}; @@ -322,6 +337,14 @@ class HTTPAPIServer : public HTTPServer { // TRITONSERVER_ServerInferAsync (except for cancellation). std::shared_ptr triton_request_{nullptr}; + // Maintain shared pointers(read-only reference) to the shared memory + // block's information for the shared memory regions used by the request. 
+ // These pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. This vector must be cleared when no + // longer needed to decrease the count and permit unregistration. + std::vector> + shm_regions_info_; + evhtp_res response_code_{EVHTP_RES_OK}; }; diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt new file mode 100644 index 0000000000..f447f7eab2 --- /dev/null +++ b/src/python/CMakeLists.txt @@ -0,0 +1,78 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required(VERSION 3.18) + +message("tritonfrontend python package build skipped when relevant frontends are disabled.") +message("In order to build tritonfrontend, the following flags are needed: -DTRITON_ENABLE_HTTP=ON -DTRITON_ENABLE_GRPC=ON") + +# [DLIS-7232] tritonfrontend package expects all supported packages to be +# built, without any check/verification for respective frontend enable flags. +# Support for partial builds(ex: HTTP but not gRPC) will be addressed later. 
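Stepping back to the shared-memory bookkeeping added to the HTTP and gRPC frontends above: the held region-info references keep a registered region alive while a request still uses it. A hedged client-side sketch of the interaction, assuming the tritonclient HTTP API and the tritonclient.utils.shared_memory helpers; the region name, key, and sizes are illustrative.

import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.shared_memory as shm

client = httpclient.InferenceServerClient("localhost:8000")
data = np.ones((1, 16), dtype=np.int32)
byte_size = data.size * data.itemsize

# Create and register a system shared-memory region holding the input.
handle = shm.create_shared_memory_region("input_region", "/input_key", byte_size)
shm.set_shared_memory_region(handle, [data])
client.register_system_shared_memory("input_region", "/input_key", byte_size)

inp = httpclient.InferInput("INPUT0", data.shape, "INT32")
inp.set_shared_memory("input_region", byte_size)
# ... issue inference requests that reference the region ...

# Per the comments above, unregistering while a request still holds a
# reference only takes effect once that request's responses complete.
client.unregister_system_shared_memory("input_region")
shm.destroy_shared_memory_region(handle)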
+if(NOT (${TRITON_ENABLE_HTTP} AND ${TRITON_ENABLE_GRPC})) + return() +endif() + +add_subdirectory(tritonfrontend) + +file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION ${TRITON_VERSION}) +configure_file(../../LICENSE LICENSE.txt COPYONLY) +configure_file(setup.py setup.py @ONLY) + +set(WHEEL_DEPENDS + ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION + ${CMAKE_CURRENT_BINARY_DIR}/LICENSE.txt + ${CMAKE_CURRENT_BINARY_DIR}/setup.py + ${CMAKE_CURRENT_BINARY_DIR}/tritonfrontend + py-bindings +) + +set(wheel_stamp_file "stamp.whl") + +add_custom_command( + OUTPUT "${wheel_stamp_file}" + COMMAND python3 + ARGS + "${CMAKE_CURRENT_SOURCE_DIR}/build_wheel.py" + --dest-dir "${CMAKE_CURRENT_BINARY_DIR}/generic" + --binding-path $ + DEPENDS ${WHEEL_DEPENDS} +) + +add_custom_target( + frontend-server-wheel ALL + DEPENDS + "${wheel_stamp_file}" +) + + +# Wheel +set(WHEEL_OUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/generic/wheel/dist/") +install( + DIRECTORY + ${WHEEL_OUT_DIR} + DESTINATION "${CMAKE_INSTALL_PREFIX}/python" +) \ No newline at end of file diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py new file mode 100755 index 0000000000..875dd32a70 --- /dev/null +++ b/src/python/build_wheel.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +import os +import pathlib +import re +import shutil +import subprocess +import sys +from distutils.dir_util import copy_tree +from tempfile import mkstemp + + +def fail_if(p, msg): + if p: + print("error: {}".format(msg), file=sys.stderr) + sys.exit(1) + + +def mkdir(path): + pathlib.Path(path).mkdir(parents=True, exist_ok=True) + + +def touch(path): + pathlib.Path(path).touch() + + +def cpdir(src, dest): + copy_tree(src, dest, preserve_symlinks=1) + + +def sed(pattern, replace, source, dest=None): + name = None + if dest: + name = dest + if dest is None: + fd, name = mkstemp() + + with open(source, "r") as fin, open(name, "w") as fout: + for line in fin: + out = re.sub(pattern, replace, line) + fout.write(out) + + if not dest: + shutil.copyfile(name, source) + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--dest-dir", type=str, required=True, help="Destination directory." + ) + parser.add_argument( + "--binding-path", + type=str, + required=True, + help="Path to Triton Frontend Python binding.", + ) + + FLAGS = parser.parse_args() + + FLAGS.triton_version = None + with open("TRITON_VERSION", "r") as vfile: + FLAGS.triton_version = vfile.readline().strip() + + FLAGS.whl_dir = os.path.join(FLAGS.dest_dir, "wheel") + + print("=== Building in: {}".format(os.getcwd())) + print("=== Using builddir: {}".format(FLAGS.whl_dir)) + print("Adding package files") + mkdir(os.path.join(FLAGS.whl_dir, "tritonfrontend")) + shutil.copy( + "tritonfrontend/__init__.py", os.path.join(FLAGS.whl_dir, "tritonfrontend") + ) + # Type checking marker file indicating support for type checkers. + # https://peps.python.org/pep-0561/ + shutil.copy( + "tritonfrontend/py.typed", os.path.join(FLAGS.whl_dir, "tritonfrontend") + ) + cpdir("tritonfrontend/_c", os.path.join(FLAGS.whl_dir, "tritonfrontend", "_c")) + cpdir("tritonfrontend/_api", os.path.join(FLAGS.whl_dir, "tritonfrontend", "_api")) + PYBIND_LIB = os.path.basename(FLAGS.binding_path) + shutil.copyfile( + FLAGS.binding_path, + os.path.join(FLAGS.whl_dir, "tritonfrontend", "_c", PYBIND_LIB), + ) + + shutil.copyfile("LICENSE.txt", os.path.join(FLAGS.whl_dir, "LICENSE.txt")) + shutil.copyfile("setup.py", os.path.join(FLAGS.whl_dir, "setup.py")) + + os.chdir(FLAGS.whl_dir) + print("=== Building wheel") + args = ["python3", "setup.py", "bdist_wheel"] + + wenv = os.environ.copy() + wenv["VERSION"] = FLAGS.triton_version + wenv["TRITON_PYBIND"] = PYBIND_LIB + p = subprocess.Popen(args, env=wenv) + p.wait() + fail_if(p.returncode != 0, "setup.py failed") + + cpdir("dist", FLAGS.dest_dir) + + print(f"=== Output wheel file is in: {FLAGS.dest_dir}") + touch(os.path.join(FLAGS.dest_dir, "stamp.whl")) + + +if __name__ == "__main__": + main() diff --git a/src/python/examples/example.py b/src/python/examples/example.py new file mode 100644 index 0000000000..2d2ca78920 --- /dev/null +++ b/src/python/examples/example.py @@ -0,0 +1,84 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib + +import numpy as np +import tritonclient.http as httpclient +import tritonserver +from tritonfrontend import KServeHttp + + +def main(): + # Constructing path to Model Repository + model_path = f"{pathlib.Path(__file__).parent.resolve()}/example_model_repository" + # Selecting Server Options + server_options = tritonserver.Options( + server_id="ExampleServer", + model_repository=model_path, + log_error=True, + log_info=True, + log_warn=True, + ) + + # Creating server instance + server = tritonserver.Server(server_options).start(wait_until_ready=True) + + # Selecting Options for KServeHttp Frontend + http_options = KServeHttp.Options(port=8005) + + # or http_service = KServeHttp.Server(server, http_options) & http_service.stop() + with KServeHttp(server, http_options) as http_service: + # The identity model returns an exact duplicate of the input data as output + model_name = "identity" + url = "localhost:8005" + + # Create a Triton client + client = httpclient.InferenceServerClient(url=url) + + # Prepare input data + input_data = np.array([["Roger Roger"]], dtype=object) + + # Create input and output objects + inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + + results = client.infer(model_name, inputs=inputs) + + # Get the output data + output_data = results.as_numpy("OUTPUT0") + + print("--------------------- INFERENCE RESULTS ---------------------") + print("Output data:", output_data) + print("-------------------------------------------------------------") + + server.stop() + + +if __name__ == "__main__": + main() diff --git a/src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb b/src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb new file mode 100755 index 0000000000..63f78fecb4 Binary files /dev/null and b/src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb differ diff --git a/src/python/examples/example_model_repository/identity/config.pbtxt b/src/python/examples/example_model_repository/identity/config.pbtxt new file mode 100644 index 0000000000..ae83e47556 --- /dev/null +++ b/src/python/examples/example_model_repository/identity/config.pbtxt @@ -0,0 +1,44 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity" +platform: "tensorflow_savedmodel" +max_batch_size: 8 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ -1 ] + } +] diff --git a/src/python/setup.py b/src/python/setup.py new file mode 100755 index 0000000000..ee1e7c0ec4 --- /dev/null +++ b/src/python/setup.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import sys + +from setuptools import find_packages, setup + +if "--plat-name" in sys.argv: + PLATFORM_FLAG = sys.argv[sys.argv.index("--plat-name") + 1] +else: + PLATFORM_FLAG = "any" + +if "VERSION" not in os.environ: + raise Exception("envvar VERSION must be specified") + +VERSION = os.environ["VERSION"] + +try: + from wheel.bdist_wheel import bdist_wheel as _bdist_wheel + + class bdist_wheel(_bdist_wheel): + def finalize_options(self): + _bdist_wheel.finalize_options(self) + self.root_is_pure = False + + def get_tag(self): + pyver, abi, plat = "py3", "none", PLATFORM_FLAG + return pyver, abi, plat + +except ImportError: + bdist_wheel = None + +this_directory = os.path.abspath(os.path.dirname(__file__)) + +data_files = [ + ("", ["LICENSE.txt"]), +] + +# Type checking marker file indicating support for type checkers. +# https://peps.python.org/pep-0561/ +# Type hints for c extension generated by mypy +platform_package_data = [ + os.environ["TRITON_PYBIND"], + "py.typed", + "_c/__init__.pyi", + "_c/triton_bindings.pyi", +] + +gpu_extras = ["cupy-cuda12x"] +test_extras = ["pytest"] +all_extras = gpu_extras + test_extras + +setup( + name="tritonfrontend", + version=VERSION, + author="NVIDIA Inc.", + author_email="sw-dl-triton@nvidia.com", + description="Triton Inference Server In-Process Python API", + license="BSD", + url="https://developer.nvidia.com/nvidia-triton-inference-server", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: Information Technology", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Image Recognition", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Topic :: Utilities", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Environment :: Console", + "Natural Language :: English", + "Operating System :: OS Independent", + ], + packages=find_packages(), + package_data={ + "": platform_package_data, + }, + zip_safe=False, + cmdclass={"bdist_wheel": bdist_wheel}, + data_files=data_files, + install_requires=["tritonserver", "pydantic"], + extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras}, +) diff --git a/src/python/tritonfrontend/CMakeLists.txt b/src/python/tritonfrontend/CMakeLists.txt new file mode 100644 index 0000000000..e22be30602 --- /dev/null +++ b/src/python/tritonfrontend/CMakeLists.txt @@ -0,0 +1,181 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required(VERSION 3.18) + +# ================= Ensures Package is Structured Properly ================== +# Top level module entry point and typed marker +file(COPY __init__.py DESTINATION .) +file(COPY py.typed DESTINATION .) +# Copy the '__init__.py' for the '_c' module +file(COPY _c/__init__.py DESTINATION ./_c/.) +file(COPY _c/__init__.pyi DESTINATION ./_c/.) +file(COPY _c/tritonfrontend_bindings.pyi DESTINATION ./_c/.) +# Find and copy _api modules +file(GLOB PYTHON_MODULE_FILES ./_api/*.py) +file(COPY ${PYTHON_MODULE_FILES} DESTINATION ./_api/.) +# ================================= END ===================================== + + +# =================== Downloading and Installing pybind11 =================== +include(FetchContent) + +FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11.git + GIT_TAG v2.13.1 + GIT_SHALLOW ON +) + +FetchContent_MakeAvailable(pybind11) +# ================================= END ===================================== + +# ================== Collect the Dependencies =============================== +set( + PYTHON_FRONTEND_BINDING_DEPS + ../../shared_memory_manager.h + ../../shared_memory_manager.cc + ../../data_compressor.h + ../../common.h + ../../common.cc + ../../restricted_features.h + ../../tracer.h + $<$:../../tracer.cc> + ../../classification.cc +) + +set(PY_BINDING_DEPENDENCY_LIBS + triton-common-json + triton-common-logging + triton-core-serverapi + triton-core-serverstub + ) + +# Conditional Linking Based on Flags +if(${TRITON_ENABLE_HTTP}) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + http-endpoint-library + ) +endif() + +if(${TRITON_ENABLE_GRPC}) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + grpc-endpoint-library + ) +endif() + +if(${TRITON_ENABLE_GPU}) + find_package(CUDAToolkit REQUIRED) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + CUDA::cudart + ) +endif() + +if(${TRITON_ENABLE_TRACING}) + message("TRACING/STATS IS CURRENTLY NOT SUPPORTED.") + find_package(absl CONFIG REQUIRED) + find_package(CURL CONFIG REQUIRED) + find_package(nlohmann_json CONFIG REQUIRED) + find_package(opentelemetry-cpp CONFIG REQUIRED) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + tracing-library + ) +endif() + +# ===================== End of Collection =================================== + + +# ================== Create Python Frontend Bindings ======================== +set( + PYTHON_FRONTEND_BINDING_SRCS + _c/tritonfrontend.h + _c/tritonfrontend_pybind.cc +) + +pybind11_add_module( + py-bindings + MODULE + ${PYTHON_FRONTEND_BINDING_DEPS} + ${PYTHON_FRONTEND_BINDING_SRCS} +) + +target_include_directories(py-bindings PRIVATE ${CMAKE_SOURCE_DIR}/src) + +target_link_libraries( + py-bindings + PRIVATE + ${PY_BINDING_DEPENDENCY_LIBS} +) + +if(${TRITON_ENABLE_HTTP}) + 
target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_HTTP=1 + ) +endif() + +if(${TRITON_ENABLE_GRPC}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_GRPC=1 + ) +endif() + +if(${TRITON_ENABLE_GPU}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_GPU=1 + PRIVATE TRITON_MIN_COMPUTE_CAPABILITY=${TRITON_MIN_COMPUTE_CAPABILITY} + ) +endif() + +if(${TRITON_ENABLE_TRACING}) + target_include_directories( + py-bindings + PRIVATE ${OPENTELEMETRY_CPP_INCLUDE_DIRS} + ) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_TRACING=1 + ) +endif() + +if(${TRITON_ENABLE_STATS}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_STATS=1 + ) +endif() + + +set_property(TARGET py-bindings PROPERTY OUTPUT_NAME tritonfrontend_bindings) + +set_target_properties( + py-bindings + PROPERTIES + BUILD_RPATH "$ORIGIN:/opt/tritonserver/lib" +) +# ===================== End of Python Bindings ============================== diff --git a/src/python/tritonfrontend/__init__.py b/src/python/tritonfrontend/__init__.py new file mode 100644 index 0000000000..48eaf64e8b --- /dev/null +++ b/src/python/tritonfrontend/__init__.py @@ -0,0 +1,33 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# triton/server/src/python/tritonfrontend/__init__.py + +import builtins +from importlib.metadata import PackageNotFoundError, version + +from tritonfrontend._api._kservegrpc import KServeGrpc +from tritonfrontend._api._kservehttp import KServeHttp diff --git a/src/python/tritonfrontend/__init__.pyi b/src/python/tritonfrontend/__init__.pyi new file mode 100644 index 0000000000..0afb0cb886 --- /dev/null +++ b/src/python/tritonfrontend/__init__.pyi @@ -0,0 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Need to automate stubgen process as a part of build: https://github.com/triton-inference-server/server/pull/7501#discussion_r1720135228 diff --git a/src/python/tritonfrontend/_api/__init__.py b/src/python/tritonfrontend/_api/__init__.py new file mode 100644 index 0000000000..dc1c939c66 --- /dev/null +++ b/src/python/tritonfrontend/_api/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
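With the package __init__ above, KServeHttp and KServeGrpc become the public entry points of tritonfrontend. A minimal usage sketch, assuming a model repository already exists on disk; the repository path and ports are illustrative, and the option values shown are simply the defaults declared in _api/_kservehttp.py and _api/_kservegrpc.py:

    import tritonserver
    from tritonfrontend import KServeGrpc, KServeHttp

    # Start the in-process server (see src/python/examples/example.py for a fuller version).
    server = tritonserver.Server(
        tritonserver.Options(model_repository="/path/to/model_repository")
    ).start(wait_until_ready=True)

    # Expose the same server over both protocols; each frontend is started on
    # __enter__ and stopped on __exit__.
    with KServeHttp(server, KServeHttp.Options(port=8000)) as http_service, KServeGrpc(
        server, KServeGrpc.Options(port=8001)
    ) as grpc_service:
        ...  # drive requests with tritonclient.http / tritonclient.grpc

    server.stop()
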
diff --git a/src/python/tritonfrontend/_api/_error_mapping.py b/src/python/tritonfrontend/_api/_error_mapping.py new file mode 100644 index 0000000000..39a1e9aeb1 --- /dev/null +++ b/src/python/tritonfrontend/_api/_error_mapping.py @@ -0,0 +1,48 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import tritonserver +from tritonfrontend._c.tritonfrontend_bindings import ( + AlreadyExistsError, + InternalError, + InvalidArgumentError, + NotFoundError, + TritonError, + UnavailableError, + UnknownError, + UnsupportedError, +) + +ERROR_MAPPING = { + TritonError: tritonserver.TritonError, + NotFoundError: tritonserver.NotFoundError, + UnknownError: tritonserver.UnknownError, + InternalError: tritonserver.InternalError, + InvalidArgumentError: tritonserver.InvalidArgumentError, + UnavailableError: tritonserver.UnavailableError, + AlreadyExistsError: tritonserver.AlreadyExistsError, + UnsupportedError: tritonserver.UnsupportedError, +} diff --git a/src/python/tritonfrontend/_api/_kservegrpc.py b/src/python/tritonfrontend/_api/_kservegrpc.py new file mode 100644 index 0000000000..b8f199ac53 --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservegrpc.py @@ -0,0 +1,136 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys +from enum import IntEnum +from typing import Union + +import tritonserver +from pydantic import Field +from pydantic.dataclasses import dataclass +from tritonfrontend._api._error_mapping import ERROR_MAPPING +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError, + TritonError, + TritonFrontendGrpc, +) + + +# Enum (mirroring C++ format) +class Grpc_compression_level(IntEnum): + NONE = 0 + LOW = 1 + MED = 2 + HIGH = 3 + COUNT = 4 + + +class KServeGrpc: + Grpc_compression_level = ( + Grpc_compression_level # Include the enum as a class attribute + ) + + # triton::server::grpc::Options + @dataclass + class Options: + # triton::server::grpc::SocketOptions + address: str = "0.0.0.0" + port: int = Field(8001, ge=0, le=65535) + reuse_port: bool = False + # triton::server::grpc::SslOptions + use_ssl: bool = False + server_cert: str = "" + server_key: str = "" + root_cert: str = "" + use_mutual_auth: bool = False + # triton::server::grpc::KeepAliveOptions + keepalive_time_ms: int = Field(7_200_000, ge=0) + keepalive_timeout_ms: int = Field(20_000, ge=0) + keepalive_permit_without_calls: bool = False + http2_max_pings_without_data: int = Field(2, ge=0) + http2_min_recv_ping_interval_without_data_ms: int = Field(300_000, ge=0) + http2_max_ping_strikes: int = Field(2, ge=0) + max_connection_age_ms: int = Field(0, ge=0) + max_connection_age_grace_ms: int = Field(0, ge=0) + + # triton::server::grpc::Options + + infer_compression_level: Union[ + int, Grpc_compression_level + ] = Grpc_compression_level.NONE + infer_allocation_pool_size: int = Field(8, ge=0) + forward_header_pattern: str = "" + # DLIS-7215: Add restricted protocol support + # restricted_protocols: str = "" + + def __post_init__(self): + if isinstance(self.infer_compression_level, Grpc_compression_level): + self.infer_compression_level = self.infer_compression_level.value + + def __init__(self, server: tritonserver, options: "KServeGrpc.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer + + # If no options provided, default options are selected + if options is None: + options = KServeGrpc.Options() + + if not isinstance(options, KServeGrpc.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeGrpc.Options" + ) + + # Converts dataclass instance -> python dictionary -> unordered_map> + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + + self.triton_frontend = TritonFrontendGrpc(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... 
from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None + + def __enter__(self): + self.triton_frontend.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None + + def start(self): + try: + self.triton_frontend.start() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None + + def stop(self): + try: + self.triton_frontend.stop() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None diff --git a/src/python/tritonfrontend/_api/_kservegrpc.pyi b/src/python/tritonfrontend/_api/_kservegrpc.pyi new file mode 100644 index 0000000000..c81d3d6afc --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservegrpc.pyi @@ -0,0 +1,74 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from enum import IntEnum + +import tritonserver +from _typeshed import Incomplete +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError as InvalidArgumentError, +) +from tritonfrontend._c.tritonfrontend_bindings import ( + TritonFrontendGrpc as TritonFrontendGrpc, +) + +class Grpc_compression_level(IntEnum): + NONE = 0 + LOW = 1 + MED = 2 + HIGH = 3 + COUNT = 4 + +class KServeGrpc: + Grpc_compression_level = Grpc_compression_level + class Options: + address: str + port: int + reuse_port: bool + use_ssl: bool + server_cert: str + server_key: str + root_cert: str + use_mutual_auth: bool + keepalive_time_ms: int + keepalive_timeout_ms: int + keepalive_permit_without_calls: bool + http2_max_pings_without_data: int + http2_min_recv_ping_interval_without_data_ms: int + http2_max_ping_strikes: int + max_connection_age_ms: int + max_connection_age_grace_ms: int + infer_compression_level: int | Grpc_compression_level + infer_allocation_pool_size: int + forward_header_pattern: str + def __post_init__(self) -> None: ... 
+ class Server: + triton_frontend: Incomplete + def __init__(self, server: tritonserver, options: KServeGrpc.Options = None) -> None: ... + def __enter__(self): ... + def __exit__(self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: types.TracebackType | None) -> None: ... + def start(self): ... + def stop(self): ... diff --git a/src/python/tritonfrontend/_api/_kservehttp.py b/src/python/tritonfrontend/_api/_kservehttp.py new file mode 100644 index 0000000000..4a5abef4a3 --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservehttp.py @@ -0,0 +1,96 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import sys +from typing import Union + +import tritonserver +from pydantic import Field +from pydantic.dataclasses import dataclass +from tritonfrontend._api._error_mapping import ERROR_MAPPING +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError, + TritonError, + TritonFrontendHttp, +) + + +class KServeHttp: + @dataclass + class Options: + address: str = "0.0.0.0" + port: int = Field(8000, ge=0, le=65535) + reuse_port: bool = False + thread_count: int = Field(8, ge=0) + header_forward_pattern: str = "" + # DLIS-7215: Add restricted protocol support + # restricted_protocols: list + + def __init__(self, server: tritonserver, options: "KServeHttp.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer + + # If no options provided, default options are selected + if options is None: + options = KServeHttp.Options() + + if not isinstance(options, KServeHttp.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeHttp.Options" + ) + + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + # Converts dataclass instance -> python dictionary -> unordered_map> + + self.triton_frontend = TritonFrontendHttp(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... 
from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None + + def __enter__(self): + self.triton_frontend.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None + + def start(self): + try: + self.triton_frontend.start() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None + + def stop(self): + try: + self.triton_frontend.stop() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None diff --git a/src/python/tritonfrontend/_api/_kservehttp.pyi b/src/python/tritonfrontend/_api/_kservehttp.pyi new file mode 100644 index 0000000000..60f3997f39 --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservehttp.pyi @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import tritonserver +from _typeshed import Incomplete +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError as InvalidArgumentError, +) +from tritonfrontend._c.tritonfrontend_bindings import ( + TritonFrontendHttp as TritonFrontendHttp, +) + +class KServeHttp: + class Options: + address: str + port: int + reuse_port: bool + thread_count: int + header_forward_pattern: str + class Server: + triton_frontend: Incomplete + def __init__(self, server: tritonserver, options: KServeHttp.Options = None) -> None: ... + def __enter__(self): ... + def __exit__(self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: types.TracebackType | None) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... diff --git a/src/python/tritonfrontend/_c/__init__.py b/src/python/tritonfrontend/_c/__init__.py new file mode 100644 index 0000000000..3e892ede64 --- /dev/null +++ b/src/python/tritonfrontend/_c/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from .tritonfrontend_bindings import * diff --git a/src/python/tritonfrontend/_c/__init__.pyi b/src/python/tritonfrontend/_c/__init__.pyi new file mode 100644 index 0000000000..99eaf9dace --- /dev/null +++ b/src/python/tritonfrontend/_c/__init__.pyi @@ -0,0 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from tritonfrontend._c.tritonfrontend_bindings import * diff --git a/src/python/tritonfrontend/_c/tritonfrontend.h b/src/python/tritonfrontend/_c/tritonfrontend.h new file mode 100644 index 0000000000..172147f566 --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend.h @@ -0,0 +1,139 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include // For shared_ptr +#include +#include + +#include "../../../common.h" +#include "../../../restricted_features.h" +#include "../../../shared_memory_manager.h" +#include "../../../tracer.h" +#include "triton/common/logging.h" +#include "triton/core/tritonserver.h" + + +struct TRITONSERVER_Server {}; + +namespace triton { namespace server { namespace python { + +// base exception for all Triton error code +struct TritonError : public std::runtime_error { + explicit TritonError(const std::string& what) : std::runtime_error(what) {} +}; + +// triton::core::python exceptions map 1:1 to TRITONSERVER_Error_Code. 
+struct UnknownError : public TritonError { + explicit UnknownError(const std::string& what) : TritonError(what) {} +}; +struct InternalError : public TritonError { + explicit InternalError(const std::string& what) : TritonError(what) {} +}; +struct NotFoundError : public TritonError { + explicit NotFoundError(const std::string& what) : TritonError(what) {} +}; +struct InvalidArgumentError : public TritonError { + explicit InvalidArgumentError(const std::string& what) : TritonError(what) {} +}; +struct UnavailableError : public TritonError { + explicit UnavailableError(const std::string& what) : TritonError(what) {} +}; +struct UnsupportedError : public TritonError { + explicit UnsupportedError(const std::string& what) : TritonError(what) {} +}; +struct AlreadyExistsError : public TritonError { + explicit AlreadyExistsError(const std::string& what) : TritonError(what) {} +}; + +void +ThrowIfError(TRITONSERVER_Error* err) +{ + if (err == nullptr) { + return; + } + std::shared_ptr managed_err( + err, TRITONSERVER_ErrorDelete); + std::string msg = TRITONSERVER_ErrorMessage(err); + switch (TRITONSERVER_ErrorCode(err)) { + case TRITONSERVER_ERROR_INTERNAL: + throw InternalError(std::move(msg)); + case TRITONSERVER_ERROR_NOT_FOUND: + throw NotFoundError(std::move(msg)); + case TRITONSERVER_ERROR_INVALID_ARG: + throw InvalidArgumentError(std::move(msg)); + case TRITONSERVER_ERROR_UNAVAILABLE: + throw UnavailableError(std::move(msg)); + case TRITONSERVER_ERROR_UNSUPPORTED: + throw UnsupportedError(std::move(msg)); + case TRITONSERVER_ERROR_ALREADY_EXISTS: + throw AlreadyExistsError(std::move(msg)); + default: + throw UnknownError(std::move(msg)); + } +} + + +template +class TritonFrontend { + private: + std::shared_ptr server_; + std::unique_ptr service; + triton::server::RestrictedFeatures restricted_features; + // TODO: [DLIS-7194] Add support for TraceManager & SharedMemoryManager + // triton::server::TraceManager trace_manager_; + // triton::server::SharedMemoryManager shm_manager_; + + public: + TritonFrontend(uintptr_t server_mem_addr, UnorderedMapType data) + { + TRITONSERVER_Server* server_ptr = + reinterpret_cast(server_mem_addr); + + server_.reset(server_ptr, EmptyDeleter); + + ThrowIfError(FrontendServer::Create( + server_, data, nullptr /* TraceManager */, + nullptr /* SharedMemoryManager */, restricted_features, &service)); + }; + + // TODO: [DLIS-7194] Add support for TraceManager & SharedMemoryManager + // TritonFrontend( + // uintptr_t server_mem_addr, UnorderedMapType data, + // TraceManager trace_manager, SharedMemoryManager shm_manager) + + void StartService() { ThrowIfError(service->Start()); }; + void StopService() { ThrowIfError(service->Stop()); }; + + // The frontend does not own the TRITONSERVER_Server* object. + // Hence, deleting the underlying server instance, + // will cause a double-free when the core bindings attempt to + // delete the TRITONSERVER_Server instance. + static void EmptyDeleter(TRITONSERVER_Server* obj){}; +}; + +}}} // namespace triton::server::python diff --git a/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi b/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi new file mode 100644 index 0000000000..535693a5cb --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi @@ -0,0 +1,44 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from tritonfrontend import AlreadyExistsError as AlreadyExistsError +from tritonfrontend import InternalError as InternalError +from tritonfrontend import InvalidArgumentError as InvalidArgumentError +from tritonfrontend import NotFoundError as NotFoundError +from tritonfrontend import TritonError as TritonError +from tritonfrontend import UnavailableError as UnavailableError +from tritonfrontend import UnknownError as UnknownError +from tritonfrontend import UnsupportedError as UnsupportedError + +class TritonFrontendGrpc: + def __init__(self, arg0: int, arg1: dict[str, bool | int | str]) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... + +class TritonFrontendHttp: + def __init__(self, arg0: int, arg1: dict[str, bool | int | str]) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... diff --git a/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc b/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc new file mode 100644 index 0000000000..86a0ac1c41 --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include + +#include "../../../grpc/grpc_server.h" +#include "../../../http_server.h" +#include "triton/core/tritonserver.h" +#include "tritonfrontend.h" + + +namespace py = pybind11; + +namespace triton { namespace server { namespace python { + + +PYBIND11_MODULE(tritonfrontend_bindings, m) +{ + m.doc() = "Python bindings for Triton Inference Server Frontend Endpoints"; + + auto tfe = py::register_exception(m, "TritonError"); + py::register_exception(m, "UnknownError", tfe.ptr()); + py::register_exception(m, "InternalError", tfe.ptr()); + py::register_exception(m, "NotFoundError", tfe.ptr()); + py::register_exception( + m, "InvalidArgumentError", tfe.ptr()); + py::register_exception(m, "UnavailableError", tfe.ptr()); + py::register_exception(m, "UnsupportedError", tfe.ptr()); + py::register_exception( + m, "AlreadyExistsError", tfe.ptr()); + + + py::class_>(m, "TritonFrontendHttp") + .def(py::init()) + .def("start", &TritonFrontend::StartService) + .def("stop", &TritonFrontend::StopService); + + py::class_>( + m, "TritonFrontendGrpc") + .def(py::init()) + .def( + "start", &TritonFrontend< + triton::server::grpc::Server, + triton::server::grpc::Server>::StartService) + .def( + "stop", &TritonFrontend< + triton::server::grpc::Server, + triton::server::grpc::Server>::StopService); +} + +}}} // namespace triton::server::python diff --git a/src/python/tritonfrontend/py.typed b/src/python/tritonfrontend/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/shared_memory_manager.cc b/src/shared_memory_manager.cc index 1f4a77e887..7b845709a1 100644 --- a/src/shared_memory_manager.cc +++ b/src/shared_memory_manager.cc @@ -69,7 +69,8 @@ TRITONSERVER_Error* SharedMemoryManager::GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id) + int64_t* device_id, + std::shared_ptr* shm_info) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_UNSUPPORTED, @@ -408,9 +409,9 @@ SharedMemoryManager::RegisterSystemSharedMemory( } shared_memory_map_.insert(std::make_pair( - name, std::unique_ptr(new SharedMemoryInfo( + name, std::make_shared( name, shm_key, offset, byte_size, shm_fd, mapped_addr, - TRITONSERVER_MEMORY_CPU, 0)))); + TRITONSERVER_MEMORY_CPU, 0))); return nullptr; // success } @@ -444,9 +445,9 @@ SharedMemoryManager::RegisterCUDASharedMemory( name, reinterpret_cast(mapped_addr), byte_size)); shared_memory_map_.insert(std::make_pair( - name, std::unique_ptr(new CUDASharedMemoryInfo( + name, std::make_shared( name, "", 0, byte_size, 0, mapped_addr, TRITONSERVER_MEMORY_GPU, - device_id, cuda_shm_handle)))); + device_id, cuda_shm_handle))); return nullptr; // success } @@ -456,7 +457,8 @@ 
TRITONSERVER_Error* SharedMemoryManager::GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id) + int64_t* device_id, + std::shared_ptr* shm_info) { // protect shared_memory_map_ from concurrent access std::lock_guard lock(mu_); @@ -494,6 +496,10 @@ SharedMemoryManager::GetMemoryInfo( .c_str()); } + if (shm_info != nullptr) { + *shm_info = std::static_pointer_cast(it->second); + } + if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) { *shm_mapped_addr = (void*)((uint8_t*)it->second->mapped_addr_ + it->second->offset_ + offset); @@ -561,11 +567,19 @@ SharedMemoryManager::GetStatus( } else { auto it = shared_memory_map_.find(name); if (it == shared_memory_map_.end()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - std::string( - "Unable to find system shared memory region: '" + name + "'") - .c_str()); + if (memory_type == TRITONSERVER_MEMORY_GPU) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + std::string( + "Unable to find cuda shared memory region: '" + name + "'") + .c_str()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + std::string( + "Unable to find system shared memory region: '" + name + "'") + .c_str()); + } } if (it->second->kind_ != memory_type) { @@ -632,6 +646,7 @@ SharedMemoryManager::UnregisterAll(TRITONSERVER_MemoryType memory_type) TRITONSERVER_Error* err = UnregisterHelper(it->first, memory_type); if (err != nullptr) { unregister_fails.push_back(it->first); + LOG_VERBOSE(1) << TRITONSERVER_ErrorMessage(err); } } } @@ -645,6 +660,7 @@ SharedMemoryManager::UnregisterAll(TRITONSERVER_MemoryType memory_type) ; if (err != nullptr) { unregister_fails.push_back(it->first); + LOG_VERBOSE(1) << TRITONSERVER_ErrorMessage(err); } } } @@ -669,6 +685,15 @@ SharedMemoryManager::UnregisterHelper( // Must hold the lock on register_mu_ while calling this function. auto it = shared_memory_map_.find(name); if (it != shared_memory_map_.end() && it->second->kind_ == memory_type) { + if (it->second.use_count() > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "Cannot unregister shared memory region '" + name + + "', it is currently in use.") + .c_str()); + } + if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) { RETURN_IF_ERR( UnmapSharedMemory(it->second->mapped_addr_, it->second->byte_size_)); diff --git a/src/shared_memory_manager.h b/src/shared_memory_manager.h index 51eb0f0786..393fd29128 100644 --- a/src/shared_memory_manager.h +++ b/src/shared_memory_manager.h @@ -1,4 +1,4 @@ -// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -50,6 +50,48 @@ class SharedMemoryManager { SharedMemoryManager() = default; ~SharedMemoryManager(); + /// A struct that records the shared memory regions registered by the shared + /// memory manager. 
+ struct SharedMemoryInfo { + SharedMemoryInfo( + const std::string& name, const std::string& shm_key, + const size_t offset, const size_t byte_size, int shm_fd, + void* mapped_addr, const TRITONSERVER_MemoryType kind, + const int64_t device_id) + : name_(name), shm_key_(shm_key), offset_(offset), + byte_size_(byte_size), shm_fd_(shm_fd), mapped_addr_(mapped_addr), + kind_(kind), device_id_(device_id) + { + } + + std::string name_; + std::string shm_key_; + size_t offset_; + size_t byte_size_; + int shm_fd_; + void* mapped_addr_; + TRITONSERVER_MemoryType kind_; + int64_t device_id_; + }; + +#ifdef TRITON_ENABLE_GPU + struct CUDASharedMemoryInfo : SharedMemoryInfo { + CUDASharedMemoryInfo( + const std::string& name, const std::string& shm_key, + const size_t offset, const size_t byte_size, int shm_fd, + void* mapped_addr, const TRITONSERVER_MemoryType kind, + const int64_t device_id, const cudaIpcMemHandle_t* cuda_ipc_handle) + : SharedMemoryInfo( + name, shm_key, offset, byte_size, shm_fd, mapped_addr, kind, + device_id), + cuda_ipc_handle_(*cuda_ipc_handle) + { + } + + cudaIpcMemHandle_t cuda_ipc_handle_; + }; +#endif + /// Add a shared memory block representing shared memory in system /// (CPU) memory to the manager. Return TRITONSERVER_ERROR_ALREADY_EXISTS /// if a shared memory block of the same name already exists in the manager. @@ -90,11 +132,18 @@ class SharedMemoryManager { /// \param memory_type Returns the type of the memory /// \param device_id Returns the device id associated with the /// memory block - /// \return a TRITONSERVER_Error indicating success or failure. + /// \param shm_info Returns a shared pointer reference(read-only) to the + /// shared memory block's information. + /// This pointer will automatically increase the usage count, preventing + /// unregistration while the reference is held. The reference must be cleared + /// or set to nullptr when no longer needed, to decrease the count and allow + /// unregistration. + /// \return a TRITONSERVER_Error indicating success or + /// failure. TRITONSERVER_Error* GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id); + int64_t* device_id, std::shared_ptr* shm_info); #ifdef TRITON_ENABLE_GPU /// Get the CUDA memory handle associated with the block name. @@ -139,50 +188,8 @@ class SharedMemoryManager { TRITONSERVER_Error* UnregisterHelper( const std::string& name, TRITONSERVER_MemoryType memory_type); - /// A struct that records the shared memory regions registered by the shared - /// memory manager. 
- struct SharedMemoryInfo { - SharedMemoryInfo( - const std::string& name, const std::string& shm_key, - const size_t offset, const size_t byte_size, int shm_fd, - void* mapped_addr, const TRITONSERVER_MemoryType kind, - const int64_t device_id) - : name_(name), shm_key_(shm_key), offset_(offset), - byte_size_(byte_size), shm_fd_(shm_fd), mapped_addr_(mapped_addr), - kind_(kind), device_id_(device_id) - { - } - - std::string name_; - std::string shm_key_; - size_t offset_; - size_t byte_size_; - int shm_fd_; - void* mapped_addr_; - TRITONSERVER_MemoryType kind_; - int64_t device_id_; - }; - -#ifdef TRITON_ENABLE_GPU - struct CUDASharedMemoryInfo : SharedMemoryInfo { - CUDASharedMemoryInfo( - const std::string& name, const std::string& shm_key, - const size_t offset, const size_t byte_size, int shm_fd, - void* mapped_addr, const TRITONSERVER_MemoryType kind, - const int64_t device_id, const cudaIpcMemHandle_t* cuda_ipc_handle) - : SharedMemoryInfo( - name, shm_key, offset, byte_size, shm_fd, mapped_addr, kind, - device_id), - cuda_ipc_handle_(*cuda_ipc_handle) - { - } - - cudaIpcMemHandle_t cuda_ipc_handle_; - }; -#endif - using SharedMemoryStateMap = - std::map>; + std::map>; // A map between the name and the details of the associated // shared memory block SharedMemoryStateMap shared_memory_map_; diff --git a/tools/add_copyright.py b/tools/add_copyright.py new file mode 100644 index 0000000000..34432bb0c6 --- /dev/null +++ b/tools/add_copyright.py @@ -0,0 +1,365 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import argparse +import os +import re +import subprocess +import sys +from datetime import datetime +from typing import Callable, Dict, Optional, Sequence + +current_year = str(datetime.now().year) + +ROOT_DIR = os.path.join(os.path.dirname(__file__), os.path.pardir) + +LICENSE_PATH = os.path.join(ROOT_DIR, "LICENSE") + +COPYRIGHT_YEAR_PAT = re.compile( + r"Copyright( \(c\))? 
(\d{4})?-?(\d{4}), NVIDIA CORPORATION" +) + + +def has_copyright(content: str) -> bool: + return COPYRIGHT_YEAR_PAT.search(content) + + +def update_copyright_year( + path: str, content: Optional[str] = None, disallow_range: bool = False +) -> str: + """ + Updates the copyright year in the provided file. + If the copyright is not present in the file, this function has no effect. + """ + if content is None: + with open(path, "r") as f: + content = f.read() + + match = COPYRIGHT_YEAR_PAT.search(content) + min_year = match.groups()[1] or match.groups()[2] + + new_copyright = f"Copyright{match.groups()[0] or ''} " + if min_year < current_year and not disallow_range: + new_copyright += f"{min_year}-{current_year}" + else: + new_copyright += f"{current_year}" + new_copyright += ", NVIDIA CORPORATION" + + updated_content = COPYRIGHT_YEAR_PAT.sub(new_copyright, content) + + if content != updated_content: + with open(path, "w") as f: + f.write(updated_content) + + +def update_and_get_license() -> str: + """ + Updates the copyright year in the LICENSE file if necessary and then + returns its contents. + """ + # TODO: Check if this is right - if the license file needs to have a range, + # we need to remove the range before returning the license text. + # + # License file should always have the current year. + update_copyright_year(LICENSE_PATH, disallow_range=True) + + with open(LICENSE_PATH, "r") as license_file: + return license_file.read() + + +LICENSE_TEXT = update_and_get_license() + +# +# Header manipulation helpers +# + + +def prefix_lines(content: str, prefix: str) -> str: + # NOTE: This could have been done via `textwrap.indent`, but we're not actually indenting, + # so it seems semantically wrong to do that. + return prefix + f"\n{prefix}".join(content.splitlines()) + + +def insert_after(regex: str) -> Callable[[str], str]: + """ + Builds a callback that will insert a provided header after + the specified regular expression. If the expression is not + found in the file contents, the header will be inserted at the + beginning of the file. + + Args: + regex: The regular expression to match. + + Returns: + A callable that can be used as the `add_header` argument to `update_or_add_header`. + """ + + def add_header(header: str, content: str) -> str: + match = re.match(regex, content) + + if match is None: + return header + "\n" + content + + insertion_point = match.span()[-1] + + return content[:insertion_point] + f"{header}\n" + content[insertion_point:] + + return add_header + + +def update_or_add_header( + path: str, header: str, add_header: Optional[Callable[[str, str], str]] = None +): + """ + Updates in place or adds a new copyright header to the specified file. + + Args: + path: The path of the file. + header: The contents of the copyright header. + add_header: A callback that receives the copyright header and file contents and + controls how the contents of the file are updated. By default, the copyright + header is prepended to the file. + """ + with open(path, "r") as f: + content = f.read() + + if has_copyright(content): + update_copyright_year(path, content) + return + + add_header = add_header or (lambda header, content: header + "\n" + content) + + content = add_header(header, content) + + # As a sanity check, make sure we didn't accidentally add the copyright header + # twice, or add a new header when one was already present. 
+    if content.count("Copyright (c)") != 1:
+        print(
+            f"WARNING: Something went wrong while processing: {path}!\n"
+            "Please check if the copyright header was included twice or wasn't added at all. "
+        )
+
+    with open(path, "w") as f:
+        f.write(content)
+
+
+# Each file type requires slightly different handling when inserting the copyright
+# header. For example, for C++ files, the header must be prefixed with `//` and for
+# shell scripts, it must be prefixed with `#` and must be inserted *after* the shebang.
+#
+# This mapping stores callables that return whether a handler wants to process a specified
+# file based on the path along with callables that will accept the file path and update
+# it with the copyright header.
+FILE_TYPE_HANDLERS: Dict[Callable[[str], bool], Callable[[str], None]] = {}
+
+
+#
+# Path matching callables
+# These allow registered functions to more easily specify what kinds of
+# paths they should be applied to.
+#
+def has_ext(exts: Sequence[str]):
+    def has_ext_impl(path: str):
+        _, ext = os.path.splitext(path)
+        return ext in exts
+
+    return has_ext_impl
+
+
+def basename_is(expected_path: str):
+    return lambda path: os.path.basename(path) == expected_path
+
+
+def path_contains(expected: str):
+    return lambda path: expected in path
+
+
+def any_of(*funcs: Sequence[Callable[[str], bool]]):
+    return lambda path: any(func(path) for func in funcs)
+
+
+#
+# File handlers for different types of files.
+# Many types of files require very similar handling - those are combined where possible.
+#
+
+
+def register(match: Callable[[str], bool]):
+    def register_impl(func):
+        FILE_TYPE_HANDLERS[match] = func
+        return func
+
+    return register_impl
+
+
+@register(
+    any_of(
+        has_ext([".py", ".sh", ".bash", ".yaml", ".pbtxt"]),
+        basename_is("CMakeLists.txt"),
+        path_contains("Dockerfile"),
+    )
+)
+def py_or_shell_like(path):
+    update_or_add_header(
+        path,
+        prefix_lines(LICENSE_TEXT, "# "),
+        # Insert the header *after* the shebang.
+        # NOTE: This could break if there is a shebang-like pattern elsewhere in the file.
+        # In that case, this could be edited to check only the first line of the file (after removing whitespace).
+        insert_after(r"#!(.*)\n"),
+    )
+
+
+@register(has_ext([".cc", ".h"]))
+def cpp(path):
+    update_or_add_header(path, prefix_lines(LICENSE_TEXT, "// "))
+
+
+@register(has_ext([".tpl"]))
+def tpl(path):
+    update_or_add_header(path, "{{/*\n" + prefix_lines(LICENSE_TEXT, "# ") + "\n*/}}")
+
+
+@register(has_ext([".html", ".md"]))
+def html_md(path):
+    update_or_add_header(path, "<!--\n" + LICENSE_TEXT + "\n-->")
+
+
+def add_copyrights(paths):
+    for path in paths:
+        for match, handler in FILE_TYPE_HANDLERS.items():
+            if match(path):
+                handler(path)
+                break
+        else:
+            print(
+                f"WARNING: No handler registered for file: {path}. Please add a new handler to {__file__}!"
+            )
+
+    subprocess.run(["git", "add"] + paths)
+
+    print(f"Processed copyright headers for {len(paths)} file(s).")
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Adds copyright headers to source files"
+    )
+    parser.add_argument("files", nargs="*")
+
+    args, _ = parser.parse_known_args()
+    add_copyrights(args.files)
+    return 0
+
+
+if __name__ == "__main__":
+    # sys.exit is important here to avoid the test-related imports below during normal execution.
+    sys.exit(main())
+
+
+#
+# Integration Tests
+#
+import tempfile
+
+import pytest
+
+
+# Processes provided text through the copyright hook by writing it to a temporary file.
+def process_text(content, extension): + with tempfile.NamedTemporaryFile("w+", suffix=extension) as f: + f.write(content) + f.flush() + + add_copyrights([f.name]) + + f.seek(0) + return f.read() + + +# We use this slightly weird hack to make sure the copyright hook does not do a text replacement +# of the parameters in the test, since they look exactly like copyright headers. +def make_copyright_text(text): + return f"Copyright {text}" + + +@pytest.mark.parametrize( + "content, expected", + [ + # Convert to range if the year that's already present is older than the current year. + ( + make_copyright_text("(c) 2018, NVIDIA CORPORATION"), + make_copyright_text(f"(c) 2018-{current_year}, NVIDIA CORPORATION"), + ), + ( + make_copyright_text("2018, NVIDIA CORPORATION"), + make_copyright_text(f"2018-{current_year}, NVIDIA CORPORATION"), + ), + # No effect if the year is current: + ( + make_copyright_text(f"(c) {current_year}, NVIDIA CORPORATION"), + make_copyright_text(f"(c) {current_year}, NVIDIA CORPORATION"), + ), + ( + make_copyright_text(f"{current_year}, NVIDIA CORPORATION"), + make_copyright_text(f"{current_year}, NVIDIA CORPORATION"), + ), + # If there is already a range, update the upper bound of the range: + ( + make_copyright_text("(c) 2018-2023, NVIDIA CORPORATION"), + make_copyright_text(f"(c) 2018-{current_year}, NVIDIA CORPORATION"), + ), + ], +) +def test_copyright_update(content, expected): + # We don't really care about the extension here - just needs to be something the hook will recognize. + assert process_text(content, ".py") == expected + + +@pytest.mark.parametrize( + "content, extension, expected", + [ + ("", ".cc", f"// {make_copyright_text(f'(c) {current_year}')}"), + ("", ".h", f"// {make_copyright_text(f'(c) {current_year}')}"), + ("", ".py", f"# {make_copyright_text(f'(c) {current_year}')}"), + ("", ".sh", f"# {make_copyright_text(f'(c) {current_year}')}"), + # Make sure copyright comes after shebangs + ( + "#!/bin/python\n", + ".py", + f"#!/bin/python\n# {make_copyright_text(f'(c) {current_year}')}", + ), + ( + "#!/bin/bash\n", + ".sh", + f"#!/bin/bash\n# {make_copyright_text(f'(c) {current_year}')}", + ), + ], +) +def test_adding_new_copyrights(content, extension, expected): + assert process_text(content, extension).startswith(expected) + + +def test_license_has_no_range(): + assert LICENSE_TEXT.startswith(f"Copyright (c) {current_year},")
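
Note: the FILE_TYPE_HANDLERS registry in tools/add_copyright.py is intended to be
extended when new file types appear in the repository (the script itself warns
"Please add a new handler" for unrecognized paths). As a minimal sketch, not part of
the patch above, a handler for hypothetical ".toml"/".cfg" files could be added next
to the existing handlers, reusing the names the script already defines (register,
has_ext, update_or_add_header, prefix_lines, LICENSE_TEXT):

    # Hypothetical handler, written as it would appear inside tools/add_copyright.py.
    @register(has_ext([".toml", ".cfg"]))
    def toml_cfg(path):
        # Both formats take "#"-style line comments, so the license text is prefixed
        # the same way as for Python and shell files.
        update_or_add_header(path, prefix_lines(LICENSE_TEXT, "# "))

Because add_copyrights() dispatches to the first matcher that claims a path, a new
handler only needs to name extensions that no earlier matcher already covers.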
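
Note: the new tooling can also be exercised outside of pre-commit. The sketch below is
illustrative only: the file names are placeholders, it assumes it is run from the root
of a git checkout, and it assumes pytest is installed (the integration tests live at the
bottom of tools/add_copyright.py itself).

    # Illustrative driver, not part of the patch above.
    import subprocess
    import sys

    # Pass the files to update as positional arguments, the same way pre-commit
    # forwards staged file names to the hook's entry point.
    subprocess.run(
        [sys.executable, "tools/add_copyright.py", "example.py", "example.cc"],
        check=True,
    )

    # Run the pytest-based integration tests embedded in the script.
    subprocess.run(
        [sys.executable, "-m", "pytest", "tools/add_copyright.py"],
        check=True,
    )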