diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f44f815351..663a36d631 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -72,3 +72,13 @@ repos: - id: mixed-line-ending - id: requirements-txt-fixer - id: trailing-whitespace + +- repo: local + hooks: + - id: add-license + name: Add License + entry: python tools/add_copyright.py + language: python + stages: [pre-commit] + verbose: true + require_serial: true diff --git a/CMakeLists.txt b/CMakeLists.txt index ff578c9724..56cb346dc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,17 +125,13 @@ FetchContent_Declare( # Some libs are installed to ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib64 instead # of ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib on Centos -set (LIB_DIR "lib") -# /etc/os-release does not exist on Windows -if(EXISTS "/etc/os-release") - file(STRINGS /etc/os-release DISTRO REGEX "^NAME=") - string(REGEX REPLACE "NAME=\"(.*)\"" "\\1" DISTRO "${DISTRO}") - message(STATUS "Distro Name: ${DISTRO}") - if(DISTRO MATCHES "CentOS.*") +set(LIB_DIR "lib") +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") set (LIB_DIR "lib64") - endif() -endif() - + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) set(TRITON_CORE_HEADERS_ONLY OFF) FetchContent_MakeAvailable(repo-third-party repo-core) diff --git a/Dockerfile.QA b/Dockerfile.QA index 2c43f735a5..68ab519b41 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -267,6 +267,12 @@ RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \ cp /workspace/tritonbuild/python/examples/decoupled/square_config.pbtxt \ qa/L0_decoupled/python_models/square_int32/. +RUN mkdir -p qa/L0_decoupled_grpc_error && \ + cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error + +RUN mkdir -p qa/L0_grpc_error_state_cleanup && \ + cp -r qa/L0_grpc_state_cleanup/. qa/L0_grpc_error_state_cleanup + RUN mkdir -p qa/L0_repoagent_checksum/models/identity_int32/1 && \ cp tritonbuild/identity/install/backends/identity/libtriton_identity.so \ qa/L0_repoagent_checksum/models/identity_int32/1/. @@ -384,6 +390,10 @@ RUN rm -fr qa/L0_copyrights qa/L0_build_variants && \ RUN find qa/pkgs/ -maxdepth 1 -type f -name \ "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade {}[all] +# Install Triton Frontend Python API +RUN find qa/pkgs/ -type f -name \ + "tritonfrontend-*.whl" | xargs -I {} pip3 install --upgrade {}[all] + ENV LD_LIBRARY_PATH /opt/tritonserver/qa/clients:${LD_LIBRARY_PATH} # DLIS-3631: Needed to run Perf Analyzer CI tests correctly diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 1524b5ead3..5ddaf7274f 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -29,10 +29,11 @@ # # Base image on the minimum Triton container -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-py3-min +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.09-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo +ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server ARG TRITON_COMMON_REPO_TAG=main ARG TRITON_CORE_REPO_TAG=main ARG TRITON_CLIENT_REPO_TAG=main @@ -217,6 +218,7 @@ WORKDIR /workspace COPY TRITON_VERSION . 
COPY NVIDIA_Deep_Learning_Container_License.pdf . COPY --from=sdk_build /workspace/client/ client/ +COPY --from=sdk_build /workspace/perf_analyzer/ perf_analyzer/ COPY --from=sdk_build /workspace/install/ install/ RUN cd install && \ export VERSION=`cat /workspace/TRITON_VERSION` && \ diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min index 0a554fbcf4..dec972eaf3 100644 --- a/Dockerfile.win10.min +++ b/Dockerfile.win10.min @@ -37,9 +37,9 @@ RUN choco install unzip -y # # Installing TensorRT # -ARG TENSORRT_VERSION=10.2.0.19 -ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.5.zip" -ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/zip/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5.zip +ARG TENSORRT_VERSION=10.4.0.26 +ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows.win10.cuda-12.6.zip" +ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip # COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP} ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP} RUN unzip /tmp/%TENSORRT_ZIP% @@ -51,9 +51,9 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" # # Installing cuDNN # -ARG CUDNN_VERSION=9.2.1.18 +ARG CUDNN_VERSION=9.4.0.58 ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip -ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.2.1.18_cuda12-archive.zip +ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.4.0.58_cuda12-archive.zip ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} RUN unzip /tmp/%CUDNN_ZIP% RUN move cudnn-* cudnn @@ -101,14 +101,14 @@ LABEL CMAKE_VERSION=${CMAKE_VERSION} # # Installing Visual Studio BuildTools: VS17 2022 # -ARG BUILDTOOLS_VERSION=17.9.34622.214 +ARG BUILDTOOLS_VERSION=17.10.35201.131 # Download collect.exe in case of an install failure. ADD https://aka.ms/vscollect.exe "C:\tmp\collect.exe" # Use the latest release channel. For more control, specify the location of an internal layout. # Download the Build Tools bootstrapper. # ARG BUILD_TOOLS_SOURCE=https://aka.ms/vs/17/release/vs_buildtools.exe -ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/5e7b923b-7d89-4e14-95b8-a84ab168e243/96b21d216c7954aaf606c6d7ba59a3de991884a8a86c578c767ba349c23188a9/vs_BuildTools.exe +ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/28626b4b-f88f-4b55-a0cf-f3eaa2c643fb/e6c43d4dfb36338d954cdb3ad9010ab2a479e712088f4f6b016eadcc721bab28/vs_BuildTools.exe ADD ${BUILD_TOOLS_SOURCE} vs_buildtools.exe # Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools workload, including recommended. ARG VS_INSTALL_PATH_WP="C:\BuildTools" @@ -175,7 +175,7 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" -ARG CUDNN_VERSION=9.2.1.18 +ARG CUDNN_VERSION=9.4.0.58 ENV CUDNN_VERSION ${CUDNN_VERSION} COPY --from=dependency_base /cudnn /cudnn RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." @@ -183,7 +183,7 @@ RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\." 
LABEL CUDNN_VERSION="${CUDNN_VERSION}" -ARG TENSORRT_VERSION=10.2.0.19 +ARG TENSORRT_VERSION=10.4.0.26 ENV TRT_VERSION ${TENSORRT_VERSION} COPY --from=dependency_base /TensorRT /TensorRT RUN setx PATH "c:\TensorRT\lib;%PATH%" diff --git a/LICENSE b/LICENSE index 5529809efc..914565ec7d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/README.md b/README.md index 17628b4f03..36ef51f279 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,10 @@ [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) -[!WARNING] - -##### LATEST RELEASE -You are currently on the `main` branch which tracks under-development progress towards the next release. -The current release is version [2.48.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.07 container release on NVIDIA GPU Cloud (NGC). +>[!WARNING] +>You are currently on the `main` branch which tracks under-development progress +>towards the next release. The current release is version [2.50.0](https://github.com/triton-inference-server/server/releases/latest) +>and corresponds to the 24.09 container release on NVIDIA GPU Cloud (NGC). Triton Inference Server is an open source inference serving software that streamlines AI inferencing. Triton enables teams to deploy any AI model from @@ -92,16 +91,16 @@ Inference Server with the ```bash # Step 1: Create the example model repository -git clone -b r24.07 https://github.com/triton-inference-server/server.git +git clone -b r24.09 https://github.com/triton-inference-server/server.git cd server/docs/examples ./fetch_models.sh # Step 2: Launch triton from the NGC Triton container -docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.07-py3 tritonserver --model-repository=/models +docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.09-py3 tritonserver --model-repository=/models # Step 3: Sending an Inference Request # In a separate console, launch the image_client example from the NGC Triton SDK container -docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.07-py3-sdk +docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.09-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg # Inference should return the following @@ -179,7 +178,7 @@ configuration](docs/user_guide/model_configuration.md) for the model. [Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md) to learn which backends are supported on your target platform. 
- Learn how to [optimize performance](docs/user_guide/optimization.md) using the - [Performance Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) and [Model Analyzer](https://github.com/triton-inference-server/model_analyzer) - Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in diff --git a/TRITON_VERSION b/TRITON_VERSION index 37433781ef..124ddb483d 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.49.0dev \ No newline at end of file +2.51.0dev \ No newline at end of file diff --git a/build.py b/build.py index 6ab8a58515..14301f843d 100755 --- a/build.py +++ b/build.py @@ -37,6 +37,7 @@ import sys from inspect import getsourcefile +import distro import requests # @@ -69,10 +70,10 @@ # incorrectly load the other version of the openvino libraries. # TRITON_VERSION_MAP = { - "2.49.0dev": ( - "24.08dev", # triton container - "24.07", # upstream container - "1.18.1", # ORT + "2.51.0dev": ( + "24.10dev", # triton container + "24.09", # upstream container + "1.19.2", # ORT "2024.0.0", # ORT OpenVINO "2024.0.0", # Standalone OpenVINO "3.2.6", # DCGM version @@ -115,13 +116,25 @@ def fail_if(p, msg): def target_platform(): - if FLAGS.target_platform is not None: + # When called by compose.py, FLAGS will be None + if FLAGS and FLAGS.target_platform is not None: return FLAGS.target_platform - return platform.system().lower() + platform_string = platform.system().lower() + if platform_string == "linux": + # Need to inspect the /etc/os-release file to get + # the distribution of linux + id_like_list = distro.like().split() + if "debian" in id_like_list: + return "linux" + else: + return "rhel" + else: + return platform_string def target_machine(): - if FLAGS.target_machine is not None: + # When called by compose.py, FLAGS will be None + if FLAGS and FLAGS.target_machine is not None: return FLAGS.target_machine return platform.machine().lower() @@ -203,6 +216,8 @@ def header(self, desc=None): self.comment("Exit script immediately if any command fails") if target_platform() == "windows": + self._file.write("$UseStructuredOutput = $false\n") + self.blankln() self._file.write("function ExitWithCode($exitcode) {\n") self._file.write(" $host.SetShouldExit($exitcode)\n") self._file.write(" exit $exitcode\n") @@ -628,13 +643,16 @@ def pytorch_cmake_args(images): cmake_backend_arg("pytorch", "TRITON_PYTORCH_DOCKER_IMAGE", None, image), ] - if FLAGS.enable_gpu: + # TODO: TPRD-372 TorchTRT extension is not currently supported by our manylinux build + # TODO: TPRD-373 NVTX extension is not currently supported by our manylinux build + if target_platform() != "rhel": + if FLAGS.enable_gpu: + cargs.append( + cmake_backend_enable("pytorch", "TRITON_PYTORCH_ENABLE_TORCHTRT", True) + ) cargs.append( - cmake_backend_enable("pytorch", "TRITON_PYTORCH_ENABLE_TORCHTRT", True) + cmake_backend_enable("pytorch", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx) ) - cargs.append( - cmake_backend_enable("pytorch", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx) - ) return cargs @@ -644,12 +662,15 @@ def onnxruntime_cmake_args(images, library_paths): "onnxruntime", "TRITON_BUILD_ONNXRUNTIME_VERSION", None, - TRITON_VERSION_MAP[FLAGS.version][2], + os.getenv("TRITON_BUILD_ONNXRUNTIME_VERSION") + if os.getenv("TRITON_BUILD_ONNXRUNTIME_VERSION") + else TRITON_VERSION_MAP[FLAGS.version][2], ) ] # TRITON_ENABLE_GPU is already set for all 
backends in backend_cmake_args() - if FLAGS.enable_gpu: + # TODO: TPRD-334 TensorRT extension is not currently supported by our manylinux build + if FLAGS.enable_gpu and target_platform() != "rhel": cargs.append( cmake_backend_enable( "onnxruntime", "TRITON_ENABLE_ONNXRUNTIME_TENSORRT", True @@ -680,8 +701,11 @@ def onnxruntime_cmake_args(images, library_paths): ) ) - if (target_machine() != "aarch64") and ( - TRITON_VERSION_MAP[FLAGS.version][3] is not None + # TODO: TPRD-333 OpenVino extension is not currently supported by our manylinux build + if ( + (target_machine() != "aarch64") + and (target_platform() != "rhel") + and (TRITON_VERSION_MAP[FLAGS.version][3] is not None) ): cargs.append( cmake_backend_enable( @@ -697,7 +721,7 @@ def onnxruntime_cmake_args(images, library_paths): ) ) - if target_platform() == "igpu": + if (target_platform() == "igpu") or (target_platform() == "rhel"): cargs.append( cmake_backend_arg( "onnxruntime", @@ -833,8 +857,31 @@ def install_dcgm_libraries(dcgm_version, target_machine): ) return "" else: - if target_machine == "aarch64": - return """ + # RHEL has the same install instructions for both aarch64 and x86 + if target_platform() == "rhel": + if target_machine == "aarch64": + return """ +ENV DCGM_VERSION {} +# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\ + && dnf clean expire-cache \\ + && dnf install -y datacenter-gpu-manager-{} +""".format( + dcgm_version, dcgm_version + ) + else: + return """ +ENV DCGM_VERSION {} +# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\ + && dnf clean expire-cache \\ + && dnf install -y datacenter-gpu-manager-{} +""".format( + dcgm_version, dcgm_version + ) + else: + if target_machine == "aarch64": + return """ ENV DCGM_VERSION {} # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads RUN curl -o /tmp/cuda-keyring.deb \\ @@ -844,10 +891,10 @@ def install_dcgm_libraries(dcgm_version, target_machine): && apt-get update \\ && apt-get install -y datacenter-gpu-manager=1:{} """.format( - dcgm_version, dcgm_version - ) - else: - return """ + dcgm_version, dcgm_version + ) + else: + return """ ENV DCGM_VERSION {} # Install DCGM. 
Steps from https://developer.nvidia.com/dcgm#Downloads RUN curl -o /tmp/cuda-keyring.deb \\ @@ -857,8 +904,106 @@ def install_dcgm_libraries(dcgm_version, target_machine): && apt-get update \\ && apt-get install -y datacenter-gpu-manager=1:{} """.format( - dcgm_version, dcgm_version - ) + dcgm_version, dcgm_version + ) + + +def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap): + df = """ +ARG TRITON_VERSION={} +ARG TRITON_CONTAINER_VERSION={} +ARG BASE_IMAGE={} +""".format( + argmap["TRITON_VERSION"], + argmap["TRITON_CONTAINER_VERSION"], + argmap["BASE_IMAGE"], + ) + + df += """ +FROM ${BASE_IMAGE} + +ARG TRITON_VERSION +ARG TRITON_CONTAINER_VERSION +""" + df += """ +# Install docker docker buildx +RUN yum install -y ca-certificates curl gnupg yum-utils \\ + && yum-config-manager --add-repo https://download.docker.com/linux/rhel/docker-ce.repo \\ + && yum install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +# && yum install -y docker.io docker-buildx-plugin + +# libcurl4-openSSL-dev is needed for GCS +# python3-dev is needed by Torchvision +# python3-pip and libarchive-dev is needed by python backend +# libxml2-dev is needed for Azure Storage +# scons is needed for armnn_tflite backend build dep +RUN yum install -y \\ + ca-certificates \\ + autoconf \\ + automake \\ + git \\ + gperf \\ + re2-devel \\ + openssl-devel \\ + libtool \\ + libcurl-devel \\ + libb64-devel \\ + gperftools-devel \\ + patchelf \\ + python3.11-devel \\ + python3-pip \\ + python3-setuptools \\ + rapidjson-devel \\ + python3-scons \\ + pkg-config \\ + unzip \\ + wget \\ + zlib-devel \\ + libarchive-devel \\ + libxml2-devel \\ + numactl-devel \\ + wget + +RUN pip3 install --upgrade pip \\ + && pip3 install --upgrade \\ + wheel \\ + setuptools \\ + docker \\ + virtualenv + +# Install boost version >= 1.78 for boost::span +# Current libboost-dev apt packages are < 1.78, so install from tar.gz +RUN wget -O /tmp/boost.tar.gz \\ + https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz \\ + && (cd /tmp && tar xzf boost.tar.gz) \\ + && mv /tmp/boost_1_80_0/boost /usr/include/boost + +# Server build requires recent version of CMake (FetchContent required) +# Might not need this if the installed version of cmake is high enough for our build. +# RUN apt update -q=2 \\ +# && apt install -y gpg wget \\ +# && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \\ +# && . /etc/os-release \\ +# && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \\ +# && apt-get update -q=2 \\ +# && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* +""" + if FLAGS.enable_gpu: + df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine()) + df += """ +ENV TRITON_SERVER_VERSION ${TRITON_VERSION} +ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION} +""" + + df += """ +WORKDIR /workspace +RUN rm -fr * +COPY . . 
+ENTRYPOINT [] +""" + + with open(os.path.join(ddir, dockerfile_name), "w") as dfile: + dfile.write(df) def create_dockerfile_buildbase(ddir, dockerfile_name, argmap): @@ -1161,7 +1306,28 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach fi \\ && [ `id -u $TRITON_SERVER_USER` -eq 1000 ] \\ && [ `id -g $TRITON_SERVER_USER` -eq 1000 ] +""".format( + gpu_enabled=gpu_enabled + ) + if target_platform() == "rhel": + df += """ +# Common dependencies. +RUN yum install -y \\ + git \\ + gperf \\ + re2-devel \\ + openssl-devel \\ + libtool \\ + libcurl-devel \\ + libb64-devel \\ + gperftools-devel \\ + patchelf \\ + wget \\ + numactl-devel +""" + else: + df += """ # Ensure apt-get won't prompt for selecting options ENV DEBIAN_FRONTEND=noninteractive @@ -1184,12 +1350,14 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach wget \\ {backend_dependencies} \\ && rm -rf /var/lib/apt/lists/* +""".format( + backend_dependencies=backend_dependencies + ) + df += """ # Set TCMALLOC_RELEASE_RATE for users setting LD_PRELOAD with tcmalloc ENV TCMALLOC_RELEASE_RATE 200 -""".format( - gpu_enabled=gpu_enabled, backend_dependencies=backend_dependencies - ) +""" if "fastertransformer" in backends: be = "fastertransformer" @@ -1206,12 +1374,15 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach if enable_gpu: df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine) - df += """ + # This segment will break the RHEL SBSA build. Need to determine whether + # this is necessary to incorporate. + if target_platform() != "rhel": + df += """ # Extra defensive wiring for CUDA Compat lib RUN ln -sf ${_CUDA_COMPAT_PATH}/lib.real ${_CUDA_COMPAT_PATH}/lib \\ - && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\ - && ldconfig \\ - && rm -f ${_CUDA_COMPAT_PATH}/lib + && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\ + && ldconfig \\ + && rm -f ${_CUDA_COMPAT_PATH}/lib """ else: df += add_cpu_libs_to_linux_dockerfile(backends, target_machine) @@ -1433,9 +1604,14 @@ def create_build_dockerfiles( ) dockerfileargmap["GPU_BASE_IMAGE"] = gpu_base_image - create_dockerfile_buildbase( - FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap - ) + if target_platform() == "rhel": + create_dockerfile_buildbase_rhel( + FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap + ) + else: + create_dockerfile_buildbase( + FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap + ) if target_platform() == "windows": create_dockerfile_windows( @@ -1647,6 +1823,21 @@ def core_build( os.path.join(repo_install_dir, "bin", "tritonserver.dll"), os.path.join(install_dir, "bin"), ) + cmake_script.cp( + os.path.join(repo_install_dir, "lib", "tritonserver.lib"), + os.path.join(install_dir, "bin"), + ) + elif target_platform() == "rhel": + cmake_script.mkdir(os.path.join(install_dir, "bin")) + cmake_script.cp( + os.path.join(repo_install_dir, "bin", "tritonserver"), + os.path.join(install_dir, "bin"), + ) + cmake_script.mkdir(os.path.join(install_dir, "lib64")) + cmake_script.cp( + os.path.join(repo_install_dir, "lib64", "libtritonserver.so"), + os.path.join(install_dir, "lib64"), + ) else: cmake_script.mkdir(os.path.join(install_dir, "bin")) cmake_script.cp( @@ -1658,11 +1849,11 @@ def core_build( os.path.join(repo_install_dir, "lib", "libtritonserver.so"), os.path.join(install_dir, "lib"), ) - # [FIXME] Placing the Triton server wheel file in 'python' for now, should - have been upload to pip 
registry and be able to install directly + # [FIXME] Placing the tritonserver and tritonfrontend wheel files in 'python' for now, + # should be uploaded to pip registry to be able to install directly cmake_script.mkdir(os.path.join(install_dir, "python")) cmake_script.cp( - os.path.join(repo_install_dir, "python", "tritonserver*.whl"), + os.path.join(repo_install_dir, "python", "triton*.whl"), os.path.join(install_dir, "python"), ) @@ -1802,6 +1993,10 @@ def backend_clone( os.path.join(build_dir, be, "src", "model.py"), backend_dir, ) + clone_script.cpdir( + os.path.join(build_dir, be, "src", "utils"), + backend_dir, + ) clone_script.comment() clone_script.comment(f"end '{be}' backend") @@ -2120,7 +2315,7 @@ def enable_all(): "--target-platform", required=False, default=None, - help='Target platform for build, can be "linux", "windows" or "igpu". If not specified, build targets the current platform.', + help='Target platform for build, can be "linux", "rhel", "windows" or "igpu". If not specified, build targets the current platform.', ) parser.add_argument( "--target-machine", diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml index 98151829c7..bd8ae0fe3b 100644 --- a/deploy/aws/values.yaml +++ b/deploy/aws/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml index 340e19fb50..8feee92b3c 100644 --- a/deploy/fleetcommand/Chart.yaml +++ b/deploy/fleetcommand/Chart.yaml @@ -26,7 +26,7 @@ apiVersion: v1 # appVersion is the Triton version; update when changing release -appVersion: "2.48.0" +appVersion: "2.50.0" description: Triton Inference Server (Fleet Command) name: triton-inference-server # version is the Chart version; update when changing anything in the chart diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml index 7a556ef7df..dc5f37ca3b 100644 --- a/deploy/fleetcommand/values.yaml +++ b/deploy/fleetcommand/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent numGpus: 1 serverCommand: tritonserver @@ -47,13 +47,13 @@ image: # # To set model control mode, uncomment and configure below # TODO: Fix the following url, it is invalid - # See https://github.com/triton-inference-server/server/blob/r24.07/docs/model_management.md + # See https://github.com/triton-inference-server/server/blob/r24.09/docs/model_management.md # for more details #- --model-control-mode=explicit|poll|none # # Additional server args # - # see https://github.com/triton-inference-server/server/blob/r24.07/README.md + # see https://github.com/triton-inference-server/server/blob/r24.09/README.md # for more details service: diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml index 937acc6b80..c5427c151e 100644 --- a/deploy/gcp/values.yaml +++ b/deploy/gcp/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.07-py3 + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 pullPolicy: IfNotPresent modelRepositoryPath: gs://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/gke-marketplace-app/README.md b/deploy/gke-marketplace-app/README.md index e99b9efbae..595d4634ab 100644 --- 
a/deploy/gke-marketplace-app/README.md +++ b/deploy/gke-marketplace-app/README.md @@ -1,5 +1,5 @@ +### Triton Server (tritonfrontend) Bindings (Beta) + +The `tritonfrontend` python package is a set of bindings to Triton's existing +frontends implemented in C++. Currently, `tritonfrontend` supports starting up +`KServeHttp` and `KServeGrpc` frontends. These bindings used in-combination +with Triton's Python In-Process API +([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) +and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library) +extend the ability to use Triton's full feature set with a few lines of Python. + +Let us walk through a simple example: +1. First we need to load the desired models and start the server with `tritonserver`. +```python +import tritonserver + +# Constructing path to Model Repository +model_path = f"server/src/python/examples/example_model_repository" + +server_options = tritonserver.Options( + server_id="ExampleServer", + model_repository=model_path, + log_error=True, + log_warn=True, + log_info=True, +) +server = tritonserver.Server(server_options).start(wait_until_ready=True) +``` +Note: `model_path` may need to be edited depending on your setup. + + +2. Now, to start up the respective services with `tritonfrontend` +```python +from tritonfrontend import KServeHttp, KServeGrpc +http_options = KServeHttp.Options(thread_count=5) +http_service = KServeHttp(server, http_options) +http_service.start() + +# Default options (if none provided) +grpc_service = KServeGrpc(server) +grpc_service.start() +``` + +3. Finally, with running services, we can use `tritonclient` or simple `curl` commands to send requests and receive responses from the frontends. + +```python +import tritonclient.http as httpclient +import numpy as np # Use version numpy < 2 +model_name = "identity" # output == input +url = "localhost:8000" + +# Create a Triton client +client = httpclient.InferenceServerClient(url=url) + +# Prepare input data +input_data = np.array([["Roger Roger"]], dtype=object) + +# Create input and output objects +inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + +# Set the data for the input tensor +inputs[0].set_data_from_numpy(input_data) + +results = client.infer(model_name, inputs=inputs) + +# Get the output data +output_data = results.as_numpy("OUTPUT0") + +# Print results +print("[INFERENCE RESULTS]") +print("Output data:", output_data) + +# Stop respective services and server. +http_service.stop() +grpc_service.stop() +server.stop() +``` + +--- + +Additionally, `tritonfrontend` provides context manager support as well. 
So steps 2-3, could also be achieved through: +```python +from tritonfrontend import KServeHttp +import tritonclient.http as httpclient +import numpy as np # Use version numpy < 2 + +with KServeHttp(server) as http_service: + # The identity model returns an exact duplicate of the input data as output + model_name = "identity" + url = "localhost:8000" + # Create a Triton client + with httpclient.InferenceServerClient(url=url) as client: + # Prepare input data + input_data = np.array(["Roger Roger"], dtype=object) + # Create input and output objects + inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + # Perform inference + results = client.infer(model_name, inputs=inputs) + # Get the output data + output_data = results.as_numpy("OUTPUT0") + # Print results + print("[INFERENCE RESULTS]") + print("Output data:", output_data) + +server.stop() +``` +With this workflow, you can avoid having to stop each service after client requests have terminated. + + +## Known Issues +- The following features are not currently supported when launching the Triton frontend services through the python bindings: + - [Tracing](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/trace.md) + - [Shared Memory](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_shared_memory.md) + - [Metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) + - [Restricted Protocols](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#limit-endpoint-access-beta) + - VertexAI + - Sagemaker +- After a running server has been stopped, if the client sends an inference request, a Segmentation Fault will occur. \ No newline at end of file diff --git a/docs/examples/fetch_models.sh b/docs/examples/fetch_models.sh index 5594878b3e..f5aaed85aa 100755 --- a/docs/examples/fetch_models.sh +++ b/docs/examples/fetch_models.sh @@ -37,4 +37,4 @@ mv /tmp/inception_v3_2016_08_28_frozen.pb model_repository/inception_graphdef/1/ # ONNX densenet mkdir -p model_repository/densenet_onnx/1 wget -O model_repository/densenet_onnx/1/model.onnx \ - https://contentmamluswest001.blob.core.windows.net/content/14b2744cf8d6418c87ffddc3f3127242/9502630827244d60a1214f250e3bbca7/08aed7327d694b8dbaee2c97b8d0fcba/densenet121-1.2.onnx + https://github.com/onnx/models/raw/main/validated/vision/classification/densenet-121/model/densenet-7.onnx diff --git a/docs/examples/jetson/README.md b/docs/examples/jetson/README.md index 281d5f2a97..77a20474b9 100644 --- a/docs/examples/jetson/README.md +++ b/docs/examples/jetson/README.md @@ -1,5 +1,5 @@ + +# TensorRT-LLM User Guide + +## What is TensorRT-LLM + +[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) +(TRT-LLM) is an open-source library designed to accelerate and optimize the +inference performance of large language models (LLMs) on NVIDIA GPUs. TRT-LLM +offers users an easy-to-use Python API to build TensorRT engines for LLMs, +incorporating state-of-the-art optimizations to ensure efficient inference on +NVIDIA GPUs. + +## How to run TRT-LLM models with Triton Server via TensorRT-LLM backend + +The +[TensorRT-LLM Backend](https://github.com/triton-inference-server/tensorrtllm_backend) +lets you serve TensorRT-LLM models with Triton Inference Server. 
Check out the +[Getting Started](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#getting-started) +section in the TensorRT-LLM Backend repo to learn how to utilize the +[NGC Triton TRT-LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) +to prepare engines for your LLM models and serve them with Triton. + +## How to use your custom TRT-LLM model + +All the supported models can be found in the +[examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) folder in +the TRT-LLM repo. Follow the examples to convert your models to TensorRT +engines. + +After the engine is built, [prepare the model repository](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#prepare-the-model-repository) +for Triton, and +[modify the model configuration](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#modify-the-model-configuration). + +Only the *mandatory parameters* need to be set in the model config file. Feel free +to modify the optional parameters as needed. To learn more about the +parameters, model inputs, and outputs, see the +[model config documentation](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/model_config.md). + +## Advanced Configuration Options and Deployment Strategies + +Explore advanced configuration options and deployment strategies to optimize +and run Triton with your TRT-LLM models effectively: + +- [Model Deployment](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#model-deployment): Techniques for efficiently deploying and managing your models in various environments. +- [Multi-Instance GPU (MIG) Support](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#mig-support): Run Triton and TRT-LLM models with MIG to optimize GPU resource management. +- [Scheduling](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#scheduling): Configure scheduling policies to control how requests are managed and executed. +- [Key-Value Cache](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#key-value-cache): Utilize KV cache and KV cache reuse to optimize memory usage and improve performance. +- [Decoding](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#decoding): Advanced methods for generating text, including top-k, top-p, top-k top-p, beam search, Medusa, and speculative decoding. +- [Chunked Context](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#chunked-context): Splitting the context into several chunks and batching them during the generation phase to increase overall throughput. +- [Quantization](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#quantization): Apply quantization techniques to reduce model size and enhance inference speed. +- [LoRa (Low-Rank Adaptation)](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#lora): Use LoRa for efficient model fine-tuning and adaptation. + +## Tutorials + +Make sure to check out the +[tutorials](https://github.com/triton-inference-server/tutorials) repo to see +more guides on serving popular LLM models with Triton Server and TensorRT-LLM, +as well as deploying them on Kubernetes. 
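To make the deployment steps above concrete, here is a minimal request sketch (an editorial illustration, not part of this change). It assumes a locally running server on port 8000 and the `ensemble` model with the `text_input`/`max_tokens`/`text_output` tensor names used in the TRT-LLM backend examples; adjust the model and tensor names to match your own configuration.

```python
# Hypothetical example: query a served TRT-LLM model via Triton's HTTP
# generate endpoint. Assumes the "ensemble" model and the tensor names from
# the TRT-LLM backend examples; adjust to your model configuration.
import requests

payload = {"text_input": "What is Triton Inference Server?", "max_tokens": 64}
resp = requests.post(
    "http://localhost:8000/v2/models/ensemble/generate", json=payload, timeout=60
)
resp.raise_for_status()
print(resp.json()["text_output"])
```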
+ +## Benchmark + +[GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) +is a command line tool for measuring the throughput and latency of LLMs served +by Triton Inference Server. Check out the +[Quick Start](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf#quick-start) +to learn how to use GenAI-Perf to benchmark your LLM models. + +## Performance Best Practices + +Check out the +[Performance Best Practices guide](https://nvidia.github.io/TensorRT-LLM/performance/perf-best-practices.html) +to learn how to optimize your TensorRT-LLM models for better performance. + +## Metrics + +Triton Server provides +[metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) +indicating GPU and request statistics. +See the +[Triton Metrics](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#triton-metrics) +section in the TensorRT-LLM Backend repo to learn how to query the Triton +metrics endpoint to obtain TRT-LLM statistics. + +## Ask questions or report issues + +Can't find what you're looking for, or have a question or issue? Feel free to +ask questions or report issues in the GitHub issues page: + +- [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/issues) +- [TensorRT-LLM Backend](https://github.com/triton-inference-server/tensorrtllm_backend/issues) +- [Triton Inference Server](https://github.com/triton-inference-server/server/issues) diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md index 136edd180f..88a7037c7f 100644 --- a/docs/user_guide/custom_operations.md +++ b/docs/user_guide/custom_operations.md @@ -64,7 +64,7 @@ simple way to ensure you are using the correct version of TensorRT is to use the [NGC TensorRT container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the TensorRT +the 24.09 version of Triton, use the 24.09 version of the TensorRT container. ## TensorFlow @@ -123,7 +123,7 @@ simple way to ensure you are using the correct version of TensorFlow is to use the [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the TensorFlow +the 24.09 version of Triton, use the 24.09 version of the TensorFlow container. ## PyTorch @@ -167,7 +167,7 @@ simple way to ensure you are using the correct version of PyTorch is to use the [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) corresponding to the Triton container. For example, if you are using -the 24.07 version of Triton, use the 24.07 version of the PyTorch +the 24.09 version of Triton, use the 24.09 version of the PyTorch container. ## ONNX diff --git a/docs/user_guide/debugging_guide.md b/docs/user_guide/debugging_guide.md index 3a38f209d3..e5b0263d30 100644 --- a/docs/user_guide/debugging_guide.md +++ b/docs/user_guide/debugging_guide.md @@ -1,5 +1,5 @@ Perf Analyzer documentation has been relocated to -[here](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md). +[here](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md). 
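As a small illustration of the metrics endpoint mentioned in the TRT-LLM guide above (a sketch added for clarity, not part of this change), the Prometheus-format metrics can be polled directly; this assumes the default metrics port 8002 on a local server.

```python
# Hypothetical example: scrape Triton's Prometheus metrics endpoint
# (default port 8002) and print the inference success counters.
import requests

metrics_text = requests.get("http://localhost:8002/metrics", timeout=5).text
for line in metrics_text.splitlines():
    if line.startswith("nv_inference_request_success"):
        print(line)
```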
diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index 49cad9e637..efea32a63b 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -73,7 +73,7 @@ For additional material, see the verify that we can run inference requests and get a baseline performance benchmark of your model. Triton's - [Perf Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) tool specifically fits this purpose. Here is a simplified output for demonstration purposes: @@ -103,7 +103,7 @@ For additional material, see the There are many variables that can be tweaked just within your model configuration (`config.pbtxt`) to obtain different results. - As your model, config, or use case evolves, - [Perf Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) + [Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) is a great tool to quickly verify model functionality and performance. 3. How can I improve my model performance? @@ -235,7 +235,7 @@ with a `tritonserver` binary. ```bash # Start server container -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.07-py3 +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.09-py3 # Start serving your models tritonserver --model-repository=/mnt/models @@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u ```bash # Start the SDK container interactively -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.07-py3-sdk +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.09-py3-sdk # Benchmark model being served from step 3 perf_analyzer -m densenet_onnx --concurrency-range 1:4 diff --git a/qa/L0_backend_python/argument_validation/test.sh b/qa/L0_backend_python/argument_validation/test.sh index b14ba4abb3..90cbef89b5 100755 --- a/qa/L0_backend_python/argument_validation/test.sh +++ b/qa/L0_backend_python/argument_validation/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./arg_validation_client.log" TEST_RESULT_FILE='test_results.txt' SERVER_ARGS="--model-repository=${MODELDIR}/argument_validation/models --backend-directory=${BACKEND_DIR} --log-verbose=1" diff --git a/qa/L0_backend_python/bls/test.sh b/qa/L0_backend_python/bls/test.sh index 204af7e2ba..46d1f40818 100755 --- a/qa/L0_backend_python/bls/test.sh +++ b/qa/L0_backend_python/bls/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./bls_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh @@ -33,7 +33,7 @@ source ../../common/util.sh TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:=http://github.com/triton-inference-server} RET=0 -rm -fr *.log ./models *.txt +rm -fr *.log ./models *.txt *.xml # FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU if [[ ${TEST_WINDOWS} == 0 ]]; then @@ -119,30 +119,35 @@ if [[ ${TEST_WINDOWS} == 0 ]]; then for MODEL_NAME in bls bls_memory bls_memory_async bls_async; do export MODEL_NAME=${MODEL_NAME} - - python3 -m pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then + # Run with pytest to capture the return code correctly + pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 + EXIT_CODE=$? + if [ $EXIT_CODE -ne 0 ]; then echo -e "\n***\n*** ${MODEL_NAME} ${BLS_KIND} test FAILED. \n***" + RET=$EXIT_CODE cat $SERVER_LOG cat $CLIENT_LOG - RET=1 fi done - set -e - kill_server - # Check for bls 'test_timeout' to ensure timeout value is being correctly passed - if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then - echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***" - cat $SERVER_LOG - RET=1 + set -e + + # Only check the timeout value if there is no error since the test + # may fail before the test_timeout case gets run. + if [ $RET -eq 0 ]; then + # Check for bls 'test_timeout' to ensure timeout value is being correctly passed + if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***" + cat $SERVER_LOG + RET=1 + fi fi - if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 128 ]]; then + if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 256 ]]; then if [ `grep -c "Failed to allocate memory from CUDA memory pool" $SERVER_LOG` != "0" ]; then - echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMOY_POOL_SIZE_MB is 128 MB for 'bls' $BLS_KIND test\n***" + echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMORY_POOL_SIZE_MB is 256 MB for 'bls' $BLS_KIND test\n***" cat $SERVER_LOG RET=1 fi @@ -342,10 +347,10 @@ set -e kill_server -if [ $RET -eq 1 ]; then - echo -e "\n***\n*** BLS test FAILED. \n***" -else +if [ $RET -eq 0 ]; then echo -e "\n***\n*** BLS test PASSED. \n***" +else + echo -e "\n***\n*** BLS test FAILED. \n***" fi exit $RET diff --git a/qa/L0_backend_python/custom_metrics/test.sh b/qa/L0_backend_python/custom_metrics/test.sh index 4491d9e030..9020c7ebfd 100755 --- a/qa/L0_backend_python/custom_metrics/test.sh +++ b/qa/L0_backend_python/custom_metrics/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-CLIENT_PY=../python_unittest.py +CLIENT_PY=../test_infer_shm_leak.py CLIENT_LOG="./custom_metrics_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 883f6d20b6..d6eb2a8f53 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -35,6 +35,7 @@ sys.path.append("../../common") import queue +import threading import time import unittest from functools import partial @@ -241,6 +242,135 @@ def test_infer_pymodel_error(self): initial_metrics_value, ) + # Test grpc stream behavior when triton_grpc_error is set to true. + # Expected to close stream and return GRPC error when model returns error. + def test_triton_grpc_error_error_on(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 2 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + stream_end = False + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + stream_end = True + else: + # Stream is not killed + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + + # Test grpc stream behavior when triton_grpc_error is set to true in multiple open streams. + # Expected to close stream and return GRPC error when model returns error. + def test_triton_grpc_error_multithreaded(self): + thread1 = threading.Thread(target=self.test_triton_grpc_error_error_on) + thread2 = threading.Thread(target=self.test_triton_grpc_error_error_on) + # Start the threads + thread1.start() + thread2.start() + # Wait for both threads to finish + thread1.join() + thread2.join() + + # Test grpc stream behavior when triton_grpc_error is set to true and subsequent stream is cancelled. + # Expected cancellation is successful. 
+ def test_triton_grpc_error_cancel(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 1 + user_data = UserData() + triton_server_url = "localhost:8001" # Replace with your Triton server address + stream_end = False + triton_client = grpcclient.InferenceServerClient(triton_server_url) + + metadata = {"triton_grpc_error": "true"} + + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + stream_end = True + if i == 0: + triton_client.stop_stream(cancel_requests=True) + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + self.assertTrue( + True, + "This should always pass as cancellation should succeed without any exception", + ) + + # Test grpc stream behavior when triton_grpc_error is set to false + # and subsequent stream is NOT closed when error is reported from CORE + def test_triton_grpc_error_error_off(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 4 + response_counter = 0 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + triton_client.start_stream(callback=partial(callback, user_data)) + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + _ = user_data._completed_requests.get() + response_counter += 1 + # we expect response_counter == number_of_requests, + # which indicates that after the first reported grpc error stream did NOT close and mode != triton_grpc_error + self.assertEqual(response_counter, number_of_requests) + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/lifecycle/test.sh b/qa/L0_backend_python/lifecycle/test.sh index dba4581ddd..59b846f56b 100755 --- a/qa/L0_backend_python/lifecycle/test.sh +++ b/qa/L0_backend_python/lifecycle/test.sh @@ -52,6 +52,14 @@ cp ../../python_models/execute_error/config.pbtxt ./models/execute_error/ sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 12000000 }" >> config.pbtxt) +mkdir -p models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/model.py ./models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/config.pbtxt ./models/execute_grpc_error/ +(cd models/execute_grpc_error && \ + sed -i "s/^name:.*/name: \"execute_grpc_error\"/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 1200000 }" >> config.pbtxt) + mkdir -p models/execute_return_error/1/ cp ../../python_models/execute_return_error/model.py 
./models/execute_return_error/1/ cp ../../python_models/execute_return_error/config.pbtxt ./models/execute_return_error/ diff --git a/qa/L0_backend_python/request_rescheduling/test.sh b/qa/L0_backend_python/request_rescheduling/test.sh index 6fd6fe09e5..31ba6692d9 100755 --- a/qa/L0_backend_python/request_rescheduling/test.sh +++ b/qa/L0_backend_python/request_rescheduling/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_PY="../python_unittest.py" +CLIENT_PY="../test_infer_shm_leak.py" CLIENT_LOG="./request_rescheduling_client.log" TEST_RESULT_FILE='test_results.txt' source ../../common/util.sh diff --git a/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py b/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py new file mode 100644 index 0000000000..386a54e3d3 --- /dev/null +++ b/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py @@ -0,0 +1,77 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import time +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient + + +class ResponseSenderTest(unittest.TestCase): + def _generate_streaming_callback_and_responses_pair(self): + responses = [] # [{"result": result, "error": error}, ...] 
+ + def callback(result, error): + responses.append({"result": result, "error": error}) + + return callback, responses + + def test_respond_after_complete_final(self): + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Test Passed", server_log) + + model_name = "response_sender_complete_final" + shape = [1, 1] + inputs = [grpcclient.InferInput("INPUT0", shape, "FP32")] + input0_np = np.array([[123.45]], np.float32) + inputs[0].set_data_from_numpy(input0_np) + + callback, responses = self._generate_streaming_callback_and_responses_pair() + with grpcclient.InferenceServerClient("localhost:8001") as client: + client.start_stream(callback) + client.async_stream_infer(model_name, inputs) + client.stop_stream() + + self.assertEqual(len(responses), 1) + for response in responses: + output0_np = response["result"].as_numpy(name="OUTPUT0") + self.assertTrue(np.allclose(input0_np, output0_np)) + self.assertIsNone(response["error"]) + + time.sleep(1) # make sure the logs are written before checking + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Unexpected request length", server_log) + self.assertNotIn("Expected exception not raised", server_log) + self.assertNotIn("Test FAILED", server_log) + self.assertIn("Test Passed", server_log) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/response_sender/test.sh b/qa/L0_backend_python/response_sender/test.sh index 33db46edbb..cca7e7acfa 100755 --- a/qa/L0_backend_python/response_sender/test.sh +++ b/qa/L0_backend_python/response_sender/test.sh @@ -97,6 +97,37 @@ set -e kill $SERVER_PID wait $SERVER_PID +# +# Test response sender to raise exception on response after complete final flag +# +rm -rf models && mkdir models +mkdir -p models/response_sender_complete_final/1 && \ + cp ../../python_models/response_sender_complete_final/model.py models/response_sender_complete_final/1 && \ + cp ../../python_models/response_sender_complete_final/config.pbtxt models/response_sender_complete_final + +TEST_LOG="response_sender_complete_final_test.log" +SERVER_LOG="response_sender_complete_final_test.server.log" +SERVER_ARGS="--model-repository=${MODELDIR}/response_sender/models --backend-directory=${BACKEND_DIR} --log-verbose=1" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=concurrency_test.report.xml response_sender_complete_final_test.py > $TEST_LOG 2>&1 +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** response sender complete final test FAILED\n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + # # Test async response sender under decoupled / non-decoupled # diff --git a/qa/L0_backend_python/setup_python_enviroment.sh b/qa/L0_backend_python/setup_python_enviroment.sh index 88baccc4f6..a2171e02da 100755 --- a/qa/L0_backend_python/setup_python_enviroment.sh +++ b/qa/L0_backend_python/setup_python_enviroment.sh @@ -151,7 +151,7 @@ apt-get update && apt-get -y install \ libboost-dev rm -f /usr/bin/python3 && \ ln -s "/usr/bin/python3.${PYTHON_ENV_VERSION}" /usr/bin/python3 -pip3 install --upgrade install requests numpy virtualenv protobuf +pip3 install --upgrade requests numpy virtualenv protobuf find /opt/tritonserver/qa/pkgs/ -maxdepth 1 -type f -name \ "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \ xargs pip3 install --upgrade diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh index 65767419f2..324ee5ba1f 100755 --- a/qa/L0_backend_python/test.sh +++ b/qa/L0_backend_python/test.sh @@ -39,18 +39,18 @@ fi # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. export TEST_WINDOWS=0 -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then export DATADIR=${DATADIR:="/c/data/inferenceserver/${REPO_VERSION}"} export TRITON_DIR=${TRITON_DIR:=c:/tritonserver} # This will run in WSL, but Triton will run in windows, so environment # variables meant for loaded models must be exported using WSLENV. # The /w flag indicates the value should only be included when invoking # Win32 from WSL. - export WSLENV=TRITON_DIR/w + export WSLENV=TRITON_DIR export SERVER=${SERVER:=c:/tritonserver/bin/tritonserver.exe} export BACKEND_DIR=${BACKEND_DIR:=c:/tritonserver/backends} export MODELDIR=${MODELDIR:=c:/} - TEST_WINDOWS=1 + export TEST_WINDOWS=1 else export DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} export TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} @@ -425,11 +425,20 @@ if [ "$TEST_JETSON" == "0" ]; then # between dependencies. setup_virtualenv + set +e (cd ${TEST} && bash -ex test.sh) - if [ $? -ne 0 ]; then + EXIT_CODE=$? + if [ $EXIT_CODE -ne 0 ]; then echo "Subtest ${TEST} FAILED" - RET=1 + RET=$EXIT_CODE + + # In bls test, it is allowed to fail with a strict memory leak of 480 bytes with exit code '123'. + # Propagate the exit code to make sure it's not overwritten by other tests. + if [[ ${TEST} == "bls" ]] && [[ $EXIT_CODE -ne 1 ]] ; then + BLS_RET=$RET + fi fi + set -e deactivate_virtualenv done @@ -438,11 +447,13 @@ if [ "$TEST_JETSON" == "0" ]; then if [[ ${PYTHON_ENV_VERSION} = "10" ]] && [[ ${TEST_WINDOWS} == 0 ]]; then # In 'env' test we use miniconda for dependency management. No need to run # the test in a virtual environment. + set +e (cd env && bash -ex test.sh) if [ $? -ne 0 ]; then echo "Subtest env FAILED" RET=1 fi + set -e fi fi @@ -459,12 +470,14 @@ for TEST in ${SUBTESTS}; do # between dependencies. setup_virtualenv + set +e (cd ${TEST} && bash -ex test.sh) if [ $? -ne 0 ]; then echo "Subtest ${TEST} FAILED" RET=1 fi + set -e deactivate_virtualenv done @@ -475,4 +488,14 @@ else echo -e "\n***\n*** Test FAILED\n***" fi -exit $RET +# Exit with RET if it is 1, meaning that the test failed. +# Otherwise, exit with BLS_RET if it is set, meaning that the known memory leak is captured. 
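A minimal sketch (not part of the patch) of the Python side that produces the sentinel exit code referenced in the comment above: a probe failure matching the known 480-byte message ends the pytest session with code 123, which the shell logic below then propagates as BLS_RET instead of treating it as a hard failure. The check_probe() helper and its leaked_bytes argument are hypothetical.

import pytest

ALLOWED_FAILURE_EXIT_CODE = 123  # sentinel understood by the calling test.sh

def check_probe(leaked_bytes: int) -> None:
    # Hypothetical helper: decide how a detected shared memory leak is reported.
    if leaked_bytes == 0:
        return
    if leaked_bytes == 480:
        # Known, tolerated leak: stop the session with the sentinel exit code.
        pytest.exit(
            f"Known shared memory leak of {leaked_bytes} bytes detected",
            returncode=ALLOWED_FAILURE_EXIT_CODE,
        )
    raise AssertionError(f"Unexpected shared memory leak of {leaked_bytes} bytes")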
+if [ $RET -eq 1 ]; then + exit $RET +else + if [ -z "$BLS_RET" ]; then + exit $RET + else + exit $BLS_RET + fi +fi diff --git a/qa/L0_backend_python/python_unittest.py b/qa/L0_backend_python/test_infer_shm_leak.py similarity index 75% rename from qa/L0_backend_python/python_unittest.py rename to qa/L0_backend_python/test_infer_shm_leak.py index 4b94996976..966243e86e 100755 --- a/qa/L0_backend_python/python_unittest.py +++ b/qa/L0_backend_python/test_infer_shm_leak.py @@ -33,6 +33,7 @@ import os import unittest +import pytest import shm_util import tritonclient.grpc as grpcclient from tritonclient.utils import * @@ -41,11 +42,13 @@ # we overwrite the IP address with the TRITONSERVER_IPADDR envvar _tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") +# The exit code 123 is used to indicate that the shm leak probe detected a 480 +# bytes leak in the bls sub-test. Any leak other than 480 bytes will cause the +# test to fail with the default exit code 1. +ALLOWED_FAILURE_EXIT_CODE = 123 -class PythonUnittest(unittest.TestCase): - def setUp(self): - self._shm_leak_detector = shm_util.ShmLeakDetector() +class TestInferShmLeak: def _run_unittest(self, model_name): with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client: # No input is required @@ -54,15 +57,17 @@ def _run_unittest(self, model_name): # The model returns 1 if the tests were successfully passed. # Otherwise, it will return 0. - self.assertEqual( - output0, [1], f"python_unittest failed for model {model_name}" - ) - - def test_python_unittest(self): - model_name = os.environ["MODEL_NAME"] - with self._shm_leak_detector.Probe() as shm_probe: - self._run_unittest(model_name) + assert output0 == [1], f"python_unittest failed for model {model_name}" + def test_shm_leak(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + model_name = os.environ.get("MODEL_NAME", "default_model") -if __name__ == "__main__": - unittest.main() + try: + with self._shm_leak_detector.Probe() as shm_probe: + self._run_unittest(model_name) + except AssertionError as e: + if "Known shared memory leak of 480 bytes detected" in str(e): + pytest.exit(str(e), returncode=ALLOWED_FAILURE_EXIT_CODE) + else: + raise e diff --git a/qa/L0_batcher/test.sh b/qa/L0_batcher/test.sh index 827751eb40..7043aab2a5 100755 --- a/qa/L0_batcher/test.sh +++ b/qa/L0_batcher/test.sh @@ -79,7 +79,7 @@ TF_VERSION=${TF_VERSION:=2} # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} @@ -601,7 +601,7 @@ done TEST_CASE=test_multi_batch_preserve_ordering # Skip test for Windows. Trace file concats at 8192 chars on Windows. -if [[ "$(< /proc/sys/kernel/osrelease)" != *microsoft* ]]; then +if [[ ! -v WSL_DISTRO_NAME ]] || [[ ! -v MSYSTEM ]]; then rm -fr ./custom_models && mkdir ./custom_models && \ cp -r ../custom_models/custom_zero_1_float32 ./custom_models/. 
&& \ mkdir -p ./custom_models/custom_zero_1_float32/1 diff --git a/qa/L0_client_build_variants/test.sh b/qa/L0_client_build_variants/test.sh index c31c55e310..9dc1c4c85d 100755 --- a/qa/L0_client_build_variants/test.sh +++ b/qa/L0_client_build_variants/test.sh @@ -58,10 +58,6 @@ TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-i -DTRITON_ENABLE_PYTHON_HTTP=ON \ -DTRITON_ENABLE_PYTHON_GRPC=ON \ -DTRITON_ENABLE_JAVA_HTTP=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=OFF \ @@ -90,10 +86,6 @@ fi -DTRITON_ENABLE_CC_GRPC=ON \ -DTRITON_ENABLE_PYTHON_HTTP=OFF \ -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ @@ -121,10 +113,6 @@ fi -DTRITON_ENABLE_CC_GRPC=OFF \ -DTRITON_ENABLE_PYTHON_HTTP=ON \ -DTRITON_ENABLE_PYTHON_GRPC=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ -DTRITON_ENABLE_EXAMPLES=ON \ -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ @@ -141,59 +129,27 @@ else exit 1 fi -# -# Build without Perf Analyzer -# -(cd /workspace/build && \ - rm -fr cc-clients python-clients && \ - cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ - -DTRITON_ENABLE_CC_HTTP=ON \ - -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_C_API=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ - -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ - -DTRITON_ENABLE_GPU=ON \ - -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ - -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ - -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) -if [ $? -eq 0 ]; then - echo -e "\n***\n*** No-Perf-Analyzer Passed\n***" -else - echo -e "\n***\n*** No-Perf-Analyzer FAILED\n***" - exit 1 -fi - +# TODO: TPRD-342 These tests should be PA CI test +# cases not Triton test cases +rm -fr /workspace/build +mkdir -p /workspace/build # # Build without C API in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=OFF \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? 
-eq 0 ]; then echo -e "\n***\n*** No-CAPI Passed\n***" else @@ -205,25 +161,20 @@ fi # Build without TensorFlow Serving in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ + rm -fr cc_clients perf_analyzer && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? -eq 0 ]; then echo -e "\n***\n*** No-TF-Serving Passed\n***" else @@ -235,25 +186,20 @@ fi # Build without TorchServe in Perf Analyzer # (cd /workspace/build && \ - rm -fr cc-clients python-clients && \ + rm -fr cc_clients perf_analyzer && \ cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_ENABLE_CC_HTTP=ON \ -DTRITON_ENABLE_CC_GRPC=ON \ - -DTRITON_ENABLE_PYTHON_HTTP=ON \ - -DTRITON_ENABLE_PYTHON_GRPC=ON \ - -DTRITON_ENABLE_PERF_ANALYZER=ON \ -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \ -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ - -DTRITON_ENABLE_EXAMPLES=ON \ - -DTRITON_ENABLE_TESTS=ON \ -DTRITON_ENABLE_GPU=ON \ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ - /workspace/client && \ - make -j16 cc-clients python-clients) + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) if [ $? 
-eq 0 ]; then echo -e "\n***\n*** No-TorchServe Passed\n***" else diff --git a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py index 07f9c05a88..51137e8934 100755 --- a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py +++ b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py @@ -31,18 +31,20 @@ sys.path.append("../common") import os +import time import unittest +from functools import partial import infer_util as iu import numpy as np import test_util as tu import tritonclient.grpc as grpcclient import tritonclient.http as httpclient -import tritonshmutils.cuda_shared_memory as cshm +import tritonclient.utils.cuda_shared_memory as cshm from tritonclient.utils import * -class CudaSharedMemoryTest(tu.TestResultCollector): +class CudaSharedMemoryTestBase(tu.TestResultCollector): DEFAULT_SHM_BYTE_SIZE = 64 def setUp(self): @@ -61,76 +63,6 @@ def _setup_client(self): self.url, verbose=True ) - def test_invalid_create_shm(self): - # Raises error since tried to create invalid cuda shared memory region - try: - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - except Exception as ex: - self.assertEqual(str(ex), "unable to create cuda shared memory handle") - - def test_valid_create_set_register(self): - # Create a valid cuda shared memory region, fill data in it and register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - cshm.set_shared_memory_region( - shm_op0_handle, [np.array([1, 2], dtype=np.float32)] - ) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 1) - else: - self.assertEqual(len(shm_status.regions), 1) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_unregister_before_register(self): - # Create a valid cuda shared memory region and unregister before register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.unregister_cuda_shared_memory("dummy_data") - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 0) - else: - self.assertEqual(len(shm_status.regions), 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_unregister_after_register(self): - # Create a valid cuda shared memory region and unregister after register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - self.triton_client.unregister_cuda_shared_memory("dummy_data") - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 0) - else: - self.assertEqual(len(shm_status.regions), 0) - cshm.destroy_shared_memory_region(shm_op0_handle) - - def test_reregister_after_register(self): - # Create a valid cuda shared memory region and unregister after register - shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - try: - self.triton_client.register_cuda_shared_memory( - "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 - ) - except Exception as ex: - self.assertIn( - "shared memory region 
'dummy_data' already in manager", str(ex) - ) - shm_status = self.triton_client.get_cuda_shared_memory_status() - if self.protocol == "http": - self.assertEqual(len(shm_status), 1) - else: - self.assertEqual(len(shm_status.regions), 1) - cshm.destroy_shared_memory_region(shm_op0_handle) - def _configure_server( self, create_byte_size=DEFAULT_SHM_BYTE_SIZE, @@ -205,6 +137,78 @@ def _cleanup_server(self, shm_handles): for shm_handle in shm_handles: cshm.destroy_shared_memory_region(shm_handle) + +class CudaSharedMemoryTest(CudaSharedMemoryTestBase): + def test_invalid_create_shm(self): + # Raises error since tried to create invalid cuda shared memory region + try: + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + except Exception as ex: + self.assertEqual(str(ex), "unable to create cuda shared memory handle") + + def test_valid_create_set_register(self): + # Create a valid cuda shared memory region, fill data in it and register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + cshm.set_shared_memory_region( + shm_op0_handle, [np.array([1, 2], dtype=np.float32)] + ) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 1) + else: + self.assertEqual(len(shm_status.regions), 1) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_before_register(self): + # Create a valid cuda shared memory region and unregister before register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.unregister_cuda_shared_memory("dummy_data") + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 0) + else: + self.assertEqual(len(shm_status.regions), 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_after_register(self): + # Create a valid cuda shared memory region and unregister after register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + self.triton_client.unregister_cuda_shared_memory("dummy_data") + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 0) + else: + self.assertEqual(len(shm_status.regions), 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_reregister_after_register(self): + # Create a valid cuda shared memory region and unregister after register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + try: + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + except Exception as ex: + self.assertIn( + "shared memory region 'dummy_data' already in manager", str(ex) + ) + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 1) + else: + self.assertEqual(len(shm_status.regions), 1) + cshm.destroy_shared_memory_region(shm_op0_handle) + def test_unregister_after_inference(self): # Unregister after inference error_msg = [] @@ -396,5 +400,169 @@ def 
test_infer_byte_size_out_of_bound(self): self._cleanup_server(shm_handles) +class TestCudaSharedMemoryUnregister(CudaSharedMemoryTestBase): + def _test_unregister_shm_fail(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory() + self.assertIn( + "Failed to unregister the following cuda shared memory regions: input0_data ,input1_data ,output0_data ,output1_data", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("input0_data") + self.assertIn( + "Cannot unregister shared memory region 'input0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("input1_data") + self.assertIn( + "Cannot unregister shared memory region 'input1_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("output0_data") + self.assertIn( + "Cannot unregister shared memory region 'output0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("output1_data") + self.assertIn( + "Cannot unregister shared memory region 'output1_data', it is currently in use.", + str(ex.exception), + ) + + def _test_shm_not_found(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("input0_data") + self.assertIn( + "Unable to find cuda shared memory region: 'input0_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("input1_data") + self.assertIn( + "Unable to find cuda shared memory region: 'input1_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("output0_data") + self.assertIn( + "Unable to find cuda shared memory region: 'output0_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("output1_data") + self.assertIn( + "Unable to find cuda shared memory region: 'output1_data'", + str(ex.exception), + ) + + def test_unregister_shm_during_inference_http(self): + try: + self.triton_client.unregister_cuda_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + httpclient.InferInput("INPUT0", [1, 16], "INT32"), + httpclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0", binary_data=True), + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + async_request = self.triton_client.async_infer( + model_name="simple", inputs=inputs, outputs=outputs + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Blocking call + 
async_request.get_result() + + # Try unregister shm regions after inference + self.triton_client.unregister_cuda_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + def test_unregister_shm_during_inference_grpc(self): + try: + self.triton_client.unregister_cuda_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + grpcclient.InferRequestedOutput("OUTPUT0"), + grpcclient.InferRequestedOutput("OUTPUT1"), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + user_data = [] + + self.triton_client.async_infer( + model_name="simple", + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Wait until the results are available in user_data + time_out = 20 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + time.sleep(2) + + # Try unregister shm regions after inference + self.triton_client.unregister_cuda_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_cuda_shared_memory/test.sh b/qa/L0_cuda_shared_memory/test.sh index 02857b2153..b7126a9295 100755 --- a/qa/L0_cuda_shared_memory/test.sh +++ b/qa/L0_cuda_shared_memory/test.sh @@ -84,6 +84,47 @@ for i in \ done done +mkdir -p python_models/simple/1/ +cp ../python_models/execute_delayed_model/model.py ./python_models/simple/1/ +cp ../python_models/execute_delayed_model/config.pbtxt ./python_models/simple/ +sed -i 's/KIND_CPU/KIND_GPU/g' ./python_models/simple/config.pbtxt + +for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/python_models --log-verbose=1 ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./unregister_shm.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + CLIENT_LOG="./unregister_shm.$client_type.client.log" + set +e + python3 $SHM_TEST TestCudaSharedMemoryUnregister.test_unregister_shm_during_inference_$client_type >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + kill $SERVER_PID + wait $SERVER_PID + if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Test Server shut down non-gracefully\n***" + RET=1 + fi + set -e + done + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index 1f76f4845b..d7bc59f5c7 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -116,7 +116,13 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -175,7 +181,13 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh index 98ad134d8b..22c37dff49 100755 --- a/qa/L0_decoupled/test.sh +++ b/qa/L0_decoupled/test.sh @@ -176,4 +176,4 @@ else echo -e "\n***\n*** Test Failed\n***" fi -exit $RET +exit $RET \ No newline at end of file diff --git a/qa/L0_dlpack_multi_gpu/test.sh b/qa/L0_dlpack_multi_gpu/test.sh index 996f062f42..ae72daa7d0 100755 --- a/qa/L0_dlpack_multi_gpu/test.sh +++ b/qa/L0_dlpack_multi_gpu/test.sh @@ -27,7 +27,7 @@ SERVER=/opt/tritonserver/bin/tritonserver SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" -CLIENT_PY=./python_unittest.py +CLIENT_PY=./test_infer_shm_leak.py CLIENT_LOG="./client.log" EXPECTED_NUM_TESTS="1" TEST_RESULT_FILE='test_results.txt' @@ -52,8 +52,8 @@ rm -fr *.log ./models mkdir -p models/dlpack_test/1/ cp ../python_models/dlpack_test/model.py models/dlpack_test/1/ cp ../python_models/dlpack_test/config.pbtxt models/dlpack_test -cp ../L0_backend_python/python_unittest.py . -sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py +cp ../L0_backend_python/test_infer_shm_leak.py . +sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py run_server if [ "$SERVER_PID" == "0" ]; then diff --git a/qa/L0_grpc/test.sh b/qa/L0_grpc/test.sh index 50cf5a6f91..93d22e75be 100755 --- a/qa/L0_grpc/test.sh +++ b/qa/L0_grpc/test.sh @@ -48,7 +48,7 @@ NGINX_CONF="./nginx.conf" # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. 
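The hunks around here replace the /proc/sys/kernel/osrelease sniff with environment-variable checks; a minimal Python equivalent of that detection, assuming WSL_DISTRO_NAME is exported inside WSL sessions and MSYSTEM inside MSYS2/Git Bash shells:

import os

def on_windows_host() -> bool:
    # Either variable indicates the script is driving a Windows-hosted Triton.
    return "WSL_DISTRO_NAME" in os.environ or "MSYSTEM" in os.environ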
-if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then SDKDIR=${SDKDIR:=C:/sdk} MODELDIR=${MODELDIR:=C:/models} CLIENT_PLUGIN_MODELDIR=${MODELDIR:=C:/client_plugin_models} diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py index 431eeb1720..f7507747e9 100755 --- a/qa/L0_grpc_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_state_cleanup/cleanup_test.py @@ -161,9 +161,17 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -229,9 +237,17 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -608,9 +624,17 @@ def test_non_decoupled_streaming_multi_response(self): url="localhost:8001", verbose=True ) as client: # Establish stream - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16 - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + client.start_stream( + callback=partial(callback, user_data), + stream_timeout=16, + headers=metadata, + ) + else: + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16 + ) # Send a request client.async_stream_infer( model_name=self.repeat_non_decoupled_model_name, diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py index a9a972e02a..3eb0b6ea5f 100755 --- a/qa/L0_http/generate_endpoint_test.py +++ b/qa/L0_http/generate_endpoint_test.py @@ -142,6 +142,21 @@ def test_generate(self): self.assertIn("TEXT", data) self.assertEqual(text, data["TEXT"]) + def test_generate_with_all_inputs(self): + # Setup text-based input + text = "hello world" + inputs = {"PROMPT": text, "STREAM": False, "input_ids": [100, 200]} + + r = self.generate(self._model_name, inputs) + r.raise_for_status() + + self.assertIn("Content-Type", r.headers) + self.assertEqual(r.headers["Content-Type"], "application/json") + + data = r.json() + self.assertIn("TEXT", data) + self.assertEqual(text, data["TEXT"]) + def test_request_id(self): # Setup text based input text = "hello world" @@ -220,18 +235,26 @@ def test_missing_inputs(self): ] for inputs in missing_all_inputs: self.generate_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 0" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 0", ) 
self.generate_stream_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 0" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 0", ) for inputs in missing_one_input: self.generate_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 1" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 1", ) self.generate_stream_expect_failure( - self._model_name, inputs, "expected 2 inputs but got 1" + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 1", ) def test_invalid_input_types(self): diff --git a/qa/L0_http/generate_models/mock_llm/config.pbtxt b/qa/L0_http/generate_models/mock_llm/config.pbtxt index 6871661525..74a306052a 100644 --- a/qa/L0_http/generate_models/mock_llm/config.pbtxt +++ b/qa/L0_http/generate_models/mock_llm/config.pbtxt @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -41,6 +41,12 @@ input [ name: "STREAM" data_type: TYPE_BOOL dims: [ 1, 1 ] + }, + { + name: "input_ids" + data_type: TYPE_INT32 + dims: [ 1, -1 ] + optional: true } ] diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh index 321c398995..572c527ba4 100755 --- a/qa/L0_http/test.sh +++ b/qa/L0_http/test.sh @@ -49,7 +49,7 @@ NGINX_CONF="./nginx.conf" # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then SDKDIR=${SDKDIR:=C:/sdk} MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} @@ -662,7 +662,7 @@ fi ## Python Unit Tests TEST_RESULT_FILE='test_results.txt' PYTHON_TEST=generate_endpoint_test.py -EXPECTED_NUM_TESTS=16 +EXPECTED_NUM_TESTS=17 set +e python $PYTHON_TEST >$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then diff --git a/qa/L0_infer/install_and_test.sh b/qa/L0_infer/install_and_test.sh index 28e5dad52e..4c136cf1dd 100755 --- a/qa/L0_infer/install_and_test.sh +++ b/qa/L0_infer/install_and_test.sh @@ -25,14 +25,24 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Determine the operating system to call the correct package manager. 
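The install script below branches on ID_LIKE from /etc/os-release to pick the package manager; the same decision expressed as a small Python sketch (the file path and the debian/yum split mirror the shell logic):

def pick_package_manager(os_release: str = "/etc/os-release") -> str:
    id_like = ""
    with open(os_release) as f:
        for line in f:
            if line.startswith("ID_LIKE="):
                # Strip quotes and keep the first token, mirroring the grep/awk/tr pipeline.
                tokens = line.partition("=")[2].strip().strip('"').split()
                id_like = tokens[0] if tokens else ""
                break
    return "apt-get" if "debian" in id_like else "yum"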
+ID_LIKE=$(grep -Po '(?<=ID_LIKE=).*' /etc/os-release | awk -F= '{print $1}' | tr -d '"' | awk '{print $1}') + # Note: This script is to be used with customized triton containers that need # dependencies to run L0_infer tests -apt-get update && \ - apt-get install -y --no-install-recommends \ - curl \ - jq \ - python3 \ - python3-pip +if [[ "$ID_LIKE" =~ "debian" ]]; then + apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + jq \ + python3 \ + python3-pip +else + yum install -y \ + jq \ + curl +fi + pip3 install --upgrade pip # install client libraries pip3 install tritonclient[all] diff --git a/qa/L0_infer/test.sh b/qa/L0_infer/test.sh index dba4d7dbcc..36f63053e3 100755 --- a/qa/L0_infer/test.sh +++ b/qa/L0_infer/test.sh @@ -87,7 +87,7 @@ DEFAULT_SHM_SIZE_BYTES=$((1024*1024*$DEFAULT_SHM_SIZE_MB)) # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} diff --git a/qa/L0_input_validation/input_validation_test.py b/qa/L0_input_validation/input_validation_test.py index 33360b7a08..8e7f58bb0c 100755 --- a/qa/L0_input_validation/input_validation_test.py +++ b/qa/L0_input_validation/input_validation_test.py @@ -195,7 +195,7 @@ def get_input_array(input_size, np_dtype): triton_client.infer(model_name=model_name, inputs=inputs) err_str = str(e.exception) self.assertIn( - f"expected {input_size} string elements for inference input 'INPUT1', got {input_size-2}", + f"expected {input_size} string elements for inference input 'INPUT1' for model '{model_name}', got {input_size-2}", err_str, ) @@ -208,7 +208,7 @@ def get_input_array(input_size, np_dtype): triton_client.infer(model_name=model_name, inputs=inputs) err_str = str(e.exception) self.assertIn( - f"expected {input_size} string elements for inference input 'INPUT1', got {input_size+2}", + f"unexpected number of string elements {input_size+1} for inference input 'INPUT1' for model '{model_name}', expecting {input_size}", err_str, ) diff --git a/qa/L0_input_validation/test.sh b/qa/L0_input_validation/test.sh index fc70abd969..22e0560959 100755 --- a/qa/L0_input_validation/test.sh +++ b/qa/L0_input_validation/test.sh @@ -68,7 +68,9 @@ set +e python3 -m pytest --junitxml="input_validation.report.xml" $TEST_PY::InputValTest >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** input_validation_test.py FAILED. \n***" + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** input_validation_test.py::InputValTest FAILED. \n***" RET=1 fi set -e @@ -138,7 +140,9 @@ set +e python3 -m pytest --junitxml="input_shape_validation.report.xml" $TEST_PY::InputShapeTest >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** input_validation_test.py FAILED. \n***" + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** input_validation_test.py::InputShapeTest FAILED. 
\n***" RET=1 fi set -e @@ -147,10 +151,13 @@ kill $SERVER_PID wait $SERVER_PID # input_byte_size_test +cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/{savedmodel_zero_1_float32,savedmodel_zero_1_object} ./models + set +e -LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >>$TEST_LOG 2>&1 +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >> $TEST_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** Query Unit Test Failed\n***" + cat $TEST_LOG + echo -e "\n***\n*** input_byte_size_test FAILED\n***" RET=1 fi set -e @@ -158,8 +165,6 @@ set -e if [ $RET -eq 0 ]; then echo -e "\n***\n*** Input Validation Test Passed\n***" else - cat $CLIENT_LOG - cat $SERVER_LOG echo -e "\n***\n*** Input Validation Test FAILED\n***" fi diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py index a2bfc067bc..49fe684ff1 100755 --- a/qa/L0_lifecycle/lifecycle_test.py +++ b/qa/L0_lifecycle/lifecycle_test.py @@ -3493,6 +3493,133 @@ def test_delete_custom_config(self): except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) + def test_load_new_model_version(self): + model_name = "identity_fp32" + client = self._get_client(use_grpc=True) + + # version 1 and 2 are already loaded + # version 3 is in the model directory but not loaded + # version 4 does not exist anywhere + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertFalse(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 0) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 0) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 1) + + # update version 2 model file + Path(os.path.join("models", model_name, "2", "model.py")).touch() + # add version 4 model file + src_path = os.path.join("models", model_name, "3") + dst_path = os.path.join("models", model_name, "4") + shutil.copytree(src_path, dst_path) + # update model config to load version 1 to 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [1, 2] } }", + "version_policy: { specific: { versions: [1, 2, 3, 4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # version 1 is unmodified so it should not be reloaded + # version 2 is modified so it should be reloaded + # version 3 model file existed but not loaded so it should be loaded + # version 4 is a new version so it should be loaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertTrue(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 1) + 
self.assertEqual(server_log.count("[PB model] Loading version 4"), 1) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 2) + + # simulate a dependency change to all versions + Path(os.path.join("models", model_name, "dummy_dependency.py")).touch() + # reload the model + client.load_model(model_name) + + # all 4 versions should be reloaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertTrue(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 3) + + # update model config to only load version 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [1, 2, 3, 4] } }", + "version_policy: { specific: { versions: [4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # only version 4 should be available and no reloads should happen + self.assertFalse(client.is_model_ready(model_name, "1")) + self.assertFalse(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 4) + + # update model config to load version 1 and 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [4] } }", + "version_policy: { specific: { versions: [1, 4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # version 1 should be loaded and version 4 should not be reloaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertFalse(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 5) + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_lifecycle/test.sh b/qa/L0_lifecycle/test.sh index 
9236fdabfb..4efd244c76 100755 --- a/qa/L0_lifecycle/test.sh +++ b/qa/L0_lifecycle/test.sh @@ -2196,6 +2196,41 @@ set -e kill $SERVER_PID wait $SERVER_PID +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_load_new_model_version +rm -rf models +mkdir models +cp -r ../python_models/identity_fp32 models/ && (cd models/identity_fp32 && \ + echo "version_policy: { specific: { versions: [1, 2] } }" >> config.pbtxt && \ + echo " def initialize(self, args):" >> model.py && \ + echo " pb_utils.Logger.log_info(f'[PB model] Loading version {args[\"model_version\"]}')" >> model.py && \ + mkdir 1 && cp model.py 1 && \ + mkdir 2 && cp model.py 2 && \ + mkdir 3 && mv model.py 3) + +export PYTHONDONTWRITEBYTECODE="True" +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --load-model=*" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python $LC_TEST LifeCycleTest.test_load_new_model_version >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID +unset PYTHONDONTWRITEBYTECODE if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" diff --git a/qa/L0_perf_analyzer_capi/test.sh b/qa/L0_perf_analyzer_capi/test.sh index d031e2cacf..3e3f9e4af6 100755 --- a/qa/L0_perf_analyzer_capi/test.sh +++ b/qa/L0_perf_analyzer_capi/test.sh @@ -56,7 +56,7 @@ SHAPETENSORADTAFILE=`pwd`/../common/perf_analyzer_input_data_json/shape_tensor_d ERROR_STRING="error | Request count: 0 | : 0 infer/sec" -STABILITY_THRESHOLD="15" +STABILITY_THRESHOLD="9999" source ../common/util.sh diff --git a/qa/L0_perf_analyzer_doc_links/test.sh b/qa/L0_perf_analyzer_doc_links/test.sh index db80e84974..d0757bca9e 100755 --- a/qa/L0_perf_analyzer_doc_links/test.sh +++ b/qa/L0_perf_analyzer_doc_links/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -35,10 +35,10 @@ python3 -m pip install mkdocs-htmlproofer-plugin==0.10.3 #Download perf_analyzer docs TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} -TRITON_CLIENT_REPO_TAG="${TRITON_CLIENT_REPO_TAG:=main}" -git clone -b ${TRITON_CLIENT_REPO_TAG} ${TRITON_REPO_ORGANIZATION}/client.git -cp `pwd`/client/src/c++/perf_analyzer/README.md . -cp -rf `pwd`/client/src/c++/perf_analyzer/docs . +TRITON_PERF_ANALYZER_REPO_TAG="${TRITON_PERF_ANALYZER_REPO_TAG:=main}" +git clone -b ${TRITON_PERF_ANALYZER_REPO_TAG} ${TRITON_REPO_ORGANIZATION}/perf_analyzer.git +cp `pwd`/perf_analyzer/README.md . +cp -rf `pwd`/perf_analyzer/docs . # Need to remove all links that start with -- or -. Mkdocs converts all -- to - for anchor links. # This breaks all links to cli commands throughout the docs. 
This will iterate over all diff --git a/qa/L0_perf_tensorrt_llm/test.sh b/qa/L0_perf_tensorrt_llm/test.sh index 35d360498d..e74b01e568 100755 --- a/qa/L0_perf_tensorrt_llm/test.sh +++ b/qa/L0_perf_tensorrt_llm/test.sh @@ -34,7 +34,7 @@ TRT_ROOT="/usr/local/tensorrt" MODEL_NAME="gpt2_tensorrt_llm" NAME="tensorrt_llm_benchmarking_test" MODEL_REPOSITORY="$(pwd)/triton_model_repo" -TENSORRTLLM_BACKEND_DIR="/opt/tritonserver/tensorrtllm_backend" +TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend" GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt" TOKENIZER_DIR="$GPT_DIR/gpt2" ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu" @@ -47,40 +47,27 @@ SERVER_TIMEOUT=${SERVER_TIMEOUT:=120} function clone_tensorrt_llm_backend_repo { rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR apt-get update && apt-get install git-lfs -y --no-install-recommends - git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} https://github.com/triton-inference-server/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR + git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive } # Update Open MPI to a version compatible with SLURM. function upgrade_openmpi { - cd /tmp/ local CURRENT_VERSION=$(mpirun --version 2>&1 | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}') if [ -n "$CURRENT_VERSION" ] && dpkg --compare-versions "$CURRENT_VERSION" lt "5.0.1"; then # Uninstall the current version of Open MPI - wget "https://download.open-mpi.org/release/open-mpi/v$(echo "${CURRENT_VERSION}" | awk -F. '{print $1"."$2}')/openmpi-${CURRENT_VERSION}.tar.gz" || { - echo "Failed to download Open MPI ${CURRENT_VERSION}" - exit 1 - } - rm -rf "openmpi-${CURRENT_VERSION}" && tar -xzf "openmpi-${CURRENT_VERSION}.tar.gz" && cd "openmpi-${CURRENT_VERSION}" || { - echo "Failed to extract Open MPI ${CURRENT_VERSION}" - exit 1 - } - unset PMIX_VERSION && ./configure --prefix=/opt/hpcx/ompi/ && make uninstall || { - echo "Failed to uninstall Open MPI ${CURRENT_VERSION}" - exit 1 - } - rm -rf /opt/hpcx/ompi/ /usr/local/mpi/ || { - echo "Failed to remove Open MPI ${CURRENT_VERSION} installation directories" + rm -r /opt/hpcx/ompi/ /usr/local/mpi && rm -rf /usr/lib/$(gcc -print-multiarch)/openmpi || { + echo "Failed to uninstall the existing Open MPI version $CURRENT_VERSION." exit 1 } - cd ../ && rm -r openmpi-${CURRENT_VERSION} else - echo "Installed Open MPI version is not less than 5.0.1. Skipping the upgrade." + echo "The installed Open MPI version ($CURRENT_VERSION) is 5.0.1 or higher. Skipping the upgrade." 
return fi # Install SLURM supported Open MPI version + cd /tmp/ wget "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.1.tar.gz" || { echo "Failed to download Open MPI 5.0.1" exit 1 @@ -108,18 +95,6 @@ function upgrade_openmpi { mpirun --version } -function install_tensorrt_llm { - # Install CMake - bash ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/docker/common/install_cmake.sh - export PATH="/usr/local/cmake/bin:${PATH}" - - TORCH_INSTALL_TYPE="pypi" && - (cd ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm && - bash docker/common/install_pytorch.sh $TORCH_INSTALL_TYPE && - python3 ./scripts/build_wheel.py --trt_root=/usr/local/tensorrt && - pip3 install ./build/tensorrt_llm*.whl) -} - function build_gpt2_base_model { # Download weights from HuggingFace Transformers cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2 @@ -131,24 +106,21 @@ function build_gpt2_base_model { cd ${GPT_DIR} # Convert weights from HF Tranformers to FT format - python3 hf_gpt_convert.py -p 1 -i gpt2 -o ./c-model/gpt2 --tensor-parallelism ${NUM_GPUS} --storage-type float16 + python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" cd ${BASE_DIR} } function build_gpt2_tensorrt_engine { # Build TensorRT engines cd ${GPT_DIR} - python3 build.py --model_dir="./c-model/gpt2/${NUM_GPUS}-gpu/" \ - --world_size="${NUM_GPUS}" \ - --dtype float16 \ - --use_inflight_batching \ - --use_gpt_attention_plugin float16 \ - --paged_kv_cache \ - --use_gemm_plugin float16 \ - --remove_input_padding \ - --hidden_act gelu \ - --parallel_build \ - --output_dir="${ENGINES_DIR}" + trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \ + --gpt_attention_plugin float16 \ + --remove_input_padding enable \ + --paged_kv_cache enable \ + --gemm_plugin float16 \ + --workers "${NUM_GPUS}" \ + --output_dir "${ENGINES_DIR}" + cd ${BASE_DIR} } @@ -172,18 +144,18 @@ function prepare_model_repository { replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" - replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" - replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${max_queue_size}' "0" 
"${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" } # Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on @@ -244,13 +216,12 @@ function kill_server { upgrade_openmpi clone_tensorrt_llm_backend_repo -install_tensorrt_llm build_gpt2_base_model build_gpt2_tensorrt_engine prepare_model_repository # Install perf_analyzer -pip3 install tritonclient nvidia-ml-py3 +pip3 install tritonclient ARCH="amd64" STATIC_BATCH=1 diff --git a/qa/L0_perf_vllm/test.sh b/qa/L0_perf_vllm/test.sh index 498f6f8e14..e1ce8cf2ed 100755 --- a/qa/L0_perf_vllm/test.sh +++ b/qa/L0_perf_vllm/test.sh @@ -41,7 +41,7 @@ SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR} export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:=0} EXPORT_FILE=profile-export-vllm-model.json -pip3 install tritonclient nvidia-ml-py3 +pip3 install tritonclient rm -rf $MODEL_REPO $EXPORT_FILE *.tjson *.json *.csv mkdir -p $MODEL_REPO/$MODEL_NAME/1 diff --git a/qa/L0_python_api/test.sh b/qa/L0_python_api/test.sh index 6dc7206fe3..0d87d16771 100755 --- a/qa/L0_python_api/test.sh +++ b/qa/L0_python_api/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -49,6 +49,15 @@ if [ $? -ne 0 ]; then RET=1 fi + +FRONTEND_TEST_LOG="./python_kserve.log" +python -m pytest --junitxml=test_kserve.xml test_kserve.py > $FRONTEND_TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $FRONTEND_TEST_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + set -e if [ $RET -eq 0 ]; then diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py new file mode 100644 index 0000000000..9e8b82eb43 --- /dev/null +++ b/qa/L0_python_api/test_kserve.py @@ -0,0 +1,298 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import time +from functools import partial + +import numpy as np +import pytest +import testing_utils as utils +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +import tritonserver +from tritonclient.utils import InferenceServerException +from tritonfrontend import KServeGrpc, KServeHttp + + +class TestHttpOptions: + def test_correct_http_parameters(self): + KServeHttp.Options( + address="0.0.0.1", port=8080, reuse_port=True, thread_count=16 + ) + + def test_wrong_http_parameters(self): + # Out of range + with pytest.raises(Exception): + KServeHttp.Options(port=-15) + with pytest.raises(Exception): + KServeHttp.Options(thread_count=-5) + + # Wrong data type + with pytest.raises(Exception): + KServeHttp.Options(header_forward_pattern=10) + + +class TestGrpcOptions: + def test_correct_grpc_parameters(self): + KServeGrpc.Options( + infer_compression_level=KServeGrpc.Grpc_compression_level.HIGH, + reuse_port=True, + infer_allocation_pool_size=12, + http2_max_pings_without_data=10, + ) + + def test_wrong_grpc_parameters(self): + # Out of Range + with pytest.raises(Exception): + KServeGrpc.Options(port=-5) + with pytest.raises(Exception): + KServeGrpc.Options(keepalive_timeout_ms=-20_000) + + # Wrong data type + with pytest.raises(Exception): + KServeGrpc.Options(infer_allocation_pool_size="big pool") + with pytest.raises(Exception): + KServeGrpc.Options(server_key=10) + + +HTTP_ARGS = (KServeHttp, httpclient, "localhost:8000") # Default HTTP args +GRPC_ARGS = (KServeGrpc, grpcclient, "localhost:8001") # Default GRPC args + + +class TestKServe: + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) + def test_server_ready(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + client = utils.setup_client(client_type, url=url) + + assert client.is_server_ready() + + utils.teardown_client(client) + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_service_double_start(self, frontend): + server = utils.setup_server() + # setup_service() performs service.start() + service = utils.setup_service(server, frontend) + + with pytest.raises( + tritonserver.AlreadyExistsError, match="server is already running." + ): + service.start() + + utils.teardown_server(server) + utils.teardown_service(service) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_invalid_options(self, frontend): + server = utils.setup_server() + # Current flow is KServeHttp.Options or KServeGrpc.Options have to be + # provided to ensure type and range validation occurs. + with pytest.raises( + tritonserver.InvalidArgumentError, + match="Incorrect type for options. 
options argument must be of type", + ): + frontend(server, {"port": 8001}) + + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_server_service_order(self, frontend): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + utils.teardown_server(server) + utils.teardown_service(service) + + @pytest.mark.parametrize("frontend, client_type", [HTTP_ARGS[:2], GRPC_ARGS[:2]]) + def test_service_custom_port(self, frontend, client_type): + server = utils.setup_server() + options = frontend.Options(port=8005) + service = utils.setup_service(server, frontend, options) + client = utils.setup_client(client_type, url="localhost:8005") + + # Confirms that service starts at port 8005 + client.is_server_ready() + + utils.teardown_client(client) + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) + def test_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + # TODO: use common/test_infer + assert utils.send_and_test_inference_identity(client_type, url=url) + + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) + def test_streaming_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + assert utils.send_and_test_stream_inference(client_type, url) + + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) + def test_http_generate_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + assert utils.send_and_test_generate_inference() + + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) + def test_http_req_during_shutdown(self, frontend, client_type, url): + server = utils.setup_server() + http_service = utils.setup_service(server, frontend) + http_client = httpclient.InferenceServerClient(url="localhost:8000") + model_name = "delayed_identity" + delay = 2 # seconds + input_data0 = np.array([[delay]], dtype=np.float32) + + input0 = httpclient.InferInput("INPUT0", input_data0.shape, "FP32") + input0.set_data_from_numpy(input_data0) + + inputs = [input0] + outputs = [httpclient.InferRequestedOutput("OUTPUT0")] + + async_request = http_client.async_infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + # http_service.stop() does not use graceful shutdown + utils.teardown_service(http_service) + + # So, inference request will fail as http endpoints have been stopped. + with pytest.raises( + InferenceServerException, match="failed to obtain inference response" + ): + async_request.get_result(block=True, timeout=delay) + + # http_client.close() calls join() to terminate pool of greenlets + # However, due to an unsuccessful get_result(), async_request is still + # an active thread. Hence, join stalls until greenlet timeouts. + # Does not throw an exception, but displays error in logs. + utils.teardown_client(http_client) + + # delayed_identity will still be an active model + # Hence, server.stop() causes InternalError: Timeout. + with pytest.raises( + tritonserver.InternalError, + match="Exit timeout expired. 
Exiting immediately.", + ): + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) + def test_grpc_req_during_shutdown(self, frontend, client_type, url): + server = utils.setup_server() + grpc_service = utils.setup_service(server, frontend) + grpc_client = grpcclient.InferenceServerClient(url=url) + user_data = [] + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + model_name = "delayed_identity" + delay = 2 # seconds + + input_data0 = np.array([[delay]], dtype=np.float32) + input0 = client_type.InferInput("INPUT0", input_data0.shape, "FP32") + input0.set_data_from_numpy(input_data0) + + inputs = [input0] + outputs = [client_type.InferRequestedOutput("OUTPUT0")] + + grpc_client.async_infer( + model_name=model_name, + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + utils.teardown_service(grpc_service) + + time_out = delay + 1 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + + # Depending on when gRPC frontend shut down StatusCode can vary + acceptable_failure_msgs = [ + "[StatusCode.CANCELLED] CANCELLED", + "[StatusCode.UNAVAILABLE] failed to connect to all addresses", + ] + + assert ( + len(user_data) == 1 + and isinstance(user_data[0], InferenceServerException) + and any( + failure_msg in str(user_data[0]) + for failure_msg in acceptable_failure_msgs + ) + ) + + utils.teardown_client(grpc_client) + utils.teardown_server(server) + + # KNOWN ISSUE: CAUSES SEGFAULT + # Created [DLIS-7231] to address at future date + # Once the server has been stopped, the underlying TRITONSERVER_Server instance + # is deleted. However, the frontend does not know the server instance + # is no longer valid. + # def test_inference_after_server_stop(self): + # server = utils.setup_server() + # http_service = utils.setup_service(server, KServeHttp) + # http_client = setup_client(httpclient, url="localhost:8000") + + # teardown_server(server) # Server has been stopped + + # model_name = "identity" + # input_data = np.array([["testing"]], dtype=object) + # # Create input and output objects + # inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + # outputs = [httpclient.InferRequestedOutput("OUTPUT0")] + + # # Set the data for the input tensor + # inputs[0].set_data_from_numpy(input_data) + + # results = http_client.infer(model_name, inputs=inputs, outputs=outputs) + + # utils.teardown_client(http_client) + # utils.teardown_service(http_service) diff --git a/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py b/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py new file mode 100644 index 0000000000..b6095cec8f --- /dev/null +++ b/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py @@ -0,0 +1,51 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + Mock Model that uses the input data to determine how long to wait + before returning identity data + """ + assert len(requests) == 1 + delay = 0 + request = requests[0] + responses = [] + + delay_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + delay_as_numpy = delay_tensor.as_numpy() + delay = float(delay_as_numpy[0][0]) + + out_tensor = pb_utils.Tensor("OUTPUT0", delay_as_numpy) + responses.append(pb_utils.InferenceResponse([out_tensor])) + + time.sleep(delay) + return responses diff --git a/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt b/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt new file mode 100644 index 0000000000..9ac8f1aaff --- /dev/null +++ b/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
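[Editor's note] The delayed_identity model added above interprets its single FP32 input as a delay in seconds, sleeps for that long, then echoes the value back; its config follows below. A hedged client-side sketch of that behavior, assuming the KServeHttp frontend from these tests is serving on the default port:

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

delay = 2.0  # seconds the model sleeps before answering
input0 = httpclient.InferInput("INPUT0", [1, 1], "FP32")
input0.set_data_from_numpy(np.array([[delay]], dtype=np.float32))

# Blocks for roughly `delay` seconds, then returns the same value as OUTPUT0.
result = client.infer(
    model_name="delayed_identity",
    inputs=[input0],
    outputs=[httpclient.InferRequestedOutput("OUTPUT0")],
)
assert result.as_numpy("OUTPUT0")[0][0] == np.float32(delay)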
+ +name: "delayed_identity" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] \ No newline at end of file diff --git a/qa/L0_python_api/test_model_repository/identity/1/model.py b/qa/L0_python_api/test_model_repository/identity/1/model.py new file mode 100644 index 0000000000..629b6469c9 --- /dev/null +++ b/qa/L0_python_api/test_model_repository/identity/1/model.py @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model loops through different dtypes to make sure that + serialize_byte_tensor works correctly in the Python backend. + """ + + def initialize(self, args): + self._index = 0 + self._dtypes = [np.bytes_, np.object_] + + def execute(self, requests): + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", in_0.as_numpy().astype(self._dtypes[self._index]) + ) + self._index += 1 + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + return responses diff --git a/qa/L0_python_api/test_model_repository/identity/config.pbtxt b/qa/L0_python_api/test_model_repository/identity/config.pbtxt new file mode 100644 index 0000000000..3f22e14468 --- /dev/null +++ b/qa/L0_python_api/test_model_repository/identity/config.pbtxt @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity" +backend: "python" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] \ No newline at end of file diff --git a/qa/L0_python_api/testing_utils.py b/qa/L0_python_api/testing_utils.py new file mode 100644 index 0000000000..48cb3ccc37 --- /dev/null +++ b/qa/L0_python_api/testing_utils.py @@ -0,0 +1,153 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
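[Editor's note] testing_utils.py, whose body follows, centralizes server, frontend, and client setup so each test stays short. A sketch of how a test composes these helpers, mirroring TestKServe.test_inference with the gRPC arguments (default gRPC port assumed):

import testing_utils as utils
import tritonclient.grpc as grpcclient
from tritonfrontend import KServeGrpc

server = utils.setup_server()                      # started tritonserver.Server
service = utils.setup_service(server, KServeGrpc)  # gRPC frontend on the default port
try:
    assert utils.send_and_test_inference_identity(grpcclient, url="localhost:8001")
finally:
    utils.teardown_service(service)
    utils.teardown_server(server)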
+ +import os +import queue +from functools import partial +from typing import Union + +import numpy as np +import requests +import tritonserver +from tritonclient.utils import InferenceServerException +from tritonfrontend import KServeGrpc, KServeHttp + +# TODO: Re-Format documentation to fit: +# https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings + + +def setup_server(model_repository="test_model_repository") -> tritonserver.Server: + module_directory = os.path.split(os.path.abspath(__file__))[0] + model_path = os.path.abspath(os.path.join(module_directory, model_repository)) + + # Starting Server Instance + server_options = tritonserver.Options( + server_id="TestServer", + model_repository=model_path, + log_error=True, + log_warn=True, + log_info=True, + ) + + return tritonserver.Server(server_options).start(wait_until_ready=True) + + +def teardown_server(server: tritonserver.Server) -> None: + server.stop() + + +def setup_service( + server: tritonserver.Server, + frontend: Union[KServeHttp, KServeGrpc], + options=None, +) -> Union[KServeHttp, KServeGrpc]: + service = frontend(server=server, options=options) + service.start() + return service + + +def teardown_service(service: Union[KServeHttp, KServeGrpc]) -> None: + service.stop() + + +def setup_client(frontend_client, url: str): + return frontend_client.InferenceServerClient(url=url) + + +def teardown_client(client) -> None: + client.close() + + +# Sends an inference to test_model_repository/identity model and verifies input == output. +def send_and_test_inference_identity(frontend_client, url: str) -> bool: + model_name = "identity" + client = setup_client(frontend_client, url) + input_data = np.array(["testing"], dtype=object) + + # Create input and output objects + inputs = [frontend_client.InferInput("INPUT0", input_data.shape, "BYTES")] + outputs = [frontend_client.InferRequestedOutput("OUTPUT0")] + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + + # Perform inference request + results = client.infer(model_name=model_name, inputs=inputs, outputs=outputs) + + output_data = results.as_numpy("OUTPUT0") # Gather output data + + teardown_client(client) + return input_data[0] == output_data[0].decode() + + +# Sends multiple streaming requests to "delayed_identity" model with negligible delays, +# and verifies the inputs matches outputs and the ordering is preserved. 
+def send_and_test_stream_inference(frontend_client, url: str) -> bool: + num_requests = 100 + requests = [] + for i in range(num_requests): + input0_np = np.array([[float(i) / 1000]], dtype=np.float32) + inputs = [frontend_client.InferInput("INPUT0", input0_np.shape, "FP32")] + inputs[0].set_data_from_numpy(input0_np) + requests.append(inputs) + + responses = [] + + def callback(responses, result, error): + responses.append({"result": result, "error": error}) + + client = frontend_client.InferenceServerClient(url=url) + client.start_stream(partial(callback, responses)) + for inputs in requests: + client.async_stream_infer("delayed_identity", inputs) + client.stop_stream() + teardown_client(client) + + assert len(responses) == num_requests + for i in range(len(responses)): + assert responses[i]["error"] is None + output0_np = responses[i]["result"].as_numpy(name="OUTPUT0") + assert np.allclose(output0_np, [[float(i) / 1000]]) + + return True # test passed + + +def send_and_test_generate_inference() -> bool: + model_name = "identity" + url = f"http://localhost:8000/v2/models/{model_name}/generate" + input_text = "testing" + data = { + "INPUT0": input_text, + } + + response = requests.post(url, json=data, stream=True) + if response.status_code == 200: + result = response.json() + output_text = result.get("OUTPUT0", "") + + if output_text == input_text: + return True + + return False diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh index d91b433966..ac34458b4e 100755 --- a/qa/L0_sequence_batcher/test.sh +++ b/qa/L0_sequence_batcher/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -93,7 +93,7 @@ TF_VERSION=${TF_VERSION:=2} # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. WINDOWS=0 -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then MODELDIR=${MODELDIR:=C:/models} DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} @@ -183,6 +183,16 @@ export USE_SINGLE_BUFFER # models4 - four instances with batch-size 1 rm -fr *.log models{0,1,2,4} queue_delay_models && mkdir models{0,1,2,4} queue_delay_models +# Search BACKENDS to determine if a backend should be tested +function should_test_backend() { + local target_backend=$1 + if [[ $(echo "${BACKENDS[@]}" | grep -c "${target_backend}") -ne 0 ]]; then + echo "true" + return + fi + echo "false" +} + # Get the datatype to use based on the backend function get_datatype () { local dtype="int32 bool" @@ -827,8 +837,13 @@ fi ### Start Preserve Ordering Tests ### -# Test only supported on windows currently due to use of python backend models -if [ ${WINDOWS} -ne 1 ]; then +# FIXME: Test only supported on windows currently due to use of python backend models. +# Now that Windows supports the PYBE, we should check that this tests works once Windows +# CI is stable. + +# These subtests use python models. They should not be executed if 'python' is not one +# of the backends under test. 
+if [[ $(should_test_backend "python") == "true" && !( -v WSL_DISTRO_NAME || -v MSYSTEM )]]; then # Test preserve ordering true/false and decoupled/non-decoupled TEST_CASE=SequenceBatcherPreserveOrderingTest MODEL_PATH=preserve_ordering_models diff --git a/qa/L0_shared_memory/shared_memory_test.py b/qa/L0_shared_memory/shared_memory_test.py index c38ecb4814..871fca9b2a 100755 --- a/qa/L0_shared_memory/shared_memory_test.py +++ b/qa/L0_shared_memory/shared_memory_test.py @@ -31,7 +31,9 @@ sys.path.append("../common") import os +import time import unittest +from functools import partial import infer_util as iu import numpy as np @@ -43,7 +45,7 @@ from tritonclient import utils -class SharedMemoryTest(tu.TestResultCollector): +class SystemSharedMemoryTestBase(tu.TestResultCollector): DEFAULT_SHM_BYTE_SIZE = 64 def setUp(self): @@ -62,6 +64,68 @@ def _setup_client(self): self.url, verbose=True ) + def _configure_server( + self, + create_byte_size=DEFAULT_SHM_BYTE_SIZE, + register_byte_size=DEFAULT_SHM_BYTE_SIZE, + register_offset=0, + ): + """Creates and registers shared memory regions for testing. + + Parameters + ---------- + create_byte_size: int + Size of each system shared memory region to create. + NOTE: This should be sufficiently large to hold the inputs/outputs + stored in shared memory. + + register_byte_size: int + Size of each system shared memory region to register with server. + NOTE: The (offset + register_byte_size) should be less than or equal + to the create_byte_size. Otherwise an exception will be raised for + an invalid set of registration args. + + register_offset: int + Offset into the shared memory object to start the registered region. + + """ + shm_ip0_handle = shm.create_shared_memory_region( + "input0_data", "/input0_data", create_byte_size + ) + shm_ip1_handle = shm.create_shared_memory_region( + "input1_data", "/input1_data", create_byte_size + ) + shm_op0_handle = shm.create_shared_memory_region( + "output0_data", "/output0_data", create_byte_size + ) + shm_op1_handle = shm.create_shared_memory_region( + "output1_data", "/output1_data", create_byte_size + ) + # Implicit assumption that input and output byte_sizes are 64 bytes for now + input0_data = np.arange(start=0, stop=16, dtype=np.int32) + input1_data = np.ones(shape=16, dtype=np.int32) + shm.set_shared_memory_region(shm_ip0_handle, [input0_data]) + shm.set_shared_memory_region(shm_ip1_handle, [input1_data]) + self.triton_client.register_system_shared_memory( + "input0_data", "/input0_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "input1_data", "/input1_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "output0_data", "/output0_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "output1_data", "/output1_data", register_byte_size, offset=register_offset + ) + return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] + + def _cleanup_server(self, shm_handles): + for shm_handle in shm_handles: + shm.destroy_shared_memory_region(shm_handle) + + +class SharedMemoryTest(SystemSharedMemoryTestBase): def test_invalid_create_shm(self): # Raises error since tried to create invalid system shared memory region try: @@ -128,66 +192,6 @@ def test_reregister_after_register(self): self.assertTrue(len(shm_status.regions) == 1) shm.destroy_shared_memory_region(shm_op0_handle) - def _configure_server( - self, - 
create_byte_size=DEFAULT_SHM_BYTE_SIZE, - register_byte_size=DEFAULT_SHM_BYTE_SIZE, - register_offset=0, - ): - """Creates and registers shared memory regions for testing. - - Parameters - ---------- - create_byte_size: int - Size of each system shared memory region to create. - NOTE: This should be sufficiently large to hold the inputs/outputs - stored in shared memory. - - register_byte_size: int - Size of each system shared memory region to register with server. - NOTE: The (offset + register_byte_size) should be less than or equal - to the create_byte_size. Otherwise an exception will be raised for - an invalid set of registration args. - - register_offset: int - Offset into the shared memory object to start the registered region. - - """ - shm_ip0_handle = shm.create_shared_memory_region( - "input0_data", "/input0_data", create_byte_size - ) - shm_ip1_handle = shm.create_shared_memory_region( - "input1_data", "/input1_data", create_byte_size - ) - shm_op0_handle = shm.create_shared_memory_region( - "output0_data", "/output0_data", create_byte_size - ) - shm_op1_handle = shm.create_shared_memory_region( - "output1_data", "/output1_data", create_byte_size - ) - # Implicit assumption that input and output byte_sizes are 64 bytes for now - input0_data = np.arange(start=0, stop=16, dtype=np.int32) - input1_data = np.ones(shape=16, dtype=np.int32) - shm.set_shared_memory_region(shm_ip0_handle, [input0_data]) - shm.set_shared_memory_region(shm_ip1_handle, [input1_data]) - self.triton_client.register_system_shared_memory( - "input0_data", "/input0_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "input1_data", "/input1_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "output0_data", "/output0_data", register_byte_size, offset=register_offset - ) - self.triton_client.register_system_shared_memory( - "output1_data", "/output1_data", register_byte_size, offset=register_offset - ) - return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] - - def _cleanup_server(self, shm_handles): - for shm_handle in shm_handles: - shm.destroy_shared_memory_region(shm_handle) - def test_unregister_after_inference(self): # Unregister after inference error_msg = [] @@ -443,5 +447,169 @@ def test_python_client_leak(self): ) +class TestSharedMemoryUnregister(SystemSharedMemoryTestBase): + def _test_unregister_shm_fail(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory() + self.assertIn( + "Failed to unregister the following system shared memory regions: input0_data ,input1_data ,output0_data ,output1_data", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("input0_data") + self.assertIn( + "Cannot unregister shared memory region 'input0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("input1_data") + self.assertIn( + "Cannot unregister shared memory region 'input1_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("output0_data") + self.assertIn( + "Cannot unregister shared memory region 'output0_data', it 
is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("output1_data") + self.assertIn( + "Cannot unregister shared memory region 'output1_data', it is currently in use.", + str(ex.exception), + ) + + def _test_shm_not_found(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("input0_data") + self.assertIn( + "Unable to find system shared memory region: 'input0_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("input1_data") + self.assertIn( + "Unable to find system shared memory region: 'input1_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("output0_data") + self.assertIn( + "Unable to find system shared memory region: 'output0_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("output1_data") + self.assertIn( + "Unable to find system shared memory region: 'output1_data'", + str(ex.exception), + ) + + def test_unregister_shm_during_inference_http(self): + try: + self.triton_client.unregister_system_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + httpclient.InferInput("INPUT0", [1, 16], "INT32"), + httpclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0", binary_data=True), + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + async_request = self.triton_client.async_infer( + model_name="simple", inputs=inputs, outputs=outputs + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Blocking call + async_request.get_result() + + # Try unregister shm regions after inference + self.triton_client.unregister_system_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + def test_unregister_shm_during_inference_grpc(self): + try: + self.triton_client.unregister_system_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + grpcclient.InferRequestedOutput("OUTPUT0"), + grpcclient.InferRequestedOutput("OUTPUT1"), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + user_data = [] + + self.triton_client.async_infer( + model_name="simple", + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + # Ensure 
inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Wait until the results are available in user_data + time_out = 20 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + time.sleep(2) + + # Try unregister shm regions after inference + self.triton_client.unregister_system_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_shared_memory/test.sh b/qa/L0_shared_memory/test.sh index ba6a2fa8f2..e711de9cff 100755 --- a/qa/L0_shared_memory/test.sh +++ b/qa/L0_shared_memory/test.sh @@ -95,6 +95,46 @@ for i in \ done done +mkdir -p python_models/simple/1/ +cp ../python_models/execute_delayed_model/model.py ./python_models/simple/1/ +cp ../python_models/execute_delayed_model/config.pbtxt ./python_models/simple/ + +for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/python_models --log-verbose=1 ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./unregister_shm.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + CLIENT_LOG="./unregister_shm.$client_type.client.log" + set +e + python3 $SHM_TEST TestSharedMemoryUnregister.test_unregister_shm_during_inference_$client_type >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + kill $SERVER_PID + wait $SERVER_PID + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Server shut down non-gracefully\n***" + RET=1 + fi + set -e + done + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else diff --git a/qa/L0_trt_plugin/test.sh b/qa/L0_trt_plugin/test.sh index 075dd54eab..a9d04331f0 100755 --- a/qa/L0_trt_plugin/test.sh +++ b/qa/L0_trt_plugin/test.sh @@ -47,7 +47,7 @@ PLUGIN_TEST=trt_plugin_test.py # On windows the paths invoked by the script (running in WSL) must use # /mnt/c when needed but the paths on the tritonserver command-line # must be C:/ style. -if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} MODELDIR=${MODELDIR:=C:/models} CUSTOMPLUGIN=${CUSTOMPLUGIN:=$MODELDIR/HardmaxPlugin.dll} @@ -135,7 +135,7 @@ SERVER_LD_PRELOAD=$CUSTOMPLUGIN SERVER_ARGS=$SERVER_ARGS_BASE SERVER_LOG="./inference_server_$LOG_IDX.log" -if [[ "$(< /proc/sys/kernel/osrelease)" != *microsoft* ]]; then +if [[ ! -v WSL_DISTRO_NAME ]] || [[ ! 
-v MSYSTEM ]]; then run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" diff --git a/qa/L0_trt_shape_tensors/test.sh b/qa/L0_trt_shape_tensors/test.sh index f08ed339b0..548ebb55af 100755 --- a/qa/L0_trt_shape_tensors/test.sh +++ b/qa/L0_trt_shape_tensors/test.sh @@ -45,7 +45,7 @@ CLIENT_LOG="./client.log" SHAPE_TENSOR_TEST=trt_shape_tensor_test.py SERVER=/opt/tritonserver/bin/tritonserver -SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" SERVER_LOG="./inference_server.log" source ../common/util.sh diff --git a/qa/L0_warmup/test.sh b/qa/L0_warmup/test.sh index aeed873b25..a535aed25b 100755 --- a/qa/L0_warmup/test.sh +++ b/qa/L0_warmup/test.sh @@ -42,7 +42,7 @@ export CUDA_VISIBLE_DEVICES=0 CLIENT=../clients/image_client CLIENT_LOG="./client.log" -CLIENT_PY=./python_unittest.py +CLIENT_PY=./test_infer_shm_leak.py EXPECTED_NUM_TESTS="1" TEST_RESULT_FILE='test_results.txt' @@ -449,8 +449,8 @@ mkdir -p models/bls_onnx_warmup/1/ cp ../python_models/bls_onnx_warmup/model.py models/bls_onnx_warmup/1/ cp ../python_models/bls_onnx_warmup/config.pbtxt models/bls_onnx_warmup/. -cp ../L0_backend_python/python_unittest.py . -sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py +cp ../L0_backend_python/test_infer_shm_leak.py . +sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py run_server if [ "$SERVER_PID" == "0" ]; then diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 99a6175a08..21e9fe53ff 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -34,7 +34,7 @@ # Make all generated files accessible outside of container umask 0000 # Set the version of the models -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.09} # Set the CUDA device to use CUDA_DEVICE=${RUNNER_ID:=0} # Set TensorRT image @@ -48,9 +48,7 @@ HOST_MODEL_DIR=${HOST_MODEL_DIR:="${HOST_BUILD_DIR}/${TRITON_VERSION}"} HOST_SOURCE_DIR=$HOST_BUILD_DIR/gen_srcdir # Set CI specific parameters -DOCKER_GPU_ARGS="${DOCKER_GPU_ARGS:="--gpus device=$CUDA_DEVICE"}" -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) - +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} # Set model output directories diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops index 4ae0f006b3..286052914b 100755 --- a/qa/common/gen_qa_custom_ops +++ b/qa/common/gen_qa_custom_ops @@ -37,14 +37,14 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.09} NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION} TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$NVIDIA_UPSTREAM_VERSION-tf2-py3} PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3} CUDA_DEVICE=${NV_GPU:=0} -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) || DOCKER_GPU_ARGS="--gpus device=$CUDA_DEVICE" +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} ### HOST_BUILD_DIR=${HOST_BUILD_DIR:=/tmp} diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index cab497aa86..f26ba863ce 100755 --- a/qa/common/gen_qa_model_repository +++ 
b/qa/common/gen_qa_model_repository @@ -48,7 +48,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.07} +TRITON_VERSION=${TRITON_VERSION:=24.09} # ONNX. Use ONNX_OPSET 0 to use the default for ONNX version ONNX_VERSION=1.13.0 @@ -63,7 +63,8 @@ TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$TRITON_VERSION-t TENSORRT_IMAGE=${TENSORRT_IMAGE:=nvcr.io/nvidia/tensorrt:$TRITON_VERSION-py3} CUDA_DEVICE=${NV_GPU:=0} -[[ $RUNNER_GPUS =~ ^[0-9] ]] && DOCKER_GPU_ARGS=$(eval $NV_DOCKER_ARGS) || DOCKER_GPU_ARGS="--gpus device=$CUDA_DEVICE" +DOCKER_GPU_ARGS=${DOCKER_GPU_ARGS:-$([[ $RUNNER_GPUS =~ ^[0-9] ]] && eval $NV_DOCKER_ARGS || echo "--gpus device=$CUDA_DEVICE" )} +MODEL_TYPE=${MODEL_TYPE:-""} ### HOST_BUILD_DIR=${HOST_BUILD_DIR:=/tmp} @@ -360,8 +361,10 @@ python3 $SRCDIR/gen_qa_implicit_models.py --libtorch --variable --models_dir=$VA chmod -R 777 $VARIMPLICITSEQDESTDIR python3 $SRCDIR/gen_qa_dyna_sequence_models.py --libtorch --models_dir=$DYNASEQDESTDIR chmod -R 777 $DYNASEQDESTDIR -python3 $SRCDIR/gen_qa_torchtrt_models.py --models_dir=$TORCHTRTDESTDIR -chmod -R 777 $TORCHTRTDESTDIR +if [ -z "$MODEL_TYPE" ] || [ "$MODEL_TYPE" != "igpu" ]; then + python3 $SRCDIR/gen_qa_torchtrt_models.py --models_dir=$TORCHTRTDESTDIR + chmod -R 777 $TORCHTRTDESTDIR +fi python3 $SRCDIR/gen_qa_ragged_models.py --libtorch --models_dir=$RAGGEDDESTDIR chmod -R 777 $RAGGEDDESTDIR # Export torchvision image models to ONNX diff --git a/qa/common/shm_util.py b/qa/common/shm_util.py index 16e5ce4e45..0e533bcdbb 100755 --- a/qa/common/shm_util.py +++ b/qa/common/shm_util.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -441,6 +441,9 @@ def __exit__(self, type, value, traceback): print( f"Shared memory leak detected [{shm_region}]: {curr_shm_free_size} (curr free) < {prev_shm_free_size} (prev free)." ) + # FIXME DLIS-7122: Known shared memory leak of 480 bytes in BLS test. + if curr_shm_free_size == 1006576 and prev_shm_free_size == 1007056: + assert False, f"Known shared memory leak of 480 bytes detected." assert not shm_leak_detected, f"Shared memory leak detected." def _get_shm_free_sizes(self, delay_sec=0): diff --git a/qa/common/util.sh b/qa/common/util.sh index 3297dd2914..3874916573 100755 --- a/qa/common/util.sh +++ b/qa/common/util.sh @@ -257,7 +257,7 @@ function run_server_nowait () { return fi - if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then + if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then # LD_PRELOAD not yet supported on windows if [ -z "$SERVER_LD_PRELOAD" ]; then echo "=== Running $SERVER $SERVER_ARGS" @@ -329,7 +329,7 @@ function kill_server () { # causes the entire WSL shell to just exit. So instead we must use # taskkill.exe which can only forcefully kill tritonserver which # means that it does not gracefully exit. - if [[ "$(< /proc/sys/kernel/osrelease)" == *microsoft* ]]; then + if [[ -v WSL_DISTRO_NAME ]]; then # Disable -x as it makes output below hard to read oldstate="$(set +o)"; [[ -o errexit ]] && oldstate="$oldstate; set -e" set +x @@ -353,6 +353,8 @@ function kill_server () { fi set +vx; eval "$oldstate" + elif [[ -v MSYSTEM ]] ; then + taskkill //F //IM tritonserver.exe else # Non-windows... 
kill $SERVER_PID @@ -512,17 +514,23 @@ remove_array_outliers() { function setup_virtualenv() { # Create and activate virtual environment - virtualenv --system-site-packages venv - source venv/bin/activate - pip install pytest + if [[ -v MSYSTEM ]]; then + pip3 install pytest + else + virtualenv --system-site-packages venv + source venv/bin/activate + pip install pytest + fi if [[ ${TEST_WINDOWS} == 1 ]]; then - pip3 install "numpy<2" tritonclient[all] + pip3 install "numpy<2" tritonclient[all] fi } function deactivate_virtualenv() { # Deactivate virtual environment and clean up + if [[ ! -v MSYSTEM ]]; then deactivate rm -fr venv + fi } diff --git a/qa/python_models/custom_metrics/model.py b/qa/python_models/custom_metrics/model.py index 31f105a1dd..7c78b46894 100644 --- a/qa/python_models/custom_metrics/model.py +++ b/qa/python_models/custom_metrics/model.py @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -74,6 +74,96 @@ def _metric_api_helper(self, metric, kind): self.assertEqual(metric.value(), value) logger.log_info("Set metric to : {}".format(metric.value())) + # Test observe value + observe = 0.05 + # Counter and gauge do not support observe + with self.assertRaises(pb_utils.TritonModelException): + metric.observe(observe) + + def _histogram_api_helper(self, metric, name, labels): + def histogram_str_builder(name, type, labels, value, le=None): + if type == "count" or type == "sum": + return f"{name}_{type}{{{labels}}} {value}" + elif type == "bucket": + return f'{name}_bucket{{{labels},le="{le}"}} {value}' + else: + raise + + # Adding logger to test if custom metrics and logging work together + # as they use the same message queue. 
+ logger = pb_utils.Logger + + # All values should be 0.0 before the test + metrics = self._get_metrics() + self.assertIn(histogram_str_builder(name, "count", labels, "0"), metrics) + self.assertIn(histogram_str_builder(name, "sum", labels, "0"), metrics) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="0.1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="2.5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="10"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="+Inf"), metrics + ) + + # Histogram does not support value + with self.assertRaises(pb_utils.TritonModelException): + metric.value() + + # Test increment value + increment = 2023.0 + # Histogram does not support increment + with self.assertRaises(pb_utils.TritonModelException): + metric.increment(increment) + + # Test set value + value = 999.9 + # Histogram does not support set + with self.assertRaises(pb_utils.TritonModelException): + metric.set(value) + + # Test observe value + data = [0.05, 1.5, 6.0] + for datum in data: + metric.observe(datum) + logger.log_info("Observe histogram metric with value : {}".format(datum)) + + metrics = self._get_metrics() + self.assertIn( + histogram_str_builder(name, "count", labels, str(len(data))), metrics + ) + self.assertIn( + histogram_str_builder(name, "sum", labels, str(sum(data))), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "1", le="0.1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "1", le="1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "2", le="2.5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "2", le="5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "3", le="10"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "3", le="+Inf"), metrics + ) + def _dup_metric_helper(self, labels={}): # Adding logger to test if custom metrics and logging work together # as they use the same message queue. 
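[Editor's note] The _histogram_api_helper added above checks the exact Prometheus text emitted for every bucket. For orientation, a condensed sketch of the histogram metric API itself as the new tests below use it; this code only runs inside a Python backend model, where pb_utils is available.

import triton_python_backend_utils as pb_utils

family = pb_utils.MetricFamily(
    name="test_histogram_e2e",
    description="test metric histogram kind end to end",
    kind=pb_utils.MetricFamily.HISTOGRAM,
)

# Buckets must be unique and ascending; an empty list is also accepted.
metric = family.Metric(
    labels={"example1": "histogram_label1", "example2": "histogram_label2"},
    buckets=[0.1, 1.0, 2.5, 5.0, 10.0],
)

# observe() is the only mutator a histogram supports; value(), increment(),
# and set() raise TritonModelException for this metric kind.
for value in (0.05, 1.5, 6.0):
    metric.observe(value)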
@@ -128,14 +218,62 @@ def test_gauge_e2e(self): description="test metric gauge kind end to end", kind=pb_utils.MetricFamily.GAUGE, ) - labels = {"example1": "counter_label1", "example2": "counter_label2"} + labels = {"example1": "gauge_label1", "example2": "gauge_label2"} metric = metric_family.Metric(labels=labels) self._metric_api_helper(metric, "gauge") - pattern = 'test_gauge_e2e{example1="counter_label1",example2="counter_label2"}' + pattern = 'test_gauge_e2e{example1="gauge_label1",example2="gauge_label2"}' metrics = self._get_metrics() self.assertIn(pattern, metrics) + def test_histogram_e2e(self): + name = "test_histogram_e2e" + metric_family = pb_utils.MetricFamily( + name=name, + description="test metric histogram kind end to end", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + + labels = {"example1": "histogram_label1", "example2": "histogram_label2"} + buckets = [0.1, 1.0, 2.5, 5.0, 10.0] + metric = metric_family.Metric(labels=labels, buckets=buckets) + + labels_str = 'example1="histogram_label1",example2="histogram_label2"' + self._histogram_api_helper(metric, name, labels_str) + + metrics = self._get_metrics() + count_pattern = f"{name}_count{{{labels_str}}}" + sum_pattern = f"{name}_sum{{{labels_str}}}" + bucket_pattern = f"{name}_bucket{{{labels_str}" + self.assertEqual(metrics.count(count_pattern), 1) + self.assertEqual(metrics.count(sum_pattern), 1) + self.assertEqual(metrics.count(bucket_pattern), len(buckets) + 1) + + def test_histogram_args(self): + name = "test_histogram_args" + metric_family = pb_utils.MetricFamily( + name=name, + description="test metric histogram args", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + + # Test "None" value buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}) + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=None) + + # Test non-ascending order buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=[2.5, 0.1, 1.0, 10.0, 5.0]) + + # Test duplicate value buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=[1, 1, 2, 5, 5]) + + # Test empty list bucket + metric_family.Metric(labels={}, buckets=[]) + def test_dup_metric_family_diff_kind(self): # Test that a duplicate metric family can't be added with a conflicting type/kind metric_family1 = pb_utils.MetricFamily( diff --git a/qa/python_models/execute_delayed_model/config.pbtxt b/qa/python_models/execute_delayed_model/config.pbtxt new file mode 100644 index 0000000000..0a4ee59d3e --- /dev/null +++ b/qa/python_models/execute_delayed_model/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "simple" +backend: "python" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] + +instance_group [ { kind: KIND_CPU }] diff --git a/qa/python_models/execute_delayed_model/model.py b/qa/python_models/execute_delayed_model/model.py new file mode 100644 index 0000000000..055b321a93 --- /dev/null +++ b/qa/python_models/execute_delayed_model/model.py @@ -0,0 +1,72 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
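[Editor's note] The model body follows below: a standard add/sub model whose execute() first sleeps for 15 seconds, which gives the shared-memory unregister tests a window with a request still in flight. A hedged sketch of the same "simple" model driven with plain (non-shared-memory) tensors, assuming the default HTTP endpoint:

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")

in0 = np.arange(16, dtype=np.int32).reshape(1, 16)
in1 = np.ones((1, 16), dtype=np.int32)

inputs = [
    httpclient.InferInput("INPUT0", [1, 16], "INT32"),
    httpclient.InferInput("INPUT1", [1, 16], "INT32"),
]
inputs[0].set_data_from_numpy(in0)
inputs[1].set_data_from_numpy(in1)

# The call blocks for roughly 15 seconds while execute() sleeps, then returns
# OUTPUT0 = INPUT0 + INPUT1 and OUTPUT1 = INPUT0 - INPUT1.
result = client.infer(model_name="simple", inputs=inputs)
assert (result.as_numpy("OUTPUT0") == in0 + in1).all()
assert (result.as_numpy("OUTPUT1") == in0 - in1).all()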
+ +import json +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + responses = [] + + time.sleep(15) + + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1] + ) + responses.append(inference_response) + + return responses + + def finalize(self): + print("Cleaning up...") diff --git a/qa/python_models/execute_grpc_error/config.pbtxt b/qa/python_models/execute_grpc_error/config.pbtxt new file mode 100644 index 0000000000..70e247148a --- /dev/null +++ b/qa/python_models/execute_grpc_error/config.pbtxt @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
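The execute_grpc_error model defined below alternates successful and failed responses so the new triton_grpc_error streaming mode can be exercised. A hedged client-side sketch of that mode, assuming the tritonclient.grpc streaming API and using the "triton_grpc_error" header name handled later in this change; the URL, input shape, and request count are illustrative.

import numpy as np
import tritonclient.grpc as grpcclient


def callback(result, error):
    # With triton_grpc_error enabled, the first model failure is expected to
    # surface here as a gRPC status error that also closes the stream.
    print("error:" if error else "result:", error or result.get_response().id)


client = grpcclient.InferenceServerClient("localhost:8001")
client.start_stream(callback=callback, headers={"triton_grpc_error": "true"})

inputs = [grpcclient.InferInput("IN", [1, 4], "FP32")]
inputs[0].set_data_from_numpy(np.zeros((1, 4), dtype=np.float32))
for i in range(2):  # the model fails every second request
    client.async_stream_infer("execute_grpc_error", inputs, request_id=str(i))
client.stop_stream()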
+ +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/execute_grpc_error/model.py b/qa/python_models/execute_grpc_error/model.py new file mode 100644 index 0000000000..d5087a49ec --- /dev/null +++ b/qa/python_models/execute_grpc_error/model.py @@ -0,0 +1,52 @@ +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def __init__(self): + # Maintain total inference count, so as to return error on 2nd request, all of this to simulate model failure + self.inf_count = 1 + + def execute(self, requests): + """This function is called on inference request.""" + responses = [] + + # Generate the error for the second request + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") + out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy()) + if self.inf_count % 2: + # Every odd request is success + responses.append(pb_utils.InferenceResponse([out_tensor])) + else: + # Every even request is failure + error = pb_utils.TritonError("An error occurred during execution") + responses.append(pb_utils.InferenceResponse([out_tensor], error)) + self.inf_count += 1 + + return responses diff --git a/qa/python_models/response_sender_complete_final/config.pbtxt b/qa/python_models/response_sender_complete_final/config.pbtxt new file mode 100644 index 0000000000..f08ed6da5b --- /dev/null +++ b/qa/python_models/response_sender_complete_final/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +max_batch_size: 8 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [{ kind: KIND_CPU }] +model_transaction_policy { decoupled: True } diff --git a/qa/python_models/response_sender_complete_final/model.py b/qa/python_models/response_sender_complete_final/model.py new file mode 100644 index 0000000000..e17f0b04f6 --- /dev/null +++ b/qa/python_models/response_sender_complete_final/model.py @@ -0,0 +1,63 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + # Expect exactly one request per execute() call. + if len(requests) != 1: + pb_utils.Logger.log_error(f"Unexpected request length: {len(requests)}") + raise Exception("Test FAILED") + + # Send a response with complete final flag, and then send another response and + # and assert an exception is raised, for all requests. + for request in requests: + in_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", in_tensor.as_numpy()) + response = pb_utils.InferenceResponse([out_tensor]) + response_sender = request.get_response_sender() + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + test_passed = False + try: + response_sender.send(response) + except Exception as e: + pb_utils.Logger.log_info(f"Raised exception: {e}") + if ( + str(e) + == "Unable to send response. Response sender has been closed." + ): + test_passed = True + finally: + if not test_passed: + pb_utils.Logger.log_error("Expected exception not raised") + raise Exception("Test FAILED") + pb_utils.Logger.log_info("Test Passed") + return None diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 783275d8d7..9488fc6233 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -138,6 +138,15 @@ else() ) endif() +set(LIB_DIR "lib") +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set (LIB_DIR "lib64") + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) +set(TRITON_CORE_HEADERS_ONLY OFF) + set_target_properties( main PROPERTIES @@ -145,7 +154,7 @@ set_target_properties( SKIP_BUILD_RPATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE - INSTALL_RPATH "$\{ORIGIN\}/../lib" + INSTALL_RPATH "$\{ORIGIN\}/../${LIB_DIR}" ) target_link_libraries( @@ -773,7 +782,14 @@ if (NOT WIN32) endif() # TRITON_ENABLE_GPU endif() # NOT WIN32 +# DLIS-7292: Extend tritonfrontend to build for Windows +if (NOT WIN32) + # tritonfrontend python package + add_subdirectory(python) +endif (NOT WIN32) + # Currently unit tests do not build for windows... if ( NOT WIN32) add_subdirectory(test test) endif() # NOT WIN32 + diff --git a/src/common.h b/src/common.h index aa160f394f..011546d637 100644 --- a/src/common.h +++ b/src/common.h @@ -27,7 +27,11 @@ #include #include +#include #include +#include +#include +#include #include #include "triton/core/tritonserver.h" @@ -184,4 +188,60 @@ Join(const T& container, const std::string& delim) return ss.str(); } + +// Used by Python Bindings to accept arguments to initialize Frontends. 
+// Known pybind11 issue: bool has to come before int for std::variant
+using VariantType = std::variant<bool, int, std::string>;
+using UnorderedMapType = std::unordered_map<std::string, VariantType>;
+
+
+template <typename T>
+TRITONSERVER_Error*
+GetValue(const UnorderedMapType& options, const std::string& key, T* arg)
+{
+  auto curr = options.find(key);
+  bool is_present = (curr != options.end());
+  std::string msg;
+
+  if (!is_present) {
+    msg = "Key: " + key + " not found in options provided.";
+    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, msg.c_str());
+  }
+
+  bool correct_type = std::holds_alternative<T>(curr->second);
+  if (!correct_type) {
+    std::string expected;
+    std::string found;
+    VariantType value = *arg;
+    if (std::holds_alternative<int>(value)) {
+      expected = "int";
+    } else if (std::holds_alternative<bool>(value)) {
+      expected = "bool";
+    } else if (std::holds_alternative<std::string>(value)) {
+      expected = "string";
+    }
+
+    switch (curr->second.index()) {
+      case 0:
+        found = "bool";
+        break;
+      case 1:
+        found = "int";
+        break;
+      case 2:
+        found = "string";
+        break;
+    }
+
+    msg = "Key: " + key + " found, but incorrect type. Expected " + expected +
+          " Found: " + found;
+
+    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, msg.c_str());
+  }
+
+  *arg = std::get<T>(curr->second);
+  return nullptr;
+}
+
+
 }}  // namespace triton::server
diff --git a/src/grpc/grpc_server.cc b/src/grpc/grpc_server.cc
index c0a92ebd33..74ec443ae6 100644
--- a/src/grpc/grpc_server.cc
+++ b/src/grpc/grpc_server.cc
@@ -2435,6 +2435,101 @@ Server::Create(
   return nullptr;  // success
 }
 
+TRITONSERVER_Error*
+Server::Create(
+    std::shared_ptr<TRITONSERVER_Server>& server, UnorderedMapType& options,
+    triton::server::TraceManager* trace_manager,
+    const std::shared_ptr<SharedMemoryManager>& shm_manager,
+    const RestrictedFeatures& restricted_features,
+    std::unique_ptr<Server>* service)
+{
+  Options grpc_options;
+
+  RETURN_IF_ERR(GetOptions(grpc_options, options));
+
+  return Create(server, trace_manager, shm_manager, grpc_options, service);
+}
+
+TRITONSERVER_Error*
+Server::GetOptions(Options& options, UnorderedMapType& options_map)
+{
+  SocketOptions socket_selection;
+  SslOptions ssl_selection;
+  KeepAliveOptions keep_alive_selection;
+
+  RETURN_IF_ERR(GetSocketOptions(options.socket_, options_map));
+  RETURN_IF_ERR(GetSslOptions(options.ssl_, options_map));
+  RETURN_IF_ERR(GetKeepAliveOptions(options.keep_alive_, options_map));
+
+  int infer_compression_level_key;
+
+  RETURN_IF_ERR(GetValue(
+      options_map, "infer_compression_level", &infer_compression_level_key));
+
+  options.infer_compression_level_ =
+      static_cast<grpc_compression_level>(infer_compression_level_key);
+
+  RETURN_IF_ERR(GetValue(
+      options_map, "infer_allocation_pool_size",
+      &options.infer_allocation_pool_size_));
+  RETURN_IF_ERR(GetValue(
+      options_map, "forward_header_pattern", &options.forward_header_pattern_));
+
+  return nullptr;
+}
+
+TRITONSERVER_Error*
+Server::GetSocketOptions(SocketOptions& options, UnorderedMapType& options_map)
+{
+  RETURN_IF_ERR(GetValue(options_map, "address", &options.address_));
+  RETURN_IF_ERR(GetValue(options_map, "port", &options.port_));
+  RETURN_IF_ERR(GetValue(options_map, "reuse_port", &options.reuse_port_));
+
+  return nullptr;
+}
+
+TRITONSERVER_Error*
+Server::GetSslOptions(SslOptions& options, UnorderedMapType& options_map)
+{
+  RETURN_IF_ERR(GetValue(options_map, "use_ssl", &options.use_ssl_));
+  RETURN_IF_ERR(GetValue(options_map, "server_cert", &options.server_cert_));
+  RETURN_IF_ERR(GetValue(options_map, "server_key", &options.server_key_));
+  RETURN_IF_ERR(GetValue(options_map, "root_cert", &options.root_cert_));
+
RETURN_IF_ERR( + GetValue(options_map, "use_mutual_auth", &options.use_mutual_auth_)); + + return nullptr; +} + +TRITONSERVER_Error* +Server::GetKeepAliveOptions( + KeepAliveOptions& options, UnorderedMapType& options_map) +{ + RETURN_IF_ERR( + GetValue(options_map, "keepalive_time_ms", &options.keepalive_time_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "keepalive_timeout_ms", &options.keepalive_timeout_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "keepalive_permit_without_calls", + &options.keepalive_permit_without_calls_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_max_pings_without_data", + &options.http2_max_pings_without_data_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_min_recv_ping_interval_without_data_ms", + &options.http2_min_recv_ping_interval_without_data_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "http2_max_ping_strikes", &options.http2_max_ping_strikes_)); + RETURN_IF_ERR(GetValue( + options_map, "max_connection_age_ms", &options.max_connection_age_ms_)); + RETURN_IF_ERR(GetValue( + options_map, "max_connection_age_grace_ms", + &options.max_connection_age_grace_ms_)); + + return nullptr; +} + + TRITONSERVER_Error* Server::Start() { diff --git a/src/grpc/grpc_server.h b/src/grpc/grpc_server.h index 8a38cdd4fe..89d8dc7388 100644 --- a/src/grpc/grpc_server.h +++ b/src/grpc/grpc_server.h @@ -29,6 +29,7 @@ #include +#include "../common.h" #include "../restricted_features.h" #include "../shared_memory_manager.h" #include "../tracer.h" @@ -100,6 +101,13 @@ class Server { const std::shared_ptr& shm_manager, const Options& server_options, std::unique_ptr* server); + static TRITONSERVER_Error* Create( + std::shared_ptr& server, UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service); + ~Server(); TRITONSERVER_Error* Start(); @@ -112,6 +120,16 @@ class Server { const std::shared_ptr& shm_manager, const Options& server_options); + static TRITONSERVER_Error* GetSocketOptions( + SocketOptions& options, UnorderedMapType& options_map); + static TRITONSERVER_Error* GetSslOptions( + SslOptions& options, UnorderedMapType& options_map); + static TRITONSERVER_Error* GetKeepAliveOptions( + KeepAliveOptions& options, UnorderedMapType& options_map); + + static TRITONSERVER_Error* GetOptions( + Options& options, UnorderedMapType& options_map); + std::shared_ptr tritonserver_; TraceManager* trace_manager_; std::shared_ptr shm_manager_; diff --git a/src/grpc/grpc_utils.h b/src/grpc/grpc_utils.h index 898e4acb4f..032dec3ad9 100644 --- a/src/grpc/grpc_utils.h +++ b/src/grpc/grpc_utils.h @@ -76,6 +76,46 @@ typedef enum { PARTIAL_COMPLETION } Steps; +typedef enum { + // No error from CORE seen yet + NONE, + // Error from CORE encountered, waiting to be picked up by completion queue to + // initiate cancellation + ERROR_ENCOUNTERED, + // Error from CORE encountered, stream closed + // This state is added to avoid double cancellation + ERROR_HANDLING_COMPLETE +} TritonGRPCErrorSteps; + +class gRPCErrorTracker { + public: + // True if set by user via header + // Can be accessed without a lock, as set only once in startstream + std::atomic triton_grpc_error_; + + // Indicates the state of triton_grpc_error, only relevant if special + // triton_grpc_error feature set to true by client + TritonGRPCErrorSteps grpc_stream_error_state_; + + // Constructor + gRPCErrorTracker() + : triton_grpc_error_(false), + grpc_stream_error_state_(TritonGRPCErrorSteps::NONE) 
+ { + } + // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, + // indicating we have closed the stream and initiated the cancel flow + void MarkGRPCErrorHandlingComplete(); + + // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. + bool CheckAndUpdateGRPCError(); + + // Marks error after it has been responded to + void MarkGRPCErrorEncountered(); + + // Checks if error already responded to in triton_grpc_error mode + bool GRPCErrorEncountered(); +}; // Debugging helper std::ostream& operator<<(std::ostream& out, const Steps& step); @@ -183,5 +223,4 @@ TRITONSERVER_Error* ParseClassificationParams( void ReadFile(const std::string& filename, std::string& data); - }}} // namespace triton::server::grpc diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index 35659f4900..c4ba9338cb 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -158,18 +158,6 @@ InferResponseFree( return nullptr; // Success } -TRITONSERVER_Error* InferGRPCToInputHelper( - const std::string& input_name, const std::string& model_name, - const TRITONSERVER_DataType tensor_dt, const TRITONSERVER_DataType input_dt, - const size_t binary_data_byte_size); - -TRITONSERVER_Error* InferGRPCToInput( - const std::shared_ptr& tritonserver, - const std::shared_ptr& shm_manager, - const inference::ModelInferRequest& request, - std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request); - TRITONSERVER_Error* InferGRPCToInputHelper( const std::string& input_name, const std::string& model_name, @@ -391,7 +379,9 @@ InferGRPCToInput( const std::shared_ptr& shm_manager, const inference::ModelInferRequest& request, std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request) + TRITONSERVER_InferenceRequest* inference_request, + std::vector>* + shm_regions_info) { // Verify that the batch-byte-size of each input matches the size of // the provided tensor data (provided raw or from shared memory) @@ -432,9 +422,14 @@ InferGRPCToInput( .c_str()); } void* tmp; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager->GetMemoryInfo( - region_name, offset, byte_size, &tmp, &memory_type, &memory_type_id)); + region_name, offset, byte_size, &tmp, &memory_type, &memory_type_id, + &shm_info)); base = tmp; + shm_regions_info->emplace_back(shm_info); + if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU RETURN_IF_ERR(shm_manager->GetCUDAHandle( @@ -911,18 +906,32 @@ ModelInferHandler::Execute(InferHandler::State* state) // tensors are present in the request. std::list serialized_data; + // Maintain shared pointers(read-only reference) to the shared memory block's + // information for the shared memory regions used by the request. These + // pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. This vector must be cleared in the + // `InferResponseComplete` callback (after inference) to decrease the count + // and permit unregistration. The vector will be included in + // `response_release_payload` for the callback. 
+ std::vector> + shm_regions_info; + if (err == nullptr) { err = InferGRPCToInput( - tritonserver_, shm_manager_, request, &serialized_data, irequest); + tritonserver_, shm_manager_, request, &serialized_data, irequest, + &shm_regions_info); } if (err == nullptr) { err = InferAllocatorPayload( tritonserver_, shm_manager_, request, std::move(serialized_data), - response_queue, &state->alloc_payload_); + response_queue, &state->alloc_payload_, &shm_regions_info); } auto request_release_payload = std::make_unique(state->inference_request_); + auto response_release_payload = std::make_unique( + state, std::move(shm_regions_info)); + if (err == nullptr) { err = TRITONSERVER_InferenceRequestSetReleaseCallback( irequest, InferRequestComplete, @@ -932,7 +941,8 @@ ModelInferHandler::Execute(InferHandler::State* state) err = TRITONSERVER_InferenceRequestSetResponseCallback( irequest, allocator_, &state->alloc_payload_ /* response_allocator_userp */, - InferResponseComplete, reinterpret_cast(state)); + InferResponseComplete, + response_release_payload.get() /* response_userp */); } // Get request ID for logging in case of error. const char* request_id = ""; @@ -948,12 +958,14 @@ ModelInferHandler::Execute(InferHandler::State* state) if (err == nullptr) { TRITONSERVER_InferenceTrace* triton_trace = nullptr; #ifdef TRITON_ENABLE_TRACING - GrpcServerCarrier carrier(state->context_->ctx_.get()); - auto start_options = - trace_manager_->GetTraceStartOptions(carrier, request.model_name()); - state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); - if (state->trace_ != nullptr) { - triton_trace = state->trace_->trace_; + if (trace_manager_) { + GrpcServerCarrier carrier(state->context_->ctx_.get()); + auto start_options = + trace_manager_->GetTraceStartOptions(carrier, request.model_name()); + state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); + if (state->trace_ != nullptr) { + triton_trace = state->trace_->trace_; + } } #endif // TRITON_ENABLE_TRACING @@ -968,8 +980,9 @@ ModelInferHandler::Execute(InferHandler::State* state) // to handle gRPC stream cancellation. if (err == nullptr) { state->context_->InsertInflightState(state); - // The payload will be cleaned in request release callback. + // The payload will be cleaned in release callback. request_release_payload.release(); + response_release_payload.release(); } else { // If error go immediately to COMPLETE. LOG_VERBOSE(1) << "[request id: " << request_id << "] " @@ -982,8 +995,10 @@ ModelInferHandler::Execute(InferHandler::State* state) inference::ModelInferResponse error_response; #ifdef TRITON_ENABLE_TRACING - state->trace_timestamps_.emplace_back( - std::make_pair("GRPC_SEND_START", TraceManager::CaptureTimestamp())); + if (trace_manager_) { + state->trace_timestamps_.emplace_back( + std::make_pair("GRPC_SEND_START", TraceManager::CaptureTimestamp())); + } #endif // TRITON_ENABLE_TRACING state->step_ = COMPLETE; @@ -996,7 +1011,9 @@ ModelInferHandler::InferResponseComplete( TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags, void* userp) { - State* state = reinterpret_cast(userp); + ResponseReleasePayload* response_release_payload( + static_cast(userp)); + auto state = response_release_payload->state_; // There are multiple handlers registered in the gRPC service // Hence, we would need to properly synchronize this thread @@ -1038,6 +1055,7 @@ ModelInferHandler::InferResponseComplete( // in the next cycle. 
state->context_->PutTaskBackToQueue(state); + delete response_release_payload; return; } @@ -1100,6 +1118,8 @@ ModelInferHandler::InferResponseComplete( if (response_created) { delete response; } + + delete response_release_payload; } }}} // namespace triton::server::grpc diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 0e1091feb8..87536dd173 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -299,7 +299,9 @@ InferAllocatorPayload( const inference::ModelInferRequest& request, std::list&& serialized_data, std::shared_ptr> response_queue, - AllocPayload* alloc_payload) + AllocPayload* alloc_payload, + std::vector>* + shm_regions_info) { alloc_payload->response_queue_ = response_queue; alloc_payload->shm_map_.clear(); @@ -335,9 +337,12 @@ InferAllocatorPayload( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager->GetMemoryInfo( - region_name, offset, byte_size, &base, &memory_type, - &memory_type_id)); + region_name, offset, byte_size, &base, &memory_type, &memory_type_id, + &shm_info)); + shm_regions_info->emplace_back(shm_info); if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU @@ -373,7 +378,9 @@ TRITONSERVER_Error* InferGRPCToInput( const std::shared_ptr& shm_manager, const inference::ModelInferRequest& request, std::list* serialized_data, - TRITONSERVER_InferenceRequest* inference_request); + TRITONSERVER_InferenceRequest* inference_request, + std::vector>* + shm_regions_info); TRITONSERVER_Error* ResponseAllocatorHelper( TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name, @@ -646,6 +653,7 @@ class InferHandlerState { { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); + gRPCErrorTracker_ = std::make_unique(); } void SetCompressionLevel(grpc_compression_level compression_level) @@ -666,9 +674,12 @@ class InferHandlerState { bool IsCancelled() { - return received_notification_ ? ctx_->IsCancelled() : false; + std::lock_guard lock(mu_); + return received_notification_ + ? 
(ctx_->IsCancelled() || + gRPCErrorTracker_->CheckAndUpdateGRPCError()) + : false; } - // Increments the ongoing request counter void IncrementRequestCounter() { ongoing_requests_++; } @@ -710,6 +721,37 @@ class InferHandlerState { return false; } + // Extracts headers from GRPC request and updates state + void ExtractStateFromHeaders(InferHandlerStateType* state) + { + const auto& metadata = state->context_->ctx_->client_metadata(); + std::string triton_grpc_error_key = "triton_grpc_error"; + + auto it = metadata.find( + {triton_grpc_error_key.data(), triton_grpc_error_key.size()}); + + if (it != metadata.end()) { + if (it->second == "true") { + LOG_VERBOSE(2) + << "GRPC: triton_grpc_error mode detected in new grpc stream"; + state->context_->gRPCErrorTracker_->triton_grpc_error_ = true; + } + } + } + + void WriteGRPCErrorResponse(InferHandlerStateType* state) + { + std::lock_guard lock(state->context_->mu_); + // Check if Error not responded previously + // Avoid closing connection twice on multiple errors from core + if (!state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { + state->step_ = Steps::COMPLETE; + state->context_->responder_->Finish(state->status_, state); + // Mark error for this stream + state->context_->gRPCErrorTracker_->MarkGRPCErrorEncountered(); + } + } + const std::string DebugString(InferHandlerStateType* state) { std::string debug_string(""); @@ -793,6 +835,7 @@ class InferHandlerState { bool HandleCancellation( InferHandlerStateType* state, bool rpc_ok, const std::string& name) { + // Check to avoid early exit in case of triton_grpc_error if (!IsCancelled()) { LOG_ERROR << "[INTERNAL] HandleCancellation called even when the context was " @@ -816,7 +859,6 @@ class InferHandlerState { IssueRequestCancellation(); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; - // The state returns true because the CancelExecution // call above would have raised alarm objects on all // pending inflight states objects. This state will @@ -999,6 +1041,8 @@ class InferHandlerState { // Tracks whether the async notification has been delivered by // completion queue. bool received_notification_; + + std::unique_ptr gRPCErrorTracker_; }; // This constructor is used to build a wrapper state object @@ -1090,7 +1134,6 @@ class InferHandlerState { void MarkAsAsyncNotifyState() { async_notify_state_ = true; } bool IsAsyncNotifyState() { return async_notify_state_; } - // Needed in the response handle for classification outputs. TRITONSERVER_Server* tritonserver_; @@ -1227,6 +1270,23 @@ class InferHandler : public HandlerBase { delete state; } + // Simple structure that carries the payload needed for + // response release callback. 
+ struct ResponseReleasePayload final { + State* state_; + std::vector> + shm_regions_info_; + + ResponseReleasePayload( + State* state, + std::vector< + std::shared_ptr>&& + shm_regions_info) + : state_(state), shm_regions_info_(std::move(shm_regions_info)) + { + } + }; + virtual void StartNewRequest() = 0; virtual bool Process(State* state, bool rpc_ok) = 0; bool ExecutePrecondition(InferHandler::State* state); diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 585f88d536..e912e1512c 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -189,7 +189,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) state->context_->responder_->Finish(status, state); return !finished; } - + state->context_->ExtractStateFromHeaders(state); } else if (state->step_ == Steps::READ) { TRITONSERVER_Error* err = nullptr; const inference::ModelInferRequest& request = state->request_; @@ -282,18 +282,32 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // tensors are present in the request. std::list serialized_data; + // Maintain shared pointers(read-only reference) to the shared memory + // block's information for the shared memory regions used by the request. + // These pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. This vector must be cleared in the + // `StreamInferResponseComplete` callback (after inference) to decrease the + // count and permit unregistration. The vector will be included in + // `response_release_payload` for the callback. + std::vector> + shm_regions_info; + if (err == nullptr) { err = InferGRPCToInput( - tritonserver_, shm_manager_, request, &serialized_data, irequest); + tritonserver_, shm_manager_, request, &serialized_data, irequest, + &shm_regions_info); } if (err == nullptr) { err = InferAllocatorPayload( tritonserver_, shm_manager_, request, std::move(serialized_data), - response_queue_, &state->alloc_payload_); + response_queue_, &state->alloc_payload_, &shm_regions_info); } auto request_release_payload = std::make_unique(state->inference_request_); + auto response_release_payload = std::make_unique( + state, std::move(shm_regions_info)); + if (err == nullptr) { err = TRITONSERVER_InferenceRequestSetReleaseCallback( irequest, InferRequestComplete, @@ -303,18 +317,21 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) err = TRITONSERVER_InferenceRequestSetResponseCallback( irequest, allocator_, &state->alloc_payload_ /* response_allocator_userp */, - StreamInferResponseComplete, reinterpret_cast(state)); + StreamInferResponseComplete, + response_release_payload.get() /* response_userp */); } if (err == nullptr) { TRITONSERVER_InferenceTrace* triton_trace = nullptr; #ifdef TRITON_ENABLE_TRACING - GrpcServerCarrier carrier(state->context_->ctx_.get()); - auto start_options = - trace_manager_->GetTraceStartOptions(carrier, request.model_name()); - state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); - if (state->trace_ != nullptr) { - triton_trace = state->trace_->trace_; + if (trace_manager_ != nullptr) { + GrpcServerCarrier carrier(state->context_->ctx_.get()); + auto start_options = + trace_manager_->GetTraceStartOptions(carrier, request.model_name()); + state->trace_ = std::move(trace_manager_->SampleTrace(start_options)); + if (state->trace_ != nullptr) { + triton_trace = state->trace_->trace_; + } } #endif // TRITON_ENABLE_TRACING @@ -330,8 +347,9 @@ 
ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // irequest to handle gRPC stream cancellation. if (err == nullptr) { state->context_->InsertInflightState(state); - // The payload will be cleaned in request release callback. + // The payload will be cleaned in release callback. request_release_payload.release(); + response_release_payload.release(); } else { // If there was an error then enqueue the error response and show // it to be ready for writing. @@ -355,7 +373,6 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) GrpcStatusUtil::Create(&status, err); TRITONSERVER_ErrorDelete(err); response->set_error_message(status.error_message()); - response->mutable_infer_response()->Clear(); // repopulate the id so that client knows which request failed. response->mutable_infer_response()->set_id(request.id()); @@ -522,15 +539,18 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) } else if (state->step_ == Steps::WRITEREADY) { // Finish the state if all the transactions associated with // the state have completed. - if (state->IsComplete()) { - state->context_->DecrementRequestCounter(); - finished = Finish(state); - } else { - LOG_ERROR << "Should not print this! Decoupled should NOT write via " - "WRITEREADY!"; - // Remove the state from the completion queue - std::lock_guard lock(state->step_mtx_); - state->step_ = Steps::ISSUED; + std::lock_guard lk1(state->context_->mu_); + { + if (state->IsComplete()) { + state->context_->DecrementRequestCounter(); + finished = Finish(state); + } else { + LOG_ERROR << "Should not print this! Decoupled should NOT write via " + "WRITEREADY!"; + // Remove the state from the completion queue + std::lock_guard lock(state->step_mtx_); + state->step_ = Steps::ISSUED; + } } } } @@ -595,8 +615,17 @@ ModelStreamInferHandler::StreamInferResponseComplete( TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags, void* userp) { - State* state = reinterpret_cast(userp); - + ResponseReleasePayload* response_release_payload( + static_cast(userp)); + auto state = response_release_payload->state_; + + // Ignore Response from CORE in case GRPC Strict as we dont care about + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { + std::lock_guard lock(state->context_->mu_); + if (state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { + return; + } + } // Increment the callback index uint32_t response_index = state->cb_count_++; @@ -643,6 +672,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( if (is_complete) { state->step_ = Steps::CANCELLED; state->context_->PutTaskBackToQueue(state); + delete response_release_payload; } state->complete_ = is_complete; @@ -671,14 +701,28 @@ ModelStreamInferHandler::StreamInferResponseComplete( } else { LOG_ERROR << "expected the response allocator to have added the response"; } - if (err != nullptr) { failed = true; ::grpc::Status status; + // Converts CORE errors to GRPC error codes GrpcStatusUtil::Create(&status, err); response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { + state->status_ = status; + // Finish only once, if backend ignores cancellation + LOG_VERBOSE(1) << "GRPC streaming error detected with status: " + << status.error_code() << "Closing stream connection." 
+ << std::endl; + state->context_->WriteGRPCErrorResponse(state); + TRITONSERVER_ErrorDelete(err); + LOG_TRITONSERVER_ERROR( + TRITONSERVER_InferenceResponseDelete(iresponse), + "deleting GRPC inference response"); + delete response_release_payload; + return; + } } TRITONSERVER_ErrorDelete(err); @@ -756,6 +800,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( if (is_complete) { state->step_ = Steps::CANCELLED; state->context_->PutTaskBackToQueue(state); + delete response_release_payload; } state->complete_ = is_complete; @@ -800,6 +845,48 @@ ModelStreamInferHandler::StreamInferResponseComplete( } state->complete_ = is_complete; } + + if (is_complete) { + delete response_release_payload; + } +} + +// Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, +// indicating we have closed the stream and initiated the cancel flow +void +gRPCErrorTracker::MarkGRPCErrorHandlingComplete() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_HANDLING_COMPLETE; +} + +// Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. +bool +gRPCErrorTracker::CheckAndUpdateGRPCError() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::ERROR_ENCOUNTERED) { + // Change the state to ERROR_HANDLING_COMPLETE as we have called + // HandleCancellation + MarkGRPCErrorHandlingComplete(); + return true; + } + return false; +} + +// Marks error after it has been responded to +void +gRPCErrorTracker::MarkGRPCErrorEncountered() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_ENCOUNTERED; +} + +// Checks if error already responded to in triton_grpc_error mode +bool +gRPCErrorTracker::GRPCErrorEncountered() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::NONE) { + return false; + } + return true; } }}} // namespace triton::server::grpc diff --git a/src/http_server.cc b/src/http_server.cc index 68b22ae649..99aed411b5 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -1181,6 +1181,7 @@ HTTPAPIServer::HTTPAPIServer( HTTPAPIServer::~HTTPAPIServer() { + LOG_VERBOSE(1) << "~HTTPAPIServer()"; if (server_metadata_err_ != nullptr) { TRITONSERVER_ErrorDelete(server_metadata_err_); } @@ -1809,6 +1810,10 @@ HTTPAPIServer::HandleTrace(evhtp_request_t* req, const std::string& model_name) } #ifdef TRITON_ENABLE_TRACING + if (trace_manager_ == nullptr) { + return; + } + TRITONSERVER_InferenceTraceLevel level = TRITONSERVER_TRACE_LEVEL_DISABLED; uint32_t rate; int32_t count; @@ -2680,9 +2685,13 @@ HTTPAPIServer::ParseJsonTritonIO( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager_->GetMemoryInfo( shm_region, shm_offset, byte_size, &base, &memory_type, - &memory_type_id)); + &memory_type_id, &shm_info)); + infer_req->AddShmRegionInfo(shm_info); + if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU cudaIpcMemHandle_t* cuda_handle; @@ -2795,9 +2804,12 @@ HTTPAPIServer::ParseJsonTritonIO( void* base; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; + std::shared_ptr shm_info = + nullptr; RETURN_IF_ERR(shm_manager_->GetMemoryInfo( - shm_region, offset, byte_size, &base, &memory_type, - &memory_type_id)); + shm_region, offset, byte_size, &base, &memory_type, &memory_type_id, + &shm_info)); + infer_req->AddShmRegionInfo(shm_info); if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU @@ -3225,8 +3237,11 @@ HTTPAPIServer::HandleGenerate( // If tracing is enabled see if this request should be traced. 
TRITONSERVER_InferenceTrace* triton_trace = nullptr; - std::shared_ptr trace = - StartTrace(req, model_name, &triton_trace); + std::shared_ptr trace; + if (trace_manager_) { + // If tracing is enabled see if this request should be traced. + trace = StartTrace(req, model_name, &triton_trace); + } std::map input_metadata; triton::common::TritonJson::Value meta_data_root; @@ -3549,6 +3564,8 @@ HTTPAPIServer::GenerateRequestClass::ExactMappingInput( } } + // get original element count back + element_cnt = tensor_data.IsArray() ? tensor_data.ArraySize() : 1; serialized_data_.emplace_back(); std::vector& serialized = serialized_data_.back(); serialized.resize(byte_size); @@ -3586,10 +3603,12 @@ HTTPAPIServer::HandleInfer( RETURN_AND_RESPOND_IF_ERR( req, CheckTransactionPolicy(req, model_name, requested_model_version)); - // If tracing is enabled see if this request should be traced. TRITONSERVER_InferenceTrace* triton_trace = nullptr; - std::shared_ptr trace = - StartTrace(req, model_name, &triton_trace); + std::shared_ptr trace; + if (trace_manager_) { + // If tracing is enabled see if this request should be traced. + trace = StartTrace(req, model_name, &triton_trace); + } // Decompress request body if it is compressed in supported type evbuffer* decompressed_buffer = nullptr; @@ -4696,6 +4715,35 @@ HTTPAPIServer::Create( return nullptr; } + +TRITONSERVER_Error* +HTTPAPIServer::Create( + std::shared_ptr& server, + const UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service) +{ + int port; + bool reuse_port; + std::string address; + std::string header_forward_pattern; + int thread_count; + + RETURN_IF_ERR(GetValue(options, "port", &port)); + RETURN_IF_ERR(GetValue(options, "reuse_port", &reuse_port)); + RETURN_IF_ERR(GetValue(options, "address", &address)); + RETURN_IF_ERR( + GetValue(options, "header_forward_pattern", &header_forward_pattern)); + RETURN_IF_ERR(GetValue(options, "thread_count", &thread_count)); + + return Create( + server, trace_manager, shm_manager, port, reuse_port, address, + header_forward_pattern, thread_count, restricted_features, service); +} + + bool HTTPAPIServer::RespondIfRestricted( evhtp_request_t* req, const Restriction& restriction) diff --git a/src/http_server.h b/src/http_server.h index 077324cba3..3949f97e27 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -196,6 +196,14 @@ class HTTPAPIServer : public HTTPServer { const RestrictedFeatures& restricted_apis, std::unique_ptr* http_server); + static TRITONSERVER_Error* Create( + std::shared_ptr& server, + const UnorderedMapType& options, + triton::server::TraceManager* trace_manager, + const std::shared_ptr& shm_manager, + const RestrictedFeatures& restricted_features, + std::unique_ptr* service); + virtual ~HTTPAPIServer(); // @@ -303,6 +311,13 @@ class HTTPAPIServer : public HTTPServer { static void ReplyCallback(evthr_t* thr, void* arg, void* shared); + void AddShmRegionInfo( + const std::shared_ptr& + shm_info) + { + shm_regions_info_.push_back(shm_info); + } + protected: TRITONSERVER_Server* server_{nullptr}; evhtp_request_t* req_{nullptr}; @@ -322,6 +337,14 @@ class HTTPAPIServer : public HTTPServer { // TRITONSERVER_ServerInferAsync (except for cancellation). std::shared_ptr triton_request_{nullptr}; + // Maintain shared pointers(read-only reference) to the shared memory + // block's information for the shared memory regions used by the request. 
+ // These pointers will automatically increase the usage count, preventing + // unregistration of the shared memory. This vector must be cleared when no + // longer needed to decrease the count and permit unregistration. + std::vector> + shm_regions_info_; + evhtp_res response_code_{EVHTP_RES_OK}; }; diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt new file mode 100644 index 0000000000..f447f7eab2 --- /dev/null +++ b/src/python/CMakeLists.txt @@ -0,0 +1,78 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required(VERSION 3.18) + +message("tritonfrontend python package build skipped when relevant frontends are disabled.") +message("In order to build tritonfrontend, the following flags are needed: -DTRITON_ENABLE_HTTP=ON -DTRITON_ENABLE_GRPC=ON") + +# [DLIS-7232] tritonfrontend package expects all supported packages to be +# built, without any check/verification for respective frontend enable flags. +# Support for partial builds(ex: HTTP but not gRPC) will be addressed later. 
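Stepping back to the shared-memory bookkeeping added to the HTTP and gRPC frontends above: the held region-info references keep a registered region alive while a request still uses it. A hedged client-side sketch of the interaction, assuming the tritonclient HTTP API and the tritonclient.utils.shared_memory helpers; the region name, key, and sizes are illustrative.

import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.shared_memory as shm

client = httpclient.InferenceServerClient("localhost:8000")
data = np.ones((1, 16), dtype=np.int32)
byte_size = data.size * data.itemsize

# Create and register a system shared-memory region holding the input.
handle = shm.create_shared_memory_region("input_region", "/input_key", byte_size)
shm.set_shared_memory_region(handle, [data])
client.register_system_shared_memory("input_region", "/input_key", byte_size)

inp = httpclient.InferInput("INPUT0", data.shape, "INT32")
inp.set_shared_memory("input_region", byte_size)
# ... issue inference requests that reference the region ...

# Per the comments above, unregistering while a request still holds a
# reference only takes effect once that request's responses complete.
client.unregister_system_shared_memory("input_region")
shm.destroy_shared_memory_region(handle)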
+if(NOT (${TRITON_ENABLE_HTTP} AND ${TRITON_ENABLE_GRPC})) + return() +endif() + +add_subdirectory(tritonfrontend) + +file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION ${TRITON_VERSION}) +configure_file(../../LICENSE LICENSE.txt COPYONLY) +configure_file(setup.py setup.py @ONLY) + +set(WHEEL_DEPENDS + ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION + ${CMAKE_CURRENT_BINARY_DIR}/LICENSE.txt + ${CMAKE_CURRENT_BINARY_DIR}/setup.py + ${CMAKE_CURRENT_BINARY_DIR}/tritonfrontend + py-bindings +) + +set(wheel_stamp_file "stamp.whl") + +add_custom_command( + OUTPUT "${wheel_stamp_file}" + COMMAND python3 + ARGS + "${CMAKE_CURRENT_SOURCE_DIR}/build_wheel.py" + --dest-dir "${CMAKE_CURRENT_BINARY_DIR}/generic" + --binding-path $ + DEPENDS ${WHEEL_DEPENDS} +) + +add_custom_target( + frontend-server-wheel ALL + DEPENDS + "${wheel_stamp_file}" +) + + +# Wheel +set(WHEEL_OUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/generic/wheel/dist/") +install( + DIRECTORY + ${WHEEL_OUT_DIR} + DESTINATION "${CMAKE_INSTALL_PREFIX}/python" +) \ No newline at end of file diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py new file mode 100755 index 0000000000..875dd32a70 --- /dev/null +++ b/src/python/build_wheel.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +import os +import pathlib +import re +import shutil +import subprocess +import sys +from distutils.dir_util import copy_tree +from tempfile import mkstemp + + +def fail_if(p, msg): + if p: + print("error: {}".format(msg), file=sys.stderr) + sys.exit(1) + + +def mkdir(path): + pathlib.Path(path).mkdir(parents=True, exist_ok=True) + + +def touch(path): + pathlib.Path(path).touch() + + +def cpdir(src, dest): + copy_tree(src, dest, preserve_symlinks=1) + + +def sed(pattern, replace, source, dest=None): + name = None + if dest: + name = dest + if dest is None: + fd, name = mkstemp() + + with open(source, "r") as fin, open(name, "w") as fout: + for line in fin: + out = re.sub(pattern, replace, line) + fout.write(out) + + if not dest: + shutil.copyfile(name, source) + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--dest-dir", type=str, required=True, help="Destination directory." + ) + parser.add_argument( + "--binding-path", + type=str, + required=True, + help="Path to Triton Frontend Python binding.", + ) + + FLAGS = parser.parse_args() + + FLAGS.triton_version = None + with open("TRITON_VERSION", "r") as vfile: + FLAGS.triton_version = vfile.readline().strip() + + FLAGS.whl_dir = os.path.join(FLAGS.dest_dir, "wheel") + + print("=== Building in: {}".format(os.getcwd())) + print("=== Using builddir: {}".format(FLAGS.whl_dir)) + print("Adding package files") + mkdir(os.path.join(FLAGS.whl_dir, "tritonfrontend")) + shutil.copy( + "tritonfrontend/__init__.py", os.path.join(FLAGS.whl_dir, "tritonfrontend") + ) + # Type checking marker file indicating support for type checkers. + # https://peps.python.org/pep-0561/ + shutil.copy( + "tritonfrontend/py.typed", os.path.join(FLAGS.whl_dir, "tritonfrontend") + ) + cpdir("tritonfrontend/_c", os.path.join(FLAGS.whl_dir, "tritonfrontend", "_c")) + cpdir("tritonfrontend/_api", os.path.join(FLAGS.whl_dir, "tritonfrontend", "_api")) + PYBIND_LIB = os.path.basename(FLAGS.binding_path) + shutil.copyfile( + FLAGS.binding_path, + os.path.join(FLAGS.whl_dir, "tritonfrontend", "_c", PYBIND_LIB), + ) + + shutil.copyfile("LICENSE.txt", os.path.join(FLAGS.whl_dir, "LICENSE.txt")) + shutil.copyfile("setup.py", os.path.join(FLAGS.whl_dir, "setup.py")) + + os.chdir(FLAGS.whl_dir) + print("=== Building wheel") + args = ["python3", "setup.py", "bdist_wheel"] + + wenv = os.environ.copy() + wenv["VERSION"] = FLAGS.triton_version + wenv["TRITON_PYBIND"] = PYBIND_LIB + p = subprocess.Popen(args, env=wenv) + p.wait() + fail_if(p.returncode != 0, "setup.py failed") + + cpdir("dist", FLAGS.dest_dir) + + print(f"=== Output wheel file is in: {FLAGS.dest_dir}") + touch(os.path.join(FLAGS.dest_dir, "stamp.whl")) + + +if __name__ == "__main__": + main() diff --git a/src/python/examples/example.py b/src/python/examples/example.py new file mode 100644 index 0000000000..2d2ca78920 --- /dev/null +++ b/src/python/examples/example.py @@ -0,0 +1,84 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib + +import numpy as np +import tritonclient.http as httpclient +import tritonserver +from tritonfrontend import KServeHttp + + +def main(): + # Constructing path to Model Repository + model_path = f"{pathlib.Path(__file__).parent.resolve()}/example_model_repository" + # Selecting Server Options + server_options = tritonserver.Options( + server_id="ExampleServer", + model_repository=model_path, + log_error=True, + log_info=True, + log_warn=True, + ) + + # Creating server instance + server = tritonserver.Server(server_options).start(wait_until_ready=True) + + # Selecting Options for KServeHttp Frontend + http_options = KServeHttp.Options(port=8005) + + # or http_service = KServeHttp.Server(server, http_options) & http_service.stop() + with KServeHttp(server, http_options) as http_service: + # The identity model returns an exact duplicate of the input data as output + model_name = "identity" + url = "localhost:8005" + + # Create a Triton client + client = httpclient.InferenceServerClient(url=url) + + # Prepare input data + input_data = np.array([["Roger Roger"]], dtype=object) + + # Create input and output objects + inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + + results = client.infer(model_name, inputs=inputs) + + # Get the output data + output_data = results.as_numpy("OUTPUT0") + + print("--------------------- INFERENCE RESULTS ---------------------") + print("Output data:", output_data) + print("-------------------------------------------------------------") + + server.stop() + + +if __name__ == "__main__": + main() diff --git a/src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb b/src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb new file mode 100755 index 0000000000..63f78fecb4 Binary files /dev/null and b/src/python/examples/example_model_repository/identity/1/model.savedmodel/saved_model.pb differ diff --git a/src/python/examples/example_model_repository/identity/config.pbtxt b/src/python/examples/example_model_repository/identity/config.pbtxt new file mode 100644 index 0000000000..ae83e47556 --- /dev/null +++ b/src/python/examples/example_model_repository/identity/config.pbtxt @@ -0,0 +1,44 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity" +platform: "tensorflow_savedmodel" +max_batch_size: 8 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ -1 ] + } +] diff --git a/src/python/setup.py b/src/python/setup.py new file mode 100755 index 0000000000..ee1e7c0ec4 --- /dev/null +++ b/src/python/setup.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import sys + +from setuptools import find_packages, setup + +if "--plat-name" in sys.argv: + PLATFORM_FLAG = sys.argv[sys.argv.index("--plat-name") + 1] +else: + PLATFORM_FLAG = "any" + +if "VERSION" not in os.environ: + raise Exception("envvar VERSION must be specified") + +VERSION = os.environ["VERSION"] + +try: + from wheel.bdist_wheel import bdist_wheel as _bdist_wheel + + class bdist_wheel(_bdist_wheel): + def finalize_options(self): + _bdist_wheel.finalize_options(self) + self.root_is_pure = False + + def get_tag(self): + pyver, abi, plat = "py3", "none", PLATFORM_FLAG + return pyver, abi, plat + +except ImportError: + bdist_wheel = None + +this_directory = os.path.abspath(os.path.dirname(__file__)) + +data_files = [ + ("", ["LICENSE.txt"]), +] + +# Type checking marker file indicating support for type checkers. +# https://peps.python.org/pep-0561/ +# Type hints for c extension generated by mypy +platform_package_data = [ + os.environ["TRITON_PYBIND"], + "py.typed", + "_c/__init__.pyi", + "_c/triton_bindings.pyi", +] + +gpu_extras = ["cupy-cuda12x"] +test_extras = ["pytest"] +all_extras = gpu_extras + test_extras + +setup( + name="tritonfrontend", + version=VERSION, + author="NVIDIA Inc.", + author_email="sw-dl-triton@nvidia.com", + description="Triton Inference Server In-Process Python API", + license="BSD", + url="https://developer.nvidia.com/nvidia-triton-inference-server", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: Information Technology", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Image Recognition", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Topic :: Utilities", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Environment :: Console", + "Natural Language :: English", + "Operating System :: OS Independent", + ], + packages=find_packages(), + package_data={ + "": platform_package_data, + }, + zip_safe=False, + cmdclass={"bdist_wheel": bdist_wheel}, + data_files=data_files, + install_requires=["tritonserver", "pydantic"], + extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras}, +) diff --git a/src/python/tritonfrontend/CMakeLists.txt b/src/python/tritonfrontend/CMakeLists.txt new file mode 100644 index 0000000000..e22be30602 --- /dev/null +++ b/src/python/tritonfrontend/CMakeLists.txt @@ -0,0 +1,181 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required(VERSION 3.18) + +# ================= Ensures Package is Structured Properly ================== +# Top level module entry point and typed marker +file(COPY __init__.py DESTINATION .) +file(COPY py.typed DESTINATION .) +# Copy the '__init__.py' for the '_c' module +file(COPY _c/__init__.py DESTINATION ./_c/.) +file(COPY _c/__init__.pyi DESTINATION ./_c/.) +file(COPY _c/tritonfrontend_bindings.pyi DESTINATION ./_c/.) +# Find and copy _api modules +file(GLOB PYTHON_MODULE_FILES ./_api/*.py) +file(COPY ${PYTHON_MODULE_FILES} DESTINATION ./_api/.) +# ================================= END ===================================== + + +# =================== Downloading and Installing pybind11 =================== +include(FetchContent) + +FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11.git + GIT_TAG v2.13.1 + GIT_SHALLOW ON +) + +FetchContent_MakeAvailable(pybind11) +# ================================= END ===================================== + +# ================== Collect the Dependencies =============================== +set( + PYTHON_FRONTEND_BINDING_DEPS + ../../shared_memory_manager.h + ../../shared_memory_manager.cc + ../../data_compressor.h + ../../common.h + ../../common.cc + ../../restricted_features.h + ../../tracer.h + $<$:../../tracer.cc> + ../../classification.cc +) + +set(PY_BINDING_DEPENDENCY_LIBS + triton-common-json + triton-common-logging + triton-core-serverapi + triton-core-serverstub + ) + +# Conditional Linking Based on Flags +if(${TRITON_ENABLE_HTTP}) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + http-endpoint-library + ) +endif() + +if(${TRITON_ENABLE_GRPC}) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + grpc-endpoint-library + ) +endif() + +if(${TRITON_ENABLE_GPU}) + find_package(CUDAToolkit REQUIRED) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + CUDA::cudart + ) +endif() + +if(${TRITON_ENABLE_TRACING}) + message("TRACING/STATS IS CURRENTLY NOT SUPPORTED.") + find_package(absl CONFIG REQUIRED) + find_package(CURL CONFIG REQUIRED) + find_package(nlohmann_json CONFIG REQUIRED) + find_package(opentelemetry-cpp CONFIG REQUIRED) + list(APPEND PY_BINDING_DEPENDENCY_LIBS + tracing-library + ) +endif() + +# ===================== End of Collection =================================== + + +# ================== Create Python Frontend Bindings ======================== +set( + PYTHON_FRONTEND_BINDING_SRCS + _c/tritonfrontend.h + _c/tritonfrontend_pybind.cc +) + +pybind11_add_module( + py-bindings + MODULE + ${PYTHON_FRONTEND_BINDING_DEPS} + ${PYTHON_FRONTEND_BINDING_SRCS} +) + +target_include_directories(py-bindings PRIVATE ${CMAKE_SOURCE_DIR}/src) + +target_link_libraries( + py-bindings + PRIVATE + ${PY_BINDING_DEPENDENCY_LIBS} +) + +if(${TRITON_ENABLE_HTTP}) + 
target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_HTTP=1 + ) +endif() + +if(${TRITON_ENABLE_GRPC}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_GRPC=1 + ) +endif() + +if(${TRITON_ENABLE_GPU}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_GPU=1 + PRIVATE TRITON_MIN_COMPUTE_CAPABILITY=${TRITON_MIN_COMPUTE_CAPABILITY} + ) +endif() + +if(${TRITON_ENABLE_TRACING}) + target_include_directories( + py-bindings + PRIVATE ${OPENTELEMETRY_CPP_INCLUDE_DIRS} + ) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_TRACING=1 + ) +endif() + +if(${TRITON_ENABLE_STATS}) + target_compile_definitions( + py-bindings + PRIVATE TRITON_ENABLE_STATS=1 + ) +endif() + + +set_property(TARGET py-bindings PROPERTY OUTPUT_NAME tritonfrontend_bindings) + +set_target_properties( + py-bindings + PROPERTIES + BUILD_RPATH "$ORIGIN:/opt/tritonserver/lib" +) +# ===================== End of Python Bindings ============================== diff --git a/src/python/tritonfrontend/__init__.py b/src/python/tritonfrontend/__init__.py new file mode 100644 index 0000000000..48eaf64e8b --- /dev/null +++ b/src/python/tritonfrontend/__init__.py @@ -0,0 +1,33 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# triton/server/src/python/tritonfrontend/__init__.py + +import builtins +from importlib.metadata import PackageNotFoundError, version + +from tritonfrontend._api._kservegrpc import KServeGrpc +from tritonfrontend._api._kservehttp import KServeHttp diff --git a/src/python/tritonfrontend/__init__.pyi b/src/python/tritonfrontend/__init__.pyi new file mode 100644 index 0000000000..0afb0cb886 --- /dev/null +++ b/src/python/tritonfrontend/__init__.pyi @@ -0,0 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Need to automate stubgen process as a part of build: https://github.com/triton-inference-server/server/pull/7501#discussion_r1720135228 diff --git a/src/python/tritonfrontend/_api/__init__.py b/src/python/tritonfrontend/_api/__init__.py new file mode 100644 index 0000000000..dc1c939c66 --- /dev/null +++ b/src/python/tritonfrontend/_api/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
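With the package __init__ above, KServeHttp and KServeGrpc become the public entry points of tritonfrontend. A minimal usage sketch, assuming a model repository already exists on disk; the repository path and ports are illustrative, and the option values shown are simply the defaults declared in _api/_kservehttp.py and _api/_kservegrpc.py:

    import tritonserver
    from tritonfrontend import KServeGrpc, KServeHttp

    # Start the in-process server (see src/python/examples/example.py for a fuller version).
    server = tritonserver.Server(
        tritonserver.Options(model_repository="/path/to/model_repository")
    ).start(wait_until_ready=True)

    # Expose the same server over both protocols; each frontend is started on
    # __enter__ and stopped on __exit__.
    with KServeHttp(server, KServeHttp.Options(port=8000)) as http_service, KServeGrpc(
        server, KServeGrpc.Options(port=8001)
    ) as grpc_service:
        ...  # drive requests with tritonclient.http / tritonclient.grpc

    server.stop()
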
diff --git a/src/python/tritonfrontend/_api/_error_mapping.py b/src/python/tritonfrontend/_api/_error_mapping.py new file mode 100644 index 0000000000..39a1e9aeb1 --- /dev/null +++ b/src/python/tritonfrontend/_api/_error_mapping.py @@ -0,0 +1,48 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import tritonserver +from tritonfrontend._c.tritonfrontend_bindings import ( + AlreadyExistsError, + InternalError, + InvalidArgumentError, + NotFoundError, + TritonError, + UnavailableError, + UnknownError, + UnsupportedError, +) + +ERROR_MAPPING = { + TritonError: tritonserver.TritonError, + NotFoundError: tritonserver.NotFoundError, + UnknownError: tritonserver.UnknownError, + InternalError: tritonserver.InternalError, + InvalidArgumentError: tritonserver.InvalidArgumentError, + UnavailableError: tritonserver.UnavailableError, + AlreadyExistsError: tritonserver.AlreadyExistsError, + UnsupportedError: tritonserver.UnsupportedError, +} diff --git a/src/python/tritonfrontend/_api/_kservegrpc.py b/src/python/tritonfrontend/_api/_kservegrpc.py new file mode 100644 index 0000000000..b8f199ac53 --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservegrpc.py @@ -0,0 +1,136 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys +from enum import IntEnum +from typing import Union + +import tritonserver +from pydantic import Field +from pydantic.dataclasses import dataclass +from tritonfrontend._api._error_mapping import ERROR_MAPPING +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError, + TritonError, + TritonFrontendGrpc, +) + + +# Enum (mirroring C++ format) +class Grpc_compression_level(IntEnum): + NONE = 0 + LOW = 1 + MED = 2 + HIGH = 3 + COUNT = 4 + + +class KServeGrpc: + Grpc_compression_level = ( + Grpc_compression_level # Include the enum as a class attribute + ) + + # triton::server::grpc::Options + @dataclass + class Options: + # triton::server::grpc::SocketOptions + address: str = "0.0.0.0" + port: int = Field(8001, ge=0, le=65535) + reuse_port: bool = False + # triton::server::grpc::SslOptions + use_ssl: bool = False + server_cert: str = "" + server_key: str = "" + root_cert: str = "" + use_mutual_auth: bool = False + # triton::server::grpc::KeepAliveOptions + keepalive_time_ms: int = Field(7_200_000, ge=0) + keepalive_timeout_ms: int = Field(20_000, ge=0) + keepalive_permit_without_calls: bool = False + http2_max_pings_without_data: int = Field(2, ge=0) + http2_min_recv_ping_interval_without_data_ms: int = Field(300_000, ge=0) + http2_max_ping_strikes: int = Field(2, ge=0) + max_connection_age_ms: int = Field(0, ge=0) + max_connection_age_grace_ms: int = Field(0, ge=0) + + # triton::server::grpc::Options + + infer_compression_level: Union[ + int, Grpc_compression_level + ] = Grpc_compression_level.NONE + infer_allocation_pool_size: int = Field(8, ge=0) + forward_header_pattern: str = "" + # DLIS-7215: Add restricted protocol support + # restricted_protocols: str = "" + + def __post_init__(self): + if isinstance(self.infer_compression_level, Grpc_compression_level): + self.infer_compression_level = self.infer_compression_level.value + + def __init__(self, server: tritonserver, options: "KServeGrpc.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer + + # If no options provided, default options are selected + if options is None: + options = KServeGrpc.Options() + + if not isinstance(options, KServeGrpc.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeGrpc.Options" + ) + + # Converts dataclass instance -> python dictionary -> unordered_map> + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + + self.triton_frontend = TritonFrontendGrpc(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... 
from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None + + def __enter__(self): + self.triton_frontend.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None + + def start(self): + try: + self.triton_frontend.start() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None + + def stop(self): + try: + self.triton_frontend.stop() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None diff --git a/src/python/tritonfrontend/_api/_kservegrpc.pyi b/src/python/tritonfrontend/_api/_kservegrpc.pyi new file mode 100644 index 0000000000..c81d3d6afc --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservegrpc.pyi @@ -0,0 +1,74 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from enum import IntEnum + +import tritonserver +from _typeshed import Incomplete +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError as InvalidArgumentError, +) +from tritonfrontend._c.tritonfrontend_bindings import ( + TritonFrontendGrpc as TritonFrontendGrpc, +) + +class Grpc_compression_level(IntEnum): + NONE = 0 + LOW = 1 + MED = 2 + HIGH = 3 + COUNT = 4 + +class KServeGrpc: + Grpc_compression_level = Grpc_compression_level + class Options: + address: str + port: int + reuse_port: bool + use_ssl: bool + server_cert: str + server_key: str + root_cert: str + use_mutual_auth: bool + keepalive_time_ms: int + keepalive_timeout_ms: int + keepalive_permit_without_calls: bool + http2_max_pings_without_data: int + http2_min_recv_ping_interval_without_data_ms: int + http2_max_ping_strikes: int + max_connection_age_ms: int + max_connection_age_grace_ms: int + infer_compression_level: int | Grpc_compression_level + infer_allocation_pool_size: int + forward_header_pattern: str + def __post_init__(self) -> None: ... 
+ class Server: + triton_frontend: Incomplete + def __init__(self, server: tritonserver, options: KServeGrpc.Options = None) -> None: ... + def __enter__(self): ... + def __exit__(self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: types.TracebackType | None) -> None: ... + def start(self): ... + def stop(self): ... diff --git a/src/python/tritonfrontend/_api/_kservehttp.py b/src/python/tritonfrontend/_api/_kservehttp.py new file mode 100644 index 0000000000..4a5abef4a3 --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservehttp.py @@ -0,0 +1,96 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import sys +from typing import Union + +import tritonserver +from pydantic import Field +from pydantic.dataclasses import dataclass +from tritonfrontend._api._error_mapping import ERROR_MAPPING +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError, + TritonError, + TritonFrontendHttp, +) + + +class KServeHttp: + @dataclass + class Options: + address: str = "0.0.0.0" + port: int = Field(8000, ge=0, le=65535) + reuse_port: bool = False + thread_count: int = Field(8, ge=0) + header_forward_pattern: str = "" + # DLIS-7215: Add restricted protocol support + # restricted_protocols: list + + def __init__(self, server: tritonserver, options: "KServeHttp.Options" = None): + try: + server_ptr = server._ptr() # TRITONSERVER_Server pointer + + # If no options provided, default options are selected + if options is None: + options = KServeHttp.Options() + + if not isinstance(options, KServeHttp.Options): + raise InvalidArgumentError( + "Incorrect type for options. options argument must be of type KServeHttp.Options" + ) + + options_dict: dict[str, Union[int, bool, str]] = options.__dict__ + # Converts dataclass instance -> python dictionary -> unordered_map> + + self.triton_frontend = TritonFrontendHttp(server_ptr, options_dict) + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + # raise ... 
from None masks the tritonfrontend Error from being added in traceback + raise ERROR_MAPPING[exc_type](exc_value) from None + + def __enter__(self): + self.triton_frontend.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.triton_frontend.stop() + if exc_type: + raise ERROR_MAPPING[exc_type](exc_value) from None + + def start(self): + try: + self.triton_frontend.start() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None + + def stop(self): + try: + self.triton_frontend.stop() + except TritonError: + exc_type, exc_value, _ = sys.exc_info() + raise ERROR_MAPPING[exc_type](exc_value) from None diff --git a/src/python/tritonfrontend/_api/_kservehttp.pyi b/src/python/tritonfrontend/_api/_kservehttp.pyi new file mode 100644 index 0000000000..60f3997f39 --- /dev/null +++ b/src/python/tritonfrontend/_api/_kservehttp.pyi @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import tritonserver +from _typeshed import Incomplete +from tritonfrontend._c.tritonfrontend_bindings import ( + InvalidArgumentError as InvalidArgumentError, +) +from tritonfrontend._c.tritonfrontend_bindings import ( + TritonFrontendHttp as TritonFrontendHttp, +) + +class KServeHttp: + class Options: + address: str + port: int + reuse_port: bool + thread_count: int + header_forward_pattern: str + class Server: + triton_frontend: Incomplete + def __init__(self, server: tritonserver, options: KServeHttp.Options = None) -> None: ... + def __enter__(self): ... + def __exit__(self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: types.TracebackType | None) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... diff --git a/src/python/tritonfrontend/_c/__init__.py b/src/python/tritonfrontend/_c/__init__.py new file mode 100644 index 0000000000..3e892ede64 --- /dev/null +++ b/src/python/tritonfrontend/_c/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from .tritonfrontend_bindings import * diff --git a/src/python/tritonfrontend/_c/__init__.pyi b/src/python/tritonfrontend/_c/__init__.pyi new file mode 100644 index 0000000000..99eaf9dace --- /dev/null +++ b/src/python/tritonfrontend/_c/__init__.pyi @@ -0,0 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from tritonfrontend._c.tritonfrontend_bindings import * diff --git a/src/python/tritonfrontend/_c/tritonfrontend.h b/src/python/tritonfrontend/_c/tritonfrontend.h new file mode 100644 index 0000000000..172147f566 --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend.h @@ -0,0 +1,139 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include // For shared_ptr +#include +#include + +#include "../../../common.h" +#include "../../../restricted_features.h" +#include "../../../shared_memory_manager.h" +#include "../../../tracer.h" +#include "triton/common/logging.h" +#include "triton/core/tritonserver.h" + + +struct TRITONSERVER_Server {}; + +namespace triton { namespace server { namespace python { + +// base exception for all Triton error code +struct TritonError : public std::runtime_error { + explicit TritonError(const std::string& what) : std::runtime_error(what) {} +}; + +// triton::core::python exceptions map 1:1 to TRITONSERVER_Error_Code. 
+struct UnknownError : public TritonError { + explicit UnknownError(const std::string& what) : TritonError(what) {} +}; +struct InternalError : public TritonError { + explicit InternalError(const std::string& what) : TritonError(what) {} +}; +struct NotFoundError : public TritonError { + explicit NotFoundError(const std::string& what) : TritonError(what) {} +}; +struct InvalidArgumentError : public TritonError { + explicit InvalidArgumentError(const std::string& what) : TritonError(what) {} +}; +struct UnavailableError : public TritonError { + explicit UnavailableError(const std::string& what) : TritonError(what) {} +}; +struct UnsupportedError : public TritonError { + explicit UnsupportedError(const std::string& what) : TritonError(what) {} +}; +struct AlreadyExistsError : public TritonError { + explicit AlreadyExistsError(const std::string& what) : TritonError(what) {} +}; + +void +ThrowIfError(TRITONSERVER_Error* err) +{ + if (err == nullptr) { + return; + } + std::shared_ptr managed_err( + err, TRITONSERVER_ErrorDelete); + std::string msg = TRITONSERVER_ErrorMessage(err); + switch (TRITONSERVER_ErrorCode(err)) { + case TRITONSERVER_ERROR_INTERNAL: + throw InternalError(std::move(msg)); + case TRITONSERVER_ERROR_NOT_FOUND: + throw NotFoundError(std::move(msg)); + case TRITONSERVER_ERROR_INVALID_ARG: + throw InvalidArgumentError(std::move(msg)); + case TRITONSERVER_ERROR_UNAVAILABLE: + throw UnavailableError(std::move(msg)); + case TRITONSERVER_ERROR_UNSUPPORTED: + throw UnsupportedError(std::move(msg)); + case TRITONSERVER_ERROR_ALREADY_EXISTS: + throw AlreadyExistsError(std::move(msg)); + default: + throw UnknownError(std::move(msg)); + } +} + + +template +class TritonFrontend { + private: + std::shared_ptr server_; + std::unique_ptr service; + triton::server::RestrictedFeatures restricted_features; + // TODO: [DLIS-7194] Add support for TraceManager & SharedMemoryManager + // triton::server::TraceManager trace_manager_; + // triton::server::SharedMemoryManager shm_manager_; + + public: + TritonFrontend(uintptr_t server_mem_addr, UnorderedMapType data) + { + TRITONSERVER_Server* server_ptr = + reinterpret_cast(server_mem_addr); + + server_.reset(server_ptr, EmptyDeleter); + + ThrowIfError(FrontendServer::Create( + server_, data, nullptr /* TraceManager */, + nullptr /* SharedMemoryManager */, restricted_features, &service)); + }; + + // TODO: [DLIS-7194] Add support for TraceManager & SharedMemoryManager + // TritonFrontend( + // uintptr_t server_mem_addr, UnorderedMapType data, + // TraceManager trace_manager, SharedMemoryManager shm_manager) + + void StartService() { ThrowIfError(service->Start()); }; + void StopService() { ThrowIfError(service->Stop()); }; + + // The frontend does not own the TRITONSERVER_Server* object. + // Hence, deleting the underlying server instance, + // will cause a double-free when the core bindings attempt to + // delete the TRITONSERVER_Server instance. + static void EmptyDeleter(TRITONSERVER_Server* obj){}; +}; + +}}} // namespace triton::server::python diff --git a/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi b/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi new file mode 100644 index 0000000000..535693a5cb --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend_bindings.pyi @@ -0,0 +1,44 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from tritonfrontend import AlreadyExistsError as AlreadyExistsError +from tritonfrontend import InternalError as InternalError +from tritonfrontend import InvalidArgumentError as InvalidArgumentError +from tritonfrontend import NotFoundError as NotFoundError +from tritonfrontend import TritonError as TritonError +from tritonfrontend import UnavailableError as UnavailableError +from tritonfrontend import UnknownError as UnknownError +from tritonfrontend import UnsupportedError as UnsupportedError + +class TritonFrontendGrpc: + def __init__(self, arg0: int, arg1: dict[str, bool | int | str]) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... + +class TritonFrontendHttp: + def __init__(self, arg0: int, arg1: dict[str, bool | int | str]) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... diff --git a/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc b/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc new file mode 100644 index 0000000000..86a0ac1c41 --- /dev/null +++ b/src/python/tritonfrontend/_c/tritonfrontend_pybind.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include + +#include "../../../grpc/grpc_server.h" +#include "../../../http_server.h" +#include "triton/core/tritonserver.h" +#include "tritonfrontend.h" + + +namespace py = pybind11; + +namespace triton { namespace server { namespace python { + + +PYBIND11_MODULE(tritonfrontend_bindings, m) +{ + m.doc() = "Python bindings for Triton Inference Server Frontend Endpoints"; + + auto tfe = py::register_exception(m, "TritonError"); + py::register_exception(m, "UnknownError", tfe.ptr()); + py::register_exception(m, "InternalError", tfe.ptr()); + py::register_exception(m, "NotFoundError", tfe.ptr()); + py::register_exception( + m, "InvalidArgumentError", tfe.ptr()); + py::register_exception(m, "UnavailableError", tfe.ptr()); + py::register_exception(m, "UnsupportedError", tfe.ptr()); + py::register_exception( + m, "AlreadyExistsError", tfe.ptr()); + + + py::class_>(m, "TritonFrontendHttp") + .def(py::init()) + .def("start", &TritonFrontend::StartService) + .def("stop", &TritonFrontend::StopService); + + py::class_>( + m, "TritonFrontendGrpc") + .def(py::init()) + .def( + "start", &TritonFrontend< + triton::server::grpc::Server, + triton::server::grpc::Server>::StartService) + .def( + "stop", &TritonFrontend< + triton::server::grpc::Server, + triton::server::grpc::Server>::StopService); +} + +}}} // namespace triton::server::python diff --git a/src/python/tritonfrontend/py.typed b/src/python/tritonfrontend/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/shared_memory_manager.cc b/src/shared_memory_manager.cc index 1f4a77e887..7b845709a1 100644 --- a/src/shared_memory_manager.cc +++ b/src/shared_memory_manager.cc @@ -69,7 +69,8 @@ TRITONSERVER_Error* SharedMemoryManager::GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id) + int64_t* device_id, + std::shared_ptr* shm_info) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_UNSUPPORTED, @@ -408,9 +409,9 @@ SharedMemoryManager::RegisterSystemSharedMemory( } shared_memory_map_.insert(std::make_pair( - name, std::unique_ptr(new SharedMemoryInfo( + name, std::make_shared( name, shm_key, offset, byte_size, shm_fd, mapped_addr, - TRITONSERVER_MEMORY_CPU, 0)))); + TRITONSERVER_MEMORY_CPU, 0))); return nullptr; // success } @@ -444,9 +445,9 @@ SharedMemoryManager::RegisterCUDASharedMemory( name, reinterpret_cast(mapped_addr), byte_size)); shared_memory_map_.insert(std::make_pair( - name, std::unique_ptr(new CUDASharedMemoryInfo( + name, std::make_shared( name, "", 0, byte_size, 0, mapped_addr, TRITONSERVER_MEMORY_GPU, - device_id, cuda_shm_handle)))); + device_id, cuda_shm_handle))); return nullptr; // success } @@ -456,7 +457,8 @@ 
TRITONSERVER_Error* SharedMemoryManager::GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id) + int64_t* device_id, + std::shared_ptr* shm_info) { // protect shared_memory_map_ from concurrent access std::lock_guard lock(mu_); @@ -494,6 +496,10 @@ SharedMemoryManager::GetMemoryInfo( .c_str()); } + if (shm_info != nullptr) { + *shm_info = std::static_pointer_cast(it->second); + } + if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) { *shm_mapped_addr = (void*)((uint8_t*)it->second->mapped_addr_ + it->second->offset_ + offset); @@ -561,11 +567,19 @@ SharedMemoryManager::GetStatus( } else { auto it = shared_memory_map_.find(name); if (it == shared_memory_map_.end()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - std::string( - "Unable to find system shared memory region: '" + name + "'") - .c_str()); + if (memory_type == TRITONSERVER_MEMORY_GPU) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + std::string( + "Unable to find cuda shared memory region: '" + name + "'") + .c_str()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + std::string( + "Unable to find system shared memory region: '" + name + "'") + .c_str()); + } } if (it->second->kind_ != memory_type) { @@ -632,6 +646,7 @@ SharedMemoryManager::UnregisterAll(TRITONSERVER_MemoryType memory_type) TRITONSERVER_Error* err = UnregisterHelper(it->first, memory_type); if (err != nullptr) { unregister_fails.push_back(it->first); + LOG_VERBOSE(1) << TRITONSERVER_ErrorMessage(err); } } } @@ -645,6 +660,7 @@ SharedMemoryManager::UnregisterAll(TRITONSERVER_MemoryType memory_type) ; if (err != nullptr) { unregister_fails.push_back(it->first); + LOG_VERBOSE(1) << TRITONSERVER_ErrorMessage(err); } } } @@ -669,6 +685,15 @@ SharedMemoryManager::UnregisterHelper( // Must hold the lock on register_mu_ while calling this function. auto it = shared_memory_map_.find(name); if (it != shared_memory_map_.end() && it->second->kind_ == memory_type) { + if (it->second.use_count() > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "Cannot unregister shared memory region '" + name + + "', it is currently in use.") + .c_str()); + } + if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) { RETURN_IF_ERR( UnmapSharedMemory(it->second->mapped_addr_, it->second->byte_size_)); diff --git a/src/shared_memory_manager.h b/src/shared_memory_manager.h index 51eb0f0786..393fd29128 100644 --- a/src/shared_memory_manager.h +++ b/src/shared_memory_manager.h @@ -1,4 +1,4 @@ -// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -50,6 +50,48 @@ class SharedMemoryManager { SharedMemoryManager() = default; ~SharedMemoryManager(); + /// A struct that records the shared memory regions registered by the shared + /// memory manager. 
+ struct SharedMemoryInfo { + SharedMemoryInfo( + const std::string& name, const std::string& shm_key, + const size_t offset, const size_t byte_size, int shm_fd, + void* mapped_addr, const TRITONSERVER_MemoryType kind, + const int64_t device_id) + : name_(name), shm_key_(shm_key), offset_(offset), + byte_size_(byte_size), shm_fd_(shm_fd), mapped_addr_(mapped_addr), + kind_(kind), device_id_(device_id) + { + } + + std::string name_; + std::string shm_key_; + size_t offset_; + size_t byte_size_; + int shm_fd_; + void* mapped_addr_; + TRITONSERVER_MemoryType kind_; + int64_t device_id_; + }; + +#ifdef TRITON_ENABLE_GPU + struct CUDASharedMemoryInfo : SharedMemoryInfo { + CUDASharedMemoryInfo( + const std::string& name, const std::string& shm_key, + const size_t offset, const size_t byte_size, int shm_fd, + void* mapped_addr, const TRITONSERVER_MemoryType kind, + const int64_t device_id, const cudaIpcMemHandle_t* cuda_ipc_handle) + : SharedMemoryInfo( + name, shm_key, offset, byte_size, shm_fd, mapped_addr, kind, + device_id), + cuda_ipc_handle_(*cuda_ipc_handle) + { + } + + cudaIpcMemHandle_t cuda_ipc_handle_; + }; +#endif + /// Add a shared memory block representing shared memory in system /// (CPU) memory to the manager. Return TRITONSERVER_ERROR_ALREADY_EXISTS /// if a shared memory block of the same name already exists in the manager. @@ -90,11 +132,18 @@ class SharedMemoryManager { /// \param memory_type Returns the type of the memory /// \param device_id Returns the device id associated with the /// memory block - /// \return a TRITONSERVER_Error indicating success or failure. + /// \param shm_info Returns a shared pointer reference(read-only) to the + /// shared memory block's information. + /// This pointer will automatically increase the usage count, preventing + /// unregistration while the reference is held. The reference must be cleared + /// or set to nullptr when no longer needed, to decrease the count and allow + /// unregistration. + /// \return a TRITONSERVER_Error indicating success or + /// failure. TRITONSERVER_Error* GetMemoryInfo( const std::string& name, size_t offset, size_t byte_size, void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, - int64_t* device_id); + int64_t* device_id, std::shared_ptr* shm_info); #ifdef TRITON_ENABLE_GPU /// Get the CUDA memory handle associated with the block name. @@ -139,50 +188,8 @@ class SharedMemoryManager { TRITONSERVER_Error* UnregisterHelper( const std::string& name, TRITONSERVER_MemoryType memory_type); - /// A struct that records the shared memory regions registered by the shared - /// memory manager. 
- struct SharedMemoryInfo { - SharedMemoryInfo( - const std::string& name, const std::string& shm_key, - const size_t offset, const size_t byte_size, int shm_fd, - void* mapped_addr, const TRITONSERVER_MemoryType kind, - const int64_t device_id) - : name_(name), shm_key_(shm_key), offset_(offset), - byte_size_(byte_size), shm_fd_(shm_fd), mapped_addr_(mapped_addr), - kind_(kind), device_id_(device_id) - { - } - - std::string name_; - std::string shm_key_; - size_t offset_; - size_t byte_size_; - int shm_fd_; - void* mapped_addr_; - TRITONSERVER_MemoryType kind_; - int64_t device_id_; - }; - -#ifdef TRITON_ENABLE_GPU - struct CUDASharedMemoryInfo : SharedMemoryInfo { - CUDASharedMemoryInfo( - const std::string& name, const std::string& shm_key, - const size_t offset, const size_t byte_size, int shm_fd, - void* mapped_addr, const TRITONSERVER_MemoryType kind, - const int64_t device_id, const cudaIpcMemHandle_t* cuda_ipc_handle) - : SharedMemoryInfo( - name, shm_key, offset, byte_size, shm_fd, mapped_addr, kind, - device_id), - cuda_ipc_handle_(*cuda_ipc_handle) - { - } - - cudaIpcMemHandle_t cuda_ipc_handle_; - }; -#endif - using SharedMemoryStateMap = - std::map>; + std::map>; // A map between the name and the details of the associated // shared memory block SharedMemoryStateMap shared_memory_map_; diff --git a/tools/add_copyright.py b/tools/add_copyright.py new file mode 100644 index 0000000000..34432bb0c6 --- /dev/null +++ b/tools/add_copyright.py @@ -0,0 +1,365 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import argparse +import os +import re +import subprocess +import sys +from datetime import datetime +from typing import Callable, Dict, Optional, Sequence + +current_year = str(datetime.now().year) + +ROOT_DIR = os.path.join(os.path.dirname(__file__), os.path.pardir) + +LICENSE_PATH = os.path.join(ROOT_DIR, "LICENSE") + +COPYRIGHT_YEAR_PAT = re.compile( + r"Copyright( \(c\))? 
(\d{4})?-?(\d{4}), NVIDIA CORPORATION" +) + + +def has_copyright(content: str) -> bool: + return COPYRIGHT_YEAR_PAT.search(content) + + +def update_copyright_year( + path: str, content: Optional[str] = None, disallow_range: bool = False +) -> str: + """ + Updates the copyright year in the provided file. + If the copyright is not present in the file, this function has no effect. + """ + if content is None: + with open(path, "r") as f: + content = f.read() + + match = COPYRIGHT_YEAR_PAT.search(content) + min_year = match.groups()[1] or match.groups()[2] + + new_copyright = f"Copyright{match.groups()[0] or ''} " + if min_year < current_year and not disallow_range: + new_copyright += f"{min_year}-{current_year}" + else: + new_copyright += f"{current_year}" + new_copyright += ", NVIDIA CORPORATION" + + updated_content = COPYRIGHT_YEAR_PAT.sub(new_copyright, content) + + if content != updated_content: + with open(path, "w") as f: + f.write(updated_content) + + +def update_and_get_license() -> str: + """ + Updates the copyright year in the LICENSE file if necessary and then + returns its contents. + """ + # TODO: Check if this is right - if the license file needs to have a range, + # we need to remove the range before returning the license text. + # + # License file should always have the current year. + update_copyright_year(LICENSE_PATH, disallow_range=True) + + with open(LICENSE_PATH, "r") as license_file: + return license_file.read() + + +LICENSE_TEXT = update_and_get_license() + +# +# Header manipulation helpers +# + + +def prefix_lines(content: str, prefix: str) -> str: + # NOTE: This could have been done via `textwrap.indent`, but we're not actually indenting, + # so it seems semantically wrong to do that. + return prefix + f"\n{prefix}".join(content.splitlines()) + + +def insert_after(regex: str) -> Callable[[str], str]: + """ + Builds a callback that will insert a provided header after + the specified regular expression. If the expression is not + found in the file contents, the header will be inserted at the + beginning of the file. + + Args: + regex: The regular expression to match. + + Returns: + A callable that can be used as the `add_header` argument to `update_or_add_header`. + """ + + def add_header(header: str, content: str) -> str: + match = re.match(regex, content) + + if match is None: + return header + "\n" + content + + insertion_point = match.span()[-1] + + return content[:insertion_point] + f"{header}\n" + content[insertion_point:] + + return add_header + + +def update_or_add_header( + path: str, header: str, add_header: Optional[Callable[[str, str], str]] = None +): + """ + Updates in place or adds a new copyright header to the specified file. + + Args: + path: The path of the file. + header: The contents of the copyright header. + add_header: A callback that receives the copyright header and file contents and + controls how the contents of the file are updated. By default, the copyright + header is prepended to the file. + """ + with open(path, "r") as f: + content = f.read() + + if has_copyright(content): + update_copyright_year(path, content) + return + + add_header = add_header or (lambda header, content: header + "\n" + content) + + content = add_header(header, content) + + # As a sanity check, make sure we didn't accidentally add the copyright header + # twice, or add a new header when one was already present. 
+    if content.count("Copyright (c)") != 1:
+        print(
+            f"WARNING: Something went wrong while processing: {path}!\n"
+            "Please check if the copyright header was included twice or wasn't added at all. "
+        )
+
+    with open(path, "w") as f:
+        f.write(content)
+
+
+# Each file type requires slightly different handling when inserting the copyright
+# header. For example, for C++ files, the header must be prefixed with `//` and for
+# shell scripts, it must be prefixed with `#` and must be inserted *after* the shebang.
+#
+# This mapping stores callables that return whether a handler wants to process a specified
+# file based on the path along with callables that will accept the file path and update
+# it with the copyright header.
+FILE_TYPE_HANDLERS: Dict[Callable[[str], bool], Callable[[str], None]] = {}
+
+
+#
+# Path matching callables
+# These allow registered functions to more easily specify what kinds of
+# paths they should be applied to.
+#
+def has_ext(exts: Sequence[str]):
+    def has_ext_impl(path: str):
+        _, ext = os.path.splitext(path)
+        return ext in exts
+
+    return has_ext_impl
+
+
+def basename_is(expected_path: str):
+    return lambda path: os.path.basename(path) == expected_path
+
+
+def path_contains(expected: str):
+    return lambda path: expected in path
+
+
+def any_of(*funcs: Sequence[Callable[[str], bool]]):
+    return lambda path: any(func(path) for func in funcs)
+
+
+#
+# File handlers for different types of files.
+# Many types of files require very similar handling - those are combined where possible.
+#
+
+
+def register(match: Callable[[str], bool]):
+    def register_impl(func):
+        FILE_TYPE_HANDLERS[match] = func
+        return func
+
+    return register_impl
+
+
+@register(
+    any_of(
+        has_ext([".py", ".sh", ".bash", ".yaml", ".pbtxt"]),
+        basename_is("CMakeLists.txt"),
+        path_contains("Dockerfile"),
+    )
+)
+def py_or_shell_like(path):
+    update_or_add_header(
+        path,
+        prefix_lines(LICENSE_TEXT, "# "),
+        # Insert the header *after* the shebang.
+        # NOTE: This could break if there is a shebang-like pattern elsewhere in the file.
+        # In that case, this could be edited to check only the first line of the file (after removing whitespace).
+        insert_after(r"#!(.*)\n"),
+    )
+
+
+@register(has_ext([".cc", ".h"]))
+def cpp(path):
+    update_or_add_header(path, prefix_lines(LICENSE_TEXT, "// "))
+
+
+@register(has_ext([".tpl"]))
+def tpl(path):
+    update_or_add_header(path, "{{/*\n" + prefix_lines(LICENSE_TEXT, "# ") + "\n*/}}")
+
+
+@register(has_ext([".html", ".md"]))
+def html_md(path):
+    update_or_add_header(path, "<!--\n" + LICENSE_TEXT + "\n-->")
+
+
+def add_copyrights(paths):
+    for path in paths:
+        for match, handler in FILE_TYPE_HANDLERS.items():
+            if match(path):
+                handler(path)
+                break
+        else:
+            print(
+                f"WARNING: No handler registered for file: {path}. Please add a new handler to {__file__}!"
+            )
+
+    subprocess.run(["git", "add"] + paths)
+
+    print(f"Processed copyright headers for {len(paths)} file(s).")
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Adds copyright headers to source files"
+    )
+    parser.add_argument("files", nargs="*")
+
+    args, _ = parser.parse_known_args()
+    add_copyrights(args.files)
+    return 0
+
+
+if __name__ == "__main__":
+    # sys.exit is important here to avoid the test-related imports below during normal execution.
+    sys.exit(main())
+
+
+#
+# Integration Tests
+#
+import tempfile
+
+import pytest
+
+
+# Processes provided text through the copyright hook by writing it to a temporary file.
+def process_text(content, extension): + with tempfile.NamedTemporaryFile("w+", suffix=extension) as f: + f.write(content) + f.flush() + + add_copyrights([f.name]) + + f.seek(0) + return f.read() + + +# We use this slightly weird hack to make sure the copyright hook does not do a text replacement +# of the parameters in the test, since they look exactly like copyright headers. +def make_copyright_text(text): + return f"Copyright {text}" + + +@pytest.mark.parametrize( + "content, expected", + [ + # Convert to range if the year that's already present is older than the current year. + ( + make_copyright_text("(c) 2018, NVIDIA CORPORATION"), + make_copyright_text(f"(c) 2018-{current_year}, NVIDIA CORPORATION"), + ), + ( + make_copyright_text("2018, NVIDIA CORPORATION"), + make_copyright_text(f"2018-{current_year}, NVIDIA CORPORATION"), + ), + # No effect if the year is current: + ( + make_copyright_text(f"(c) {current_year}, NVIDIA CORPORATION"), + make_copyright_text(f"(c) {current_year}, NVIDIA CORPORATION"), + ), + ( + make_copyright_text(f"{current_year}, NVIDIA CORPORATION"), + make_copyright_text(f"{current_year}, NVIDIA CORPORATION"), + ), + # If there is already a range, update the upper bound of the range: + ( + make_copyright_text("(c) 2018-2023, NVIDIA CORPORATION"), + make_copyright_text(f"(c) 2018-{current_year}, NVIDIA CORPORATION"), + ), + ], +) +def test_copyright_update(content, expected): + # We don't really care about the extension here - just needs to be something the hook will recognize. + assert process_text(content, ".py") == expected + + +@pytest.mark.parametrize( + "content, extension, expected", + [ + ("", ".cc", f"// {make_copyright_text(f'(c) {current_year}')}"), + ("", ".h", f"// {make_copyright_text(f'(c) {current_year}')}"), + ("", ".py", f"# {make_copyright_text(f'(c) {current_year}')}"), + ("", ".sh", f"# {make_copyright_text(f'(c) {current_year}')}"), + # Make sure copyright comes after shebangs + ( + "#!/bin/python\n", + ".py", + f"#!/bin/python\n# {make_copyright_text(f'(c) {current_year}')}", + ), + ( + "#!/bin/bash\n", + ".sh", + f"#!/bin/bash\n# {make_copyright_text(f'(c) {current_year}')}", + ), + ], +) +def test_adding_new_copyrights(content, extension, expected): + assert process_text(content, extension).startswith(expected) + + +def test_license_has_no_range(): + assert LICENSE_TEXT.startswith(f"Copyright (c) {current_year},")
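
Note: the FILE_TYPE_HANDLERS registry in tools/add_copyright.py is intended to be
extended when new file types appear in the repository (the script itself warns
"Please add a new handler" for unrecognized paths). As a minimal sketch, not part of
the patch above, a handler for hypothetical ".toml"/".cfg" files could be added next
to the existing handlers, reusing the names the script already defines (register,
has_ext, update_or_add_header, prefix_lines, LICENSE_TEXT):

    # Hypothetical handler, written as it would appear inside tools/add_copyright.py.
    @register(has_ext([".toml", ".cfg"]))
    def toml_cfg(path):
        # Both formats take "#"-style line comments, so the license text is prefixed
        # the same way as for Python and shell files.
        update_or_add_header(path, prefix_lines(LICENSE_TEXT, "# "))

Because add_copyrights() dispatches to the first matcher that claims a path, a new
handler only needs to name extensions that no earlier matcher already covers.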
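
Note: the new tooling can also be exercised outside of pre-commit. The sketch below is
illustrative only: the file names are placeholders, it assumes it is run from the root
of a git checkout, and it assumes pytest is installed (the integration tests live at the
bottom of tools/add_copyright.py itself).

    # Illustrative driver, not part of the patch above.
    import subprocess
    import sys

    # Pass the files to update as positional arguments, the same way pre-commit
    # forwards staged file names to the hook's entry point.
    subprocess.run(
        [sys.executable, "tools/add_copyright.py", "example.py", "example.cc"],
        check=True,
    )

    # Run the pytest-based integration tests embedded in the script.
    subprocess.run(
        [sys.executable, "-m", "pytest", "tools/add_copyright.py"],
        check=True,
    )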