build: Upgrade to 24.07, TRT-LLM 0.11.0, and Triton CLI v0.0.10 (#81)
rmccorm4 authored Aug 6, 2024
1 parent 449f6b8 · commit a050ec1
Showing 20 changed files with 1,544 additions and 543 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yaml
@@ -36,7 +36,7 @@ jobs:
build:
runs-on: ${{ matrix.os }}
container:
image: nvcr.io/nvidia/tritonserver:24.06-py3
image: nvcr.io/nvidia/tritonserver:24.07-py3
strategy:
fail-fast: false
matrix:
17 changes: 9 additions & 8 deletions README.md
@@ -22,8 +22,8 @@ and running the CLI from within the latest corresponding `tritonserver`
container image, which should have all necessary system dependencies installed.

For vLLM and TRT-LLM, you can use their respective images:
- `nvcr.io/nvidia/tritonserver:24.06-vllm-python-py3`
- `nvcr.io/nvidia/tritonserver:24.06-trtllm-python-py3`
- `nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3`
- `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`

If you decide to run the CLI on the host or in a custom image, please
see this list of [additional dependencies](#additional-dependencies-for-custom-environments)
@@ -38,6 +38,7 @@ matrix below:

| Triton CLI Version | TRT-LLM Version | Triton Container Tag |
|:------------------:|:---------------:|:--------------------:|
| 0.0.10 | v0.11.0 | 24.07 |
| 0.0.9 | v0.10.0 | 24.06 |
| 0.0.8 | v0.9.0 | 24.05 |
| 0.0.7 | v0.9.0 | 24.04 |
@@ -56,7 +57,7 @@ It is also possible to install from a specific branch name, a commit hash
or a tag name. For example to install `triton_cli` with a specific tag:

```bash
GIT_REF="0.0.9"
GIT_REF="0.0.10"
pip install git+https://github.com/triton-inference-server/triton_cli.git@${GIT_REF}
```

@@ -91,7 +92,7 @@ triton -h
triton import -m gpt2

# Start server pointing at the default model repository
triton start --image nvcr.io/nvidia/tritonserver:24.06-vllm-python-py3
triton start --image nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3

# Infer with CLI
triton infer -m gpt2 --prompt "machine learning is"
@@ -145,10 +146,10 @@ docker run -ti \
--shm-size=1g --ulimit memlock=-1 \
-v ${HOME}/models:/root/models \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
nvcr.io/nvidia/tritonserver:24.06-vllm-python-py3
nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3

# Install the Triton CLI
pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.9
pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.10

# Authenticate with huggingface for restricted models like Llama-2 and Llama-3
huggingface-cli login
@@ -214,10 +215,10 @@ docker run -ti \
-v /tmp:/tmp \
-v ${HOME}/models:/root/models \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
nvcr.io/nvidia/tritonserver:24.06-trtllm-python-py3
nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3

# Install the Triton CLI
pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.9
pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.10

# Authenticate with huggingface for restricted models like Llama-2 and Llama-3
huggingface-cli login
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
"grpcio>=1.64.0",
"directory-tree == 0.0.4", # may remove in future
"docker == 6.1.3",
"genai-perf @ git+https://github.com/triton-inference-server/client.git@r24.06#subdirectory=src/c++/perf_analyzer/genai-perf",
"genai-perf @ git+https://github.com/triton-inference-server/client.git@r24.07#subdirectory=src/c++/perf_analyzer/genai-perf",
# TODO: rely on tritonclient to pull in protobuf and numpy dependencies?
"numpy >=1.21,<2",
"protobuf>=3.7.0",
@@ -59,7 +59,7 @@ dependencies = [
"rich == 13.5.2",
# TODO: Test on cpu-only machine if [cuda] dependency is an issue,
# Use explicit client version matching genai-perf version for tagged release
"tritonclient[all] == 2.47",
"tritonclient[all] == 2.48",
"huggingface-hub >= 0.19.4",
# Testing
"pytest >= 8.1.1", # may remove later
2 changes: 1 addition & 1 deletion src/triton_cli/__init__.py
@@ -24,4 +24,4 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

__version__ = "0.0.9"
__version__ = "0.0.10"
20 changes: 14 additions & 6 deletions src/triton_cli/client/client.py
@@ -197,6 +197,13 @@ def generate_data(self, config: dict, data_mode: str):
infer_inputs.append(
self.__create_triton_input(name, shape, triton_dtype, data)
)
json_input = {
"name": name,
"shape": str(data.shape),
"dtype": triton_dtype,
"value": np.array_str(data),
}
logger.info(f"Input:\n{json.dumps(json_input, indent=4)}")

return infer_inputs

@@ -294,17 +301,18 @@ def __process_infer_result(self, result):
for output in response["outputs"]:
name = output["name"]
# TODO: Need special logic for string/bytes type
np_data = result.as_numpy(name)
data = result.as_numpy(name)
# WAR for LLMs
if np_data.dtype == np.object_:
if data.dtype == np.object_:
# Assume 2D-output (batch_size, texts)
texts = np_data.flatten()
np_data = np.array([text.decode("utf-8") for text in texts])
texts = data.flatten()
data = np.array([text.decode("utf-8") for text in texts])

output_data_str = np.array_str(np_data)
output_data_str = np.array_str(data)
json_output = {
"name": name,
"shape": str(np_data.shape),
"shape": str(data.shape),
"dtype": output["datatype"],
"value": output_data_str,
}
logger.info(f"Output:\n{json.dumps(json_output, indent=4)}")
6 changes: 3 additions & 3 deletions src/triton_cli/docker/Dockerfile
@@ -1,9 +1,9 @@
# TRT-LLM image contains engine building and runtime dependencies
FROM nvcr.io/nvidia/tritonserver:24.06-trtllm-python-py3
FROM nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3

# Setup vLLM Triton backend
RUN mkdir -p /opt/tritonserver/backends/vllm && \
wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/r24.06/src/model.py
wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/r24.07/src/model.py

# vLLM runtime dependencies
RUN pip install "vllm==0.4.3"
RUN pip install "vllm==0.5.0.post1"
110 changes: 63 additions & 47 deletions src/triton_cli/templates/trt_llm/postprocessing/1/model.py
@@ -52,22 +52,46 @@ def initialize(self, args):
* model_name: Model name
"""
# Parse model configs
model_config = json.loads(args["model_config"])
tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"]
self.skip_special_tokens = model_config["parameters"].get(
"skip_special_tokens", {"string_value": "true"}
)["string_value"].lower() in ["true", "1", "t", "y", "yes"]
model_config = json.loads(args['model_config'])
tokenizer_dir = model_config['parameters']['tokenizer_dir'][
'string_value']

skip_special_tokens = model_config['parameters'].get(
'skip_special_tokens')
if skip_special_tokens is not None:
skip_special_tokens_str = skip_special_tokens[
'string_value'].lower()
if skip_special_tokens_str in [
'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
]:
self.skip_special_tokens = skip_special_tokens_str in [
'true', '1', 't', 'y', 'yes'
]
else:
print(
f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default."
)
self.skip_special_tokens = True
else:
print(
f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default."
)
self.skip_special_tokens = True

self.tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True
)
self.tokenizer.pad_token = self.tokenizer.eos_token
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
legacy=False,
padding_side='left',
trust_remote_code=True)
if not self.tokenizer.pad_token:
self.tokenizer.pad_token = self.tokenizer.eos_token

# Parse model output configs
output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT")
output_config = pb_utils.get_output_config_by_name(
model_config, "OUTPUT")

# Convert Triton types to numpy types
self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])
self.output_dtype = pb_utils.triton_string_to_numpy(
output_config['data_type'])

def execute(self, requests):
"""`execute` must be implemented in every Python model. `execute`
@@ -96,31 +120,27 @@ def execute(self, requests):
for idx, request in enumerate(requests):
# Get input tensors
tokens_batch = pb_utils.get_input_tensor_by_name(
request, "TOKENS_BATCH"
).as_numpy()
request, 'TOKENS_BATCH').as_numpy()

# Get sequence length
sequence_lengths = pb_utils.get_input_tensor_by_name(
request, "SEQUENCE_LENGTH"
).as_numpy()
request, 'SEQUENCE_LENGTH').as_numpy()

# Get cum log probs
cum_log_probs = pb_utils.get_input_tensor_by_name(request, "CUM_LOG_PROBS")
cum_log_probs = pb_utils.get_input_tensor_by_name(
request, 'CUM_LOG_PROBS')

# Get sequence length
output_log_probs = pb_utils.get_input_tensor_by_name(
request, "OUTPUT_LOG_PROBS"
)
request, 'OUTPUT_LOG_PROBS')

# Get context logits
context_logits = pb_utils.get_input_tensor_by_name(
request, "CONTEXT_LOGITS"
)
request, 'CONTEXT_LOGITS')

# Get generation logits
generation_logits = pb_utils.get_input_tensor_by_name(
request, "GENERATION_LOGITS"
)
request, 'GENERATION_LOGITS')

# Reshape Input
# tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
@@ -132,54 +152,49 @@ def execute(self, requests):
# Create output tensors. You need pb_utils.Tensor
# objects to create pb_utils.InferenceResponse.
output_tensor = pb_utils.Tensor(
"OUTPUT", np.array(outputs).astype(self.output_dtype)
)
'OUTPUT',
np.array(outputs).astype(self.output_dtype))

outputs = []
outputs.append(output_tensor)

if cum_log_probs:
out_cum_log_probs = pb_utils.Tensor(
"OUT_CUM_LOG_PROBS", cum_log_probs.as_numpy()
)
out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
cum_log_probs.as_numpy())
outputs.append(out_cum_log_probs)
else:
out_cum_log_probs = pb_utils.Tensor(
"OUT_CUM_LOG_PROBS", np.array([[0.0]], dtype=np.float32)
)
'OUT_CUM_LOG_PROBS', np.array([[0.0]], dtype=np.float32))
outputs.append(out_cum_log_probs)

if output_log_probs:
out_output_log_probs = pb_utils.Tensor(
"OUT_OUTPUT_LOG_PROBS", output_log_probs.as_numpy()
)
'OUT_OUTPUT_LOG_PROBS', output_log_probs.as_numpy())
outputs.append(out_output_log_probs)
else:
out_output_log_probs = pb_utils.Tensor(
"OUT_OUTPUT_LOG_PROBS", np.array([[[0.0]]], dtype=np.float32)
)
'OUT_OUTPUT_LOG_PROBS',
np.array([[[0.0]]], dtype=np.float32))
outputs.append(out_output_log_probs)

if context_logits:
out_context_logits = pb_utils.Tensor(
"OUT_CONTEXT_LOGITS", context_logits.as_numpy()
)
out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
context_logits.as_numpy())
outputs.append(out_context_logits)
else:
out_context_logits = pb_utils.Tensor(
"OUT_CONTEXT_LOGITS", np.array([[[0.0]]], dtype=np.float32)
)
'OUT_CONTEXT_LOGITS', np.array([[[0.0]]],
dtype=np.float32))
outputs.append(out_context_logits)

if generation_logits:
out_generation_logits = pb_utils.Tensor(
"OUT_GENERATION_LOGITS", generation_logits.as_numpy()
)
'OUT_GENERATION_LOGITS', generation_logits.as_numpy())
outputs.append(out_generation_logits)
else:
out_generation_logits = pb_utils.Tensor(
"OUT_GENERATION_LOGITS", np.array([[[[0.0]]]], dtype=np.float32)
)
'OUT_GENERATION_LOGITS',
np.array([[[[0.0]]]], dtype=np.float32))
outputs.append(out_generation_logits)

# Create InferenceResponse. You can set an error here in case
@@ -189,7 +204,8 @@
#
# pb_utils.InferenceResponse(
# output_tensors=..., TritonError("An error occurred"))
inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
inference_response = pb_utils.InferenceResponse(
output_tensors=outputs)
responses.append(inference_response)

# You should return a list of pb_utils.InferenceResponse. Length
@@ -201,15 +217,15 @@ def finalize(self):
Implementing `finalize` function is optional. This function allows
the model to perform any necessary clean ups before exit.
"""
print("Cleaning up...")
print('Cleaning up...')

def _postprocessing(self, tokens_batch, sequence_lengths):
outputs = []
for batch_idx, beam_tokens in enumerate(tokens_batch):
for beam_idx, tokens in enumerate(beam_tokens):
seq_len = sequence_lengths[batch_idx][beam_idx]
output = self.tokenizer.decode(
tokens[:seq_len], skip_special_tokens=self.skip_special_tokens
)
outputs.append(output.encode("utf8"))
tokens[:seq_len],
skip_special_tokens=self.skip_special_tokens)
outputs.append(output.encode('utf8'))
return outputs
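The reworked `initialize` above now validates the `skip_special_tokens` parameter and warns on missing or unrecognized values instead of silently coercing them. A minimal sketch of that boolean-parameter parsing pattern as a standalone helper, assuming the usual Triton `{'string_value': ...}` parameter shape — the helper name and warning wording are illustrative, not part of the template:

```python
TRUTHY = {"true", "1", "t", "y", "yes"}
FALSY = {"false", "0", "f", "n", "no"}


def parse_bool_parameter(parameters: dict, key: str, default: bool = True) -> bool:
    """Read a Triton config parameter like {'string_value': 'true'} as a bool."""
    entry = parameters.get(key)
    if entry is None:
        print(f"[WARNING] '{key}' is not set; defaulting to {default}.")
        return default
    value = entry["string_value"].strip().lower()
    if value in TRUTHY:
        return True
    if value in FALSY:
        return False
    print(f"[WARNING] Unrecognized value '{entry['string_value']}' for '{key}'; defaulting to {default}.")
    return default


# Example usage against a parsed model_config:
params = {"skip_special_tokens": {"string_value": "yes"}}
assert parse_bool_parameter(params, "skip_special_tokens") is True
assert parse_bool_parameter({}, "skip_special_tokens") is True  # falls back to the default
```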
3 changes: 2 additions & 1 deletion src/triton_cli/templates/trt_llm/postprocessing/config.pbtxt
@@ -24,6 +24,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "postprocessing"
backend: "python"
max_batch_size: ${triton_max_batch_size}
input [
@@ -100,7 +101,7 @@ parameters {
parameters {
key: "skip_special_tokens"
value: {
string_value: "True"
string_value: "${skip_special_tokens}"
}
}
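The new `${skip_special_tokens}` placeholder follows the same `${...}` convention as the other templated fields (e.g. `${triton_max_batch_size}`) and gets filled in when the config template is rendered. A minimal sketch of what that substitution amounts to, using Python's `string.Template` purely for illustration — the actual rendering is done by the CLI/TRT-LLM tooling, not this snippet, and the filled-in values are assumed:

```python
from string import Template

# A trimmed-down stand-in for the templated config.pbtxt shown above.
config_template = Template(
    'name: "postprocessing"\n'
    'backend: "python"\n'
    "max_batch_size: ${triton_max_batch_size}\n"
    "parameters {\n"
    '  key: "skip_special_tokens"\n'
    "  value: {\n"
    '    string_value: "${skip_special_tokens}"\n'
    "  }\n"
    "}\n"
)

rendered = config_template.substitute(
    triton_max_batch_size=64,        # assumed value
    skip_special_tokens="true",      # assumed value
)
print(rendered)
```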
