From 17b371061869f038a264586276c164b60c2b5a9c Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth
Date: Mon, 23 Sep 2024 12:40:06 -0700
Subject: [PATCH 01/12] Working ensemble example

---
 README.md                                     |  1 +
 ci/L0_backend_vllm/vllm_backend/test.sh       |  9 +--
 .../vllm_backend/vllm_backend_test.py         |  3 +
 samples/{ => basic_model}/client.py           |  0
 .../model_repository}/1/model.json            |  0
 .../model_repository}/config.pbtxt            |  0
 samples/{ => basic_model}/prompts.txt         |  0
 samples/ensemble_model/README.md              | 18 ++++++
 samples/ensemble_model/client.py              |  0
 .../ensemble_model/config.pbtxt               | 57 +++++++++++++++++++
 .../model_repository/gpt2/1/model.json        |  5 ++
 .../model_repository/gpt2/config.pbtxt        |  2 +
 .../model_repository/prefix_model/1/model.py  | 18 ++++++
 .../prefix_model/config.pbtxt                 | 17 ++++++
 .../uppercase_model/1/model.py                | 17 ++++++
 .../uppercase_model/config.pbtxt              | 17 ++++++
 16 files changed, 160 insertions(+), 4 deletions(-)
 rename samples/{ => basic_model}/client.py (100%)
 rename samples/{model_repository/vllm_model => basic_model/model_repository}/1/model.json (100%)
 rename samples/{model_repository/vllm_model => basic_model/model_repository}/config.pbtxt (100%)
 rename samples/{ => basic_model}/prompts.txt (100%)
 create mode 100644 samples/ensemble_model/README.md
 create mode 100644 samples/ensemble_model/client.py
 create mode 100644 samples/ensemble_model/model_repository/ensemble_model/config.pbtxt
 create mode 100644 samples/ensemble_model/model_repository/gpt2/1/model.json
 create mode 100644 samples/ensemble_model/model_repository/gpt2/config.pbtxt
 create mode 100644 samples/ensemble_model/model_repository/prefix_model/1/model.py
 create mode 100644 samples/ensemble_model/model_repository/prefix_model/config.pbtxt
 create mode 100644 samples/ensemble_model/model_repository/uppercase_model/1/model.py
 create mode 100644 samples/ensemble_model/model_repository/uppercase_model/config.pbtxt

diff --git a/README.md b/README.md
index eb545e77..0a665962 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,7 @@ export TRITON_CONTAINER_VERSION=
     --endpoint=vertex-ai
     --upstream-container-version=${TRITON_CONTAINER_VERSION}
     --backend=python:r${TRITON_CONTAINER_VERSION}
+    --backend=ensemble
     --backend=vllm:r${TRITON_CONTAINER_VERSION}
 ```
 
diff --git a/ci/L0_backend_vllm/vllm_backend/test.sh b/ci/L0_backend_vllm/vllm_backend/test.sh
index 43b20af7..c6a4c660 100755
--- a/ci/L0_backend_vllm/vllm_backend/test.sh
+++ b/ci/L0_backend_vllm/vllm_backend/test.sh
@@ -35,7 +35,8 @@ SERVER_LOG="./vllm_backend_server.log"
 CLIENT_LOG="./vllm_backend_client.log"
 TEST_RESULT_FILE='test_results.txt'
 CLIENT_PY="./vllm_backend_test.py"
-SAMPLE_MODELS_REPO="../../../samples/model_repository"
+SAMPLE_BASIC_MODELS_REPO="../../../samples/basic_model/model_repository"
+SAMPLE_ENSEMBLE_MODELS_REPO="../../../samples/ensemble_model/model_repository"
 EXPECTED_NUM_TESTS=6
 
 # Helpers =======================================
@@ -49,7 +50,7 @@ function assert_curl_success {
 }
 
 rm -rf models && mkdir -p models
-cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
+cp -r ${SAMPLE_BASIC_MODELS_REPO}/vllm_model models/vllm_opt
 # `vllm_opt` model will be loaded on server start and stay loaded throughout
 # unittesting. To test vllm model load/unload we use a dedicated
 # `vllm_load_test`. To ensure that vllm's memory profiler will not error out
@@ -63,11 +64,11 @@ wget -P models/add_sub/1/ https://raw.githubusercontent.com/triton-inference-ser
 wget -P models/add_sub https://raw.githubusercontent.com/triton-inference-server/python_backend/main/examples/add_sub/config.pbtxt
 
 # Invalid model attribute
-cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_1/
+cp -r ${SAMPLE_BASIC_MODELS_REPO}/vllm_model models/vllm_invalid_1/
 sed -i 's/"disable_log_requests"/"invalid_attribute"/' models/vllm_invalid_1/1/model.json
 
 # Invalid model name
-cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_2/
+cp -r ${SAMPLE_BASIC_MODELS_REPO}/vllm_model models/vllm_invalid_2/
 sed -i 's/"facebook\/opt-125m"/"invalid_model"/' models/vllm_invalid_2/1/model.json
 
 RET=0
diff --git a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
index 8ca206f0..12b5c357 100644
--- a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
+++ b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
@@ -247,6 +247,9 @@ def _test_python_model(self):
             np.allclose(input0_data - input1_data, response.as_numpy("OUTPUT1"))
         )
 
+    def _test_ensemble_model(self):
+        pass
+
     def tearDown(self):
         self.triton_client.close()
 
diff --git a/samples/client.py b/samples/basic_model/client.py
similarity index 100%
rename from samples/client.py
rename to samples/basic_model/client.py
diff --git a/samples/model_repository/vllm_model/1/model.json b/samples/basic_model/model_repository/1/model.json
similarity index 100%
rename from samples/model_repository/vllm_model/1/model.json
rename to samples/basic_model/model_repository/1/model.json
diff --git a/samples/model_repository/vllm_model/config.pbtxt b/samples/basic_model/model_repository/config.pbtxt
similarity index 100%
rename from samples/model_repository/vllm_model/config.pbtxt
rename to samples/basic_model/model_repository/config.pbtxt
diff --git a/samples/prompts.txt b/samples/basic_model/prompts.txt
similarity index 100%
rename from samples/prompts.txt
rename to samples/basic_model/prompts.txt
diff --git a/samples/ensemble_model/README.md b/samples/ensemble_model/README.md
new file mode 100644
index 00000000..2c2018b2
--- /dev/null
+++ b/samples/ensemble_model/README.md
@@ -0,0 +1,18 @@
+
+
+
+```
+model_repository/
+├── gpt2 <--------- (vLLM model)
+│   ├── 1
+│   │   └── model.json
+│   └── config.pbtxt
+├── prefix_model (post-processing python model)
+│   ├── 1
+│   │   └── model.py
+│   └── config.pbtxt
+└── uppercase_model (pre-processing python model)
+    ├── 1
+    │   └── model.py
+    └── config.pbtxt
+```
\ No newline at end of file
diff --git a/samples/ensemble_model/client.py b/samples/ensemble_model/client.py
new file mode 100644
index 00000000..e69de29b
diff --git a/samples/ensemble_model/model_repository/ensemble_model/config.pbtxt b/samples/ensemble_model/model_repository/ensemble_model/config.pbtxt
new file mode 100644
index 00000000..19403e46
--- /dev/null
+++ b/samples/ensemble_model/model_repository/ensemble_model/config.pbtxt
@@ -0,0 +1,57 @@
+name: "ensemble_model"
+platform: "ensemble"
+max_batch_size: 1
+input [
+  {
+    name: "text_input"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+  }
+]
+output [
+  {
+    name: "text_output"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
+ensemble_scheduling {
+  step [
+    {
+      model_name: "uppercase_model"
+      model_version: -1
+      input_map {
+        key: "text_input"
+        value: "text_input"
+      }
+      output_map {
+        key: "text_output"
+        value: "uppercase_text"
+      }
+    },
+    {
+      model_name: "gpt2"
+      model_version: -1
+      input_map {
+        key: "text_input"
+        value: "uppercase_text"
+      }
+      output_map {
+        key: "text_output"
+        value: "gpt2_text"
+      }
+    },
+    {
+      model_name: "prefix_model"
+      model_version: -1
+      input_map {
+        key: "text_input"
+        value: "gpt2_text"
+      }
+      output_map {
+        key: "text_output"
+        value: "text_output"
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/samples/ensemble_model/model_repository/gpt2/1/model.json b/samples/ensemble_model/model_repository/gpt2/1/model.json
new file mode 100644
index 00000000..f782cd90
--- /dev/null
+++ b/samples/ensemble_model/model_repository/gpt2/1/model.json
@@ -0,0 +1,5 @@
+{
+    "model": "gpt2",
+    "disable_log_requests": true,
+    "gpu_memory_utilization": 0.85
+}
\ No newline at end of file
diff --git a/samples/ensemble_model/model_repository/gpt2/config.pbtxt b/samples/ensemble_model/model_repository/gpt2/config.pbtxt
new file mode 100644
index 00000000..291fdff0
--- /dev/null
+++ b/samples/ensemble_model/model_repository/gpt2/config.pbtxt
@@ -0,0 +1,2 @@
+backend: "vllm"
+instance_group [{kind: KIND_MODEL}]
diff --git a/samples/ensemble_model/model_repository/prefix_model/1/model.py b/samples/ensemble_model/model_repository/prefix_model/1/model.py
new file mode 100644
index 00000000..ca70cf8e
--- /dev/null
+++ b/samples/ensemble_model/model_repository/prefix_model/1/model.py
@@ -0,0 +1,18 @@
+import triton_python_backend_utils as pb_utils
+
+import numpy as np
+
+class TritonPythonModel:
+    def execute(self, requests):
+        responses = []
+        for request in requests:
+            input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input")
+            input_string = input_tensor.as_numpy()[0][0].decode('utf-8')
+            output_string = f"gpt2 results: {input_string}"
+            output_np_array = np.array([output_string.encode('utf-8')])
+            output_tensor = pb_utils.Tensor("text_output",
+                                            output_np_array)
+            inference_response = pb_utils.InferenceResponse(
+                output_tensors=[output_tensor])
+            responses.append(inference_response)
+        return responses
\ No newline at end of file
diff --git a/samples/ensemble_model/model_repository/prefix_model/config.pbtxt b/samples/ensemble_model/model_repository/prefix_model/config.pbtxt
new file mode 100644
index 00000000..59de7cd2
--- /dev/null
+++ b/samples/ensemble_model/model_repository/prefix_model/config.pbtxt
@@ -0,0 +1,17 @@
+name: "prefix_model"
+backend: "python"
+max_batch_size: 1
+input [
+  {
+    name: "text_input"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+  }
+]
+output [
+  {
+    name: "text_output"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+  }
+]
\ No newline at end of file
diff --git a/samples/ensemble_model/model_repository/uppercase_model/1/model.py b/samples/ensemble_model/model_repository/uppercase_model/1/model.py
new file mode 100644
index 00000000..a29c3bb1
--- /dev/null
+++ b/samples/ensemble_model/model_repository/uppercase_model/1/model.py
@@ -0,0 +1,17 @@
+import triton_python_backend_utils as pb_utils
+import numpy as np
+
+class TritonPythonModel:
+    def execute(self, requests):
+        responses = []
+        for request in requests:
+            input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input")
+            input_string = input_tensor.as_numpy()[0][0].decode('utf-8')
+            output_string = input_string.upper()
+            output_np_array = np.array([output_string.encode('utf-8')])
+            output_tensor = pb_utils.Tensor("text_output",
+                                            output_np_array)
+            inference_response = pb_utils.InferenceResponse(
+                output_tensors=[output_tensor])
+            responses.append(inference_response)
+        return responses
\ No newline at end of file
diff --git a/samples/ensemble_model/model_repository/uppercase_model/config.pbtxt b/samples/ensemble_model/model_repository/uppercase_model/config.pbtxt
new file mode 100644
index 00000000..7ff4582e
--- /dev/null
+++ b/samples/ensemble_model/model_repository/uppercase_model/config.pbtxt
@@ -0,0 +1,17 @@
+name: "uppercase_model"
+backend: "python"
+max_batch_size: 1
+input [
+  {
+    name: "text_input"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+  }
+]
+output [
+  {
+    name: "text_output"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+  }
+]
\ No newline at end of file

From 13bd54059ab598c7bcc043843654fc51e61477d7 Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth
Date: Mon, 23 Sep 2024 15:13:27 -0700
Subject: [PATCH 02/12] adding ensemble testing

---
 .../vllm_backend/ensemble_config.pbtxt        | 33 +++++++++++
 ci/L0_backend_vllm/vllm_backend/test.sh       | 16 ++++--
 .../vllm_backend/vllm_backend_test.py         | 10 +++-
 samples/{basic_model => }/client.py           |  0
 samples/ensemble_model/README.md              | 18 ------
 samples/ensemble_model/client.py              |  0
 .../ensemble_model/config.pbtxt               | 57 -------------------
 .../model_repository/gpt2/1/model.json        |  5 --
 .../model_repository/gpt2/config.pbtxt        |  2 -
 .../model_repository/prefix_model/1/model.py  | 18 ------
 .../prefix_model/config.pbtxt                 | 17 ------
 .../uppercase_model/1/model.py                | 17 ------
 .../uppercase_model/config.pbtxt              | 17 ------
 .../vllm_model}/1/model.json                  |  0
 .../vllm_model}/config.pbtxt                  |  0
 samples/{basic_model => }/prompts.txt         |  0
 16 files changed, 50 insertions(+), 160 deletions(-)
 create mode 100644 ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
 rename samples/{basic_model => }/client.py (100%)
 delete mode 100644 samples/ensemble_model/README.md
 delete mode 100644 samples/ensemble_model/client.py
 delete mode 100644 samples/ensemble_model/model_repository/ensemble_model/config.pbtxt
 delete mode 100644 samples/ensemble_model/model_repository/gpt2/1/model.json
 delete mode 100644 samples/ensemble_model/model_repository/gpt2/config.pbtxt
 delete mode 100644 samples/ensemble_model/model_repository/prefix_model/1/model.py
 delete mode 100644 samples/ensemble_model/model_repository/prefix_model/config.pbtxt
 delete mode 100644 samples/ensemble_model/model_repository/uppercase_model/1/model.py
 delete mode 100644 samples/ensemble_model/model_repository/uppercase_model/config.pbtxt
 rename samples/{basic_model/model_repository => model_repository/vllm_model}/1/model.json (100%)
 rename samples/{basic_model/model_repository => model_repository/vllm_model}/config.pbtxt (100%)
 rename samples/{basic_model => }/prompts.txt (100%)

diff --git a/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt b/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
new file mode 100644
index 00000000..96bebdde
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
@@ -0,0 +1,33 @@
+name: "ensemble_model"
+platform: "ensemble"
+max_batch_size: 1
+input [
+  {
+    name: "text_input"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
+output [
+  {
+    name: "text_output"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
+ensemble_scheduling {
+  step [
+    {
+      model_name: "vllm_model"
+      model_version: -1
+      input_map {
+        key: "text_input"
+        value: "text_input"
+      }
+      output_map {
+        key: "text_output"
+        value: "text_output"
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/ci/L0_backend_vllm/vllm_backend/test.sh b/ci/L0_backend_vllm/vllm_backend/test.sh
index c6a4c660..7e691f38 100755
--- a/ci/L0_backend_vllm/vllm_backend/test.sh
+++ b/ci/L0_backend_vllm/vllm_backend/test.sh
@@ -35,8 +35,7 @@ SERVER_LOG="./vllm_backend_server.log"
 CLIENT_LOG="./vllm_backend_client.log"
 TEST_RESULT_FILE='test_results.txt'
 CLIENT_PY="./vllm_backend_test.py"
-SAMPLE_BASIC_MODELS_REPO="../../../samples/basic_model/model_repository"
-SAMPLE_ENSEMBLE_MODELS_REPO="../../../samples/ensemble_model/model_repository"
+SAMPLE_MODELS_REPO="../../../samples/model_repository"
 EXPECTED_NUM_TESTS=6
 
 # Helpers =======================================
@@ -50,7 +49,7 @@ function assert_curl_success {
 }
 
 rm -rf models && mkdir -p models
-cp -r ${SAMPLE_BASIC_MODELS_REPO}/vllm_model models/vllm_opt
+cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
 # `vllm_opt` model will be loaded on server start and stay loaded throughout
 # unittesting. To test vllm model load/unload we use a dedicated
 # `vllm_load_test`. To ensure that vllm's memory profiler will not error out
@@ -64,13 +63,18 @@ wget -P models/add_sub/1/ https://raw.githubusercontent.com/triton-inference-ser
 wget -P models/add_sub https://raw.githubusercontent.com/triton-inference-server/python_backend/main/examples/add_sub/config.pbtxt
 
 # Invalid model attribute
-cp -r ${SAMPLE_BASIC_MODELS_REPO}/vllm_model models/vllm_invalid_1/
+cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_1/
 sed -i 's/"disable_log_requests"/"invalid_attribute"/' models/vllm_invalid_1/1/model.json
 
 # Invalid model name
-cp -r ${SAMPLE_BASIC_MODELS_REPO}/vllm_model models/vllm_invalid_2/
+cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_2/
 sed -i 's/"facebook\/opt-125m"/"invalid_model"/' models/vllm_invalid_2/1/model.json
 
+
+mkdir -p models/ensemble_model/1
+
+cp -r ensemble_config.pbtxt models/ensemble_model/config.pbtxt
+
 RET=0
 
 run_server
@@ -167,4 +171,4 @@ fi
 
 collect_artifacts_from_subdir
 
-exit $RET
+exit $RET
\ No newline at end of file
diff --git a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
index 12b5c357..26616afb 100644
--- a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
+++ b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
@@ -48,6 +48,7 @@ def setUp(self):
         self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
         self.vllm_model_name = "vllm_opt"
         self.python_model_name = "add_sub"
+        self.enseble_model_name = "ensemble_model"
         self.vllm_load_test = "vllm_load_test"
 
     def test_vllm_triton_backend(self):
@@ -163,6 +164,12 @@ def test_exclude_input_in_output_true(self):
             expected_output=expected_output,
         )
 
+    def test_ensemble_model(self):
+        # Test to ensure that ensemble models are supported in vllm container.
+        # If ensemble support not present, triton will error out at model loading stage.
+        self.triton_client.load_model(self.enseble_model_name)
+        self.assertTrue(self.triton_client.is_model_ready(self.enseble_model_name))
+
     def _test_vllm_model(
         self,
         prompts,
@@ -247,9 +254,6 @@ def _test_python_model(self):
             np.allclose(input0_data - input1_data, response.as_numpy("OUTPUT1"))
         )
 
-    def _test_ensemble_model(self):
-        pass
-
     def tearDown(self):
         self.triton_client.close()
 
diff --git a/samples/basic_model/client.py b/samples/client.py
similarity index 100%
rename from samples/basic_model/client.py
rename to samples/client.py
diff --git a/samples/ensemble_model/README.md b/samples/ensemble_model/README.md
deleted file mode 100644
index 2c2018b2..00000000
--- a/samples/ensemble_model/README.md
+++ /dev/null
@@ -1,18 +0,0 @@
-
-
-
-```
-model_repository/
-├── gpt2 <--------- (vLLM model)
-│   ├── 1
-│   │   └── model.json
-│   └── config.pbtxt
-├── prefix_model (post-processing python model)
-│   ├── 1
-│   │   └── model.py
-│   └── config.pbtxt
-└── uppercase_model (pre-processing python model)
-    ├── 1
-    │   └── model.py
-    └── config.pbtxt
-```
\ No newline at end of file
diff --git a/samples/ensemble_model/client.py b/samples/ensemble_model/client.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/samples/ensemble_model/model_repository/ensemble_model/config.pbtxt b/samples/ensemble_model/model_repository/ensemble_model/config.pbtxt
deleted file mode 100644
index 19403e46..00000000
--- a/samples/ensemble_model/model_repository/ensemble_model/config.pbtxt
+++ /dev/null
@@ -1,57 +0,0 @@
-name: "ensemble_model"
-platform: "ensemble"
-max_batch_size: 1
-input [
-  {
-    name: "text_input"
-    data_type: TYPE_STRING
-    dims: [ 1 ]
-  }
-]
-output [
-  {
-    name: "text_output"
-    data_type: TYPE_STRING
-    dims: [ -1 ]
-  }
-]
-ensemble_scheduling {
-  step [
-    {
-      model_name: "uppercase_model"
-      model_version: -1
-      input_map {
-        key: "text_input"
-        value: "text_input"
-      }
-      output_map {
-        key: "text_output"
-        value: "uppercase_text"
-      }
-    },
-    {
-      model_name: "gpt2"
-      model_version: -1
-      input_map {
-        key: "text_input"
-        value: "uppercase_text"
-      }
-      output_map {
-        key: "text_output"
-        value: "gpt2_text"
-      }
-    },
-    {
-      model_name: "prefix_model"
-      model_version: -1
-      input_map {
-        key: "text_input"
-        value: "gpt2_text"
-      }
-      output_map {
-        key: "text_output"
-        value: "text_output"
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/samples/ensemble_model/model_repository/gpt2/1/model.json b/samples/ensemble_model/model_repository/gpt2/1/model.json
deleted file mode 100644
index f782cd90..00000000
--- a/samples/ensemble_model/model_repository/gpt2/1/model.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-    "model": "gpt2",
-    "disable_log_requests": true,
-    "gpu_memory_utilization": 0.85
-}
\ No newline at end of file
diff --git a/samples/ensemble_model/model_repository/gpt2/config.pbtxt b/samples/ensemble_model/model_repository/gpt2/config.pbtxt
deleted file mode 100644
index 291fdff0..00000000
--- a/samples/ensemble_model/model_repository/gpt2/config.pbtxt
+++ /dev/null
@@ -1,2 +0,0 @@
-backend: "vllm"
-instance_group [{kind: KIND_MODEL}]
diff --git a/samples/ensemble_model/model_repository/prefix_model/1/model.py b/samples/ensemble_model/model_repository/prefix_model/1/model.py
deleted file mode 100644
index ca70cf8e..00000000
--- a/samples/ensemble_model/model_repository/prefix_model/1/model.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import triton_python_backend_utils as pb_utils
-
-import numpy as np
-
-class TritonPythonModel:
-    def execute(self, requests):
-        responses = []
-        for request in requests:
-            input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input")
-            input_string = input_tensor.as_numpy()[0][0].decode('utf-8')
-            output_string = f"gpt2 results: {input_string}"
-            output_np_array = np.array([output_string.encode('utf-8')])
-            output_tensor = pb_utils.Tensor("text_output",
-                                            output_np_array)
-            inference_response = pb_utils.InferenceResponse(
-                output_tensors=[output_tensor])
-            responses.append(inference_response)
-        return responses
\ No newline at end of file
diff --git a/samples/ensemble_model/model_repository/prefix_model/config.pbtxt b/samples/ensemble_model/model_repository/prefix_model/config.pbtxt
deleted file mode 100644
index 59de7cd2..00000000
--- a/samples/ensemble_model/model_repository/prefix_model/config.pbtxt
+++ /dev/null
@@ -1,17 +0,0 @@
-name: "prefix_model"
-backend: "python"
-max_batch_size: 1
-input [
-  {
-    name: "text_input"
-    data_type: TYPE_STRING
-    dims: [ 1 ]
-  }
-]
-output [
-  {
-    name: "text_output"
-    data_type: TYPE_STRING
-    dims: [ 1 ]
-  }
-]
\ No newline at end of file
diff --git a/samples/ensemble_model/model_repository/uppercase_model/1/model.py b/samples/ensemble_model/model_repository/uppercase_model/1/model.py
deleted file mode 100644
index a29c3bb1..00000000
--- a/samples/ensemble_model/model_repository/uppercase_model/1/model.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import triton_python_backend_utils as pb_utils
-import numpy as np
-
-class TritonPythonModel:
-    def execute(self, requests):
-        responses = []
-        for request in requests:
-            input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input")
-            input_string = input_tensor.as_numpy()[0][0].decode('utf-8')
-            output_string = input_string.upper()
-            output_np_array = np.array([output_string.encode('utf-8')])
-            output_tensor = pb_utils.Tensor("text_output",
-                                            output_np_array)
-            inference_response = pb_utils.InferenceResponse(
-                output_tensors=[output_tensor])
-            responses.append(inference_response)
-        return responses
\ No newline at end of file
diff --git a/samples/ensemble_model/model_repository/uppercase_model/config.pbtxt b/samples/ensemble_model/model_repository/uppercase_model/config.pbtxt
deleted file mode 100644
index 7ff4582e..00000000
--- a/samples/ensemble_model/model_repository/uppercase_model/config.pbtxt
+++ /dev/null
@@ -1,17 +0,0 @@
-name: "uppercase_model"
-backend: "python"
-max_batch_size: 1
-input [
-  {
-    name: "text_input"
-    data_type: TYPE_STRING
-    dims: [ 1 ]
-  }
-]
-output [
-  {
-    name: "text_output"
-    data_type: TYPE_STRING
-    dims: [ 1 ]
-  }
-]
\ No newline at end of file
diff --git a/samples/basic_model/model_repository/1/model.json b/samples/model_repository/vllm_model/1/model.json
similarity index 100%
rename from samples/basic_model/model_repository/1/model.json
rename to samples/model_repository/vllm_model/1/model.json
diff --git a/samples/basic_model/model_repository/config.pbtxt b/samples/model_repository/vllm_model/config.pbtxt
similarity index 100%
rename from samples/basic_model/model_repository/config.pbtxt
rename to samples/model_repository/vllm_model/config.pbtxt
diff --git a/samples/basic_model/prompts.txt b/samples/prompts.txt
similarity index 100%
rename from samples/basic_model/prompts.txt
rename to samples/prompts.txt

From cc0dfeb84215e99b5aac5e59dc7a779142ddf119 Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth
Date: Mon, 23 Sep 2024 15:14:09 -0700
Subject: [PATCH 03/12] copyright info

---
 .../vllm_backend/ensemble_config.pbtxt        | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt b/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
index 96bebdde..d7e4baea 100644
--- a/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
+++ b/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
@@ -1,3 +1,29 @@
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 name: "ensemble_model"
 platform: "ensemble"
 max_batch_size: 1

From 54ace7af5b808d50856698914ecfa05d8a26ca95 Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth
Date: Mon, 23 Sep 2024 15:34:29 -0700
Subject: [PATCH 04/12] ci modifications

---
 ci/L0_backend_vllm/vllm_backend/curl.out              | 0
 ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt | 2 +-
 ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py  | 6 ++++++
 3 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100644 ci/L0_backend_vllm/vllm_backend/curl.out

diff --git a/ci/L0_backend_vllm/vllm_backend/curl.out b/ci/L0_backend_vllm/vllm_backend/curl.out
new file mode 100644
index 00000000..e69de29b
diff --git a/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt b/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
index d7e4baea..991488c4 100644
--- a/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
+++ b/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
@@ -44,7 +44,7 @@ output [
 ensemble_scheduling {
   step [
     {
-      model_name: "vllm_model"
+      model_name: "vllm_opt"
       model_version: -1
       input_map {
         key: "text_input"
diff --git a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
index 26616afb..cdedaf3d 100644
--- a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
+++ b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
@@ -167,6 +167,12 @@ def test_ensemble_model(self):
         # Test to ensure that ensemble models are supported in vllm container.
         # If ensemble support not present, triton will error out at model loading stage.
+
+        # Before loading ensemble model, the dependency model is loaded.
+        self.triton_client.load_model(self.vllm_model_name)
+        self.assertTrue(self.triton_client.is_model_ready(self.vllm_model_name))
+
+
         self.triton_client.load_model(self.enseble_model_name)
         self.assertTrue(self.triton_client.is_model_ready(self.enseble_model_name))

From a600a2cf8248377fb3a0ce455c0efd8a5cc1352c Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth
Date: Mon, 23 Sep 2024 15:36:14 -0700
Subject: [PATCH 05/12] Removing extra curl.out file

---
 .gitignore                               | 1 +
 ci/L0_backend_vllm/vllm_backend/curl.out | 0
 2 files changed, 1 insertion(+)
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/curl.out

diff --git a/.gitignore b/.gitignore
index 9d4769c9..6b696074 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,6 +50,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
+*.out
 
 # Translations
 *.mo
diff --git a/ci/L0_backend_vllm/vllm_backend/curl.out b/ci/L0_backend_vllm/vllm_backend/curl.out
deleted file mode 100644
index e69de29b..00000000

From 78bf0fbaac9c4e1e40e717f0411ff6c48b5d6afe Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth
Date: Mon, 23 Sep 2024 16:23:47 -0700
Subject: [PATCH 06/12] updated formatting

---
 ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
index cdedaf3d..6724ed18 100644
--- a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
+++ b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
@@ -167,12 +167,11 @@ def test_exclude_input_in_output_true(self):
     def test_ensemble_model(self):
         # Test to ensure that ensemble models are supported in vllm container.
         # If ensemble support not present, triton will error out at model loading stage.
-
+
         # Before loading ensemble model, the dependency model is loaded.
         self.triton_client.load_model(self.vllm_model_name)
         self.assertTrue(self.triton_client.is_model_ready(self.vllm_model_name))
-
-
+
         self.triton_client.load_model(self.enseble_model_name)
         self.assertTrue(self.triton_client.is_model_ready(self.enseble_model_name))

From ecda2e20b4db40ef5970fc74cde09f54ae1566a7 Mon Sep 17 00:00:00 2001
From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
Date: Mon, 23 Sep 2024 16:49:04 -0700
Subject: [PATCH 07/12] Update ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py

Co-authored-by: Ryan McCormick
---
 ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
index 6724ed18..d1d4619f 100644
--- a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
+++ b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
@@ -48,7 +48,7 @@ def setUp(self):
         self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
         self.vllm_model_name = "vllm_opt"
         self.python_model_name = "add_sub"
-        self.enseble_model_name = "ensemble_model"
+        self.ensemble_model_name = "ensemble_model"
         self.vllm_load_test = "vllm_load_test"

From 8697eba13cb65ebeb7d7d3031255f31590a2fdb3 Mon Sep 17 00:00:00 2001
From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
Date: Mon, 23 Sep 2024 16:49:13 -0700
Subject: [PATCH 08/12] Update ci/L0_backend_vllm/vllm_backend/test.sh

Co-authored-by: Ryan McCormick
---
 ci/L0_backend_vllm/vllm_backend/test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/L0_backend_vllm/vllm_backend/test.sh b/ci/L0_backend_vllm/vllm_backend/test.sh
index 7e691f38..87e04b21 100755
--- a/ci/L0_backend_vllm/vllm_backend/test.sh
+++ b/ci/L0_backend_vllm/vllm_backend/test.sh
@@ -71,8 +71,8 @@ cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_2/
 sed -i 's/"facebook\/opt-125m"/"invalid_model"/' models/vllm_invalid_2/1/model.json
 
+# Sanity check ensembles are enabled and can successfully be loaded
 mkdir -p models/ensemble_model/1
-
 cp -r ensemble_config.pbtxt models/ensemble_model/config.pbtxt
 
 RET=0

From 23a0f7aea5b274d7f087502ebe0f92ae8cd72789 Mon Sep 17 00:00:00 2001
From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
Date: Mon, 23 Sep 2024 16:57:23 -0700
Subject: [PATCH 09/12] Update ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt

Co-authored-by: Ryan McCormick
---
 ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt b/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
index 991488c4..07977d0d 100644
--- a/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
+++ b/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt
@@ -1,4 +1,4 @@
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 #  * Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 #  * Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 #  * Neither the name of NVIDIA CORPORATION nor the names of its
 #    contributors may be used to endorse or promote products derived
 #    from this software without specific prior written permission.

From 0ac4b04c45a2f1facc121799f3051dbf04cf4b48 Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth
Date: Mon, 23 Sep 2024 17:22:58 -0700
Subject: [PATCH 10/12] Condensed ensemble model testing into existing test
 function

---
 README.md                                     |   2 +-
 .../vllm_backend/models/add_sub/1/model.py    | 141 ++++++++++++++++++
 .../vllm_backend/models/add_sub/config.pbtxt  |  59 ++++++++
 .../models/ensemble_model/config.pbtxt        |  59 ++++++++
 .../models/vllm_invalid_1/1/model.json        |   6 +
 .../models/vllm_invalid_1/config.pbtxt        |  37 +++++
 .../models/vllm_invalid_2/1/model.json        |   6 +
 .../models/vllm_invalid_2/config.pbtxt        |  37 +++++
 .../models/vllm_load_test/1/model.json        |   6 +
 .../models/vllm_load_test/config.pbtxt        |  37 +++++
 .../vllm_backend/models/vllm_opt/1/model.json |   6 +
 .../vllm_backend/models/vllm_opt/config.pbtxt |  37 +++++
 .../vllm_backend/vllm_backend_test.py         |  19 ++-
 13 files changed, 441 insertions(+), 11 deletions(-)
 create mode 100644 ci/L0_backend_vllm/vllm_backend/models/add_sub/1/model.py
 create mode 100644 ci/L0_backend_vllm/vllm_backend/models/add_sub/config.pbtxt
 create mode 100644 ci/L0_backend_vllm/vllm_backend/models/ensemble_model/config.pbtxt
 create mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/1/model.json
 create mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/config.pbtxt
 create mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/1/model.json
 create mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/config.pbtxt
 create mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/1/model.json
 create mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/config.pbtxt
 create mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_opt/1/model.json
 create mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_opt/config.pbtxt

diff --git a/README.md b/README.md
index 0a665962..8a993d99 100644
--- a/README.md
+++ b/README.md
@@ -99,8 +99,8 @@ export TRITON_CONTAINER_VERSION=
     --endpoint=vertex-ai
     --upstream-container-version=${TRITON_CONTAINER_VERSION}
     --backend=python:r${TRITON_CONTAINER_VERSION}
-    --backend=ensemble
     --backend=vllm:r${TRITON_CONTAINER_VERSION}
+    --backend=ensemble
 ```
 
 ### Option 3. Add the vLLM Backend to the Default Triton Container
diff --git a/ci/L0_backend_vllm/vllm_backend/models/add_sub/1/model.py b/ci/L0_backend_vllm/vllm_backend/models/add_sub/1/model.py
new file mode 100644
index 00000000..f416e79d
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/models/add_sub/1/model.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import json
+
+# triton_python_backend_utils is available in every Triton Python model. You
+# need to use this module to create inference requests and responses. It also
+# contains some utility functions for extracting information from model_config
+# and converting Triton input/output types to numpy types.
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    """Your Python model must use the same class name. Every Python model
+    that is created must have "TritonPythonModel" as the class name.
+    """
+
+    def initialize(self, args):
+        """`initialize` is called only once when the model is being loaded.
+        Implementing `initialize` function is optional. This function allows
+        the model to initialize any state associated with this model.
+
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing model instance kind
+          * model_instance_device_id: A string containing model instance device ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+
+        # You must parse model_config. JSON string is not parsed here
+        self.model_config = model_config = json.loads(args["model_config"])
+
+        # Get OUTPUT0 configuration
+        output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0")
+
+        # Get OUTPUT1 configuration
+        output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1")
+
+        # Convert Triton types to numpy types
+        self.output0_dtype = pb_utils.triton_string_to_numpy(
+            output0_config["data_type"]
+        )
+        self.output1_dtype = pb_utils.triton_string_to_numpy(
+            output1_config["data_type"]
+        )
+
+    def execute(self, requests):
+        """`execute` MUST be implemented in every Python model. `execute`
+        function receives a list of pb_utils.InferenceRequest as the only
+        argument. This function is called when an inference request is made
+        for this model. Depending on the batching configuration (e.g. Dynamic
+        Batching) used, `requests` may contain multiple requests. Every
+        Python model, must create one pb_utils.InferenceResponse for every
+        pb_utils.InferenceRequest in `requests`. If there is an error, you can
+        set the error argument when creating a pb_utils.InferenceResponse
+
+        Parameters
+        ----------
+        requests : list
+          A list of pb_utils.InferenceRequest
+
+        Returns
+        -------
+        list
+          A list of pb_utils.InferenceResponse. The length of this list must
+          be the same as `requests`
+        """
+
+        output0_dtype = self.output0_dtype
+        output1_dtype = self.output1_dtype
+
+        responses = []
+
+        # Every Python backend must iterate over everyone of the requests
+        # and create a pb_utils.InferenceResponse for each of them.
+        for request in requests:
+            # Get INPUT0
+            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
+            # Get INPUT1
+            in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")
+
+            out_0, out_1 = (
+                in_0.as_numpy() + in_1.as_numpy(),
+                in_0.as_numpy() - in_1.as_numpy(),
+            )
+
+            # Create output tensors. You need pb_utils.Tensor
+            # objects to create pb_utils.InferenceResponse.
+            out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype))
+            out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype))
+
+            # Create InferenceResponse. You can set an error here in case
+            # there was a problem with handling this inference request.
+            # Below is an example of how you can set errors in inference
+            # response:
+            #
+            # pb_utils.InferenceResponse(
+            #    output_tensors=..., TritonError("An error occurred"))
+            inference_response = pb_utils.InferenceResponse(
+                output_tensors=[out_tensor_0, out_tensor_1]
+            )
+            responses.append(inference_response)
+
+        # You should return a list of pb_utils.InferenceResponse. Length
+        # of this list must match the length of `requests` list.
+        return responses
+
+    def finalize(self):
+        """`finalize` is called only once when the model is being unloaded.
+        Implementing `finalize` function is OPTIONAL. This function allows
+        the model to perform any necessary clean ups before exit.
+        """
+        print("Cleaning up...")
diff --git a/ci/L0_backend_vllm/vllm_backend/models/add_sub/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/add_sub/config.pbtxt
new file mode 100644
index 00000000..0a932770
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/models/add_sub/config.pbtxt
@@ -0,0 +1,59 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "add_sub"
+backend: "python"
+
+input [
+  {
+    name: "INPUT0"
+    data_type: TYPE_FP32
+    dims: [ 4 ]
+  }
+]
+input [
+  {
+    name: "INPUT1"
+    data_type: TYPE_FP32
+    dims: [ 4 ]
+  }
+]
+output [
+  {
+    name: "OUTPUT0"
+    data_type: TYPE_FP32
+    dims: [ 4 ]
+  }
+]
+output [
+  {
+    name: "OUTPUT1"
+    data_type: TYPE_FP32
+    dims: [ 4 ]
+  }
+]
+
+instance_group [{ kind: KIND_CPU }]
diff --git a/ci/L0_backend_vllm/vllm_backend/models/ensemble_model/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/ensemble_model/config.pbtxt
new file mode 100644
index 00000000..07977d0d
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/models/ensemble_model/config.pbtxt
@@ -0,0 +1,59 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "ensemble_model"
+platform: "ensemble"
+max_batch_size: 1
+input [
+  {
+    name: "text_input"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
+output [
+  {
+    name: "text_output"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
+ensemble_scheduling {
+  step [
+    {
+      model_name: "vllm_opt"
+      model_version: -1
+      input_map {
+        key: "text_input"
+        value: "text_input"
+      }
+      output_map {
+        key: "text_output"
+        value: "text_output"
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/1/model.json b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/1/model.json
new file mode 100644
index 00000000..c67b3d19
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/1/model.json
@@ -0,0 +1,6 @@
+{
+    "model":"facebook/opt-125m",
+    "invalid_attribute": true,
+    "gpu_memory_utilization": 0.5,
+    "enforce_eager": true
+}
diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/config.pbtxt
new file mode 100644
index 00000000..b5a6c1ae
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/config.pbtxt
@@ -0,0 +1,37 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Note: You do not need to change any fields in this configuration.
+
+backend: "vllm"
+
+# The usage of device is deferred to the vLLM engine
+instance_group [
+  {
+    count: 1
+    kind: KIND_MODEL
+  }
+]
diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/1/model.json b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/1/model.json
new file mode 100644
index 00000000..7418f17f
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/1/model.json
@@ -0,0 +1,6 @@
+{
+    "model":"invalid_model",
+    "disable_log_requests": true,
+    "gpu_memory_utilization": 0.5,
+    "enforce_eager": true
+}
diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/config.pbtxt
new file mode 100644
index 00000000..b5a6c1ae
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/config.pbtxt
@@ -0,0 +1,37 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Note: You do not need to change any fields in this configuration.
+
+backend: "vllm"
+
+# The usage of device is deferred to the vLLM engine
+instance_group [
+  {
+    count: 1
+    kind: KIND_MODEL
+  }
+]
diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/1/model.json b/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/1/model.json
new file mode 100644
index 00000000..8fa8e151
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/1/model.json
@@ -0,0 +1,6 @@
+{
+    "model":"facebook/opt-125m",
+    "disable_log_requests": true,
+    "gpu_memory_utilization": 0.4,
+    "enforce_eager": true
+}
diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/config.pbtxt
new file mode 100644
index 00000000..b5a6c1ae
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/config.pbtxt
@@ -0,0 +1,37 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Note: You do not need to change any fields in this configuration.
+
+backend: "vllm"
+
+# The usage of device is deferred to the vLLM engine
+instance_group [
+  {
+    count: 1
+    kind: KIND_MODEL
+  }
+]
diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/1/model.json b/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/1/model.json
new file mode 100644
index 00000000..8fa8e151
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/1/model.json
@@ -0,0 +1,6 @@
+{
+    "model":"facebook/opt-125m",
+    "disable_log_requests": true,
+    "gpu_memory_utilization": 0.4,
+    "enforce_eager": true
+}
diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/config.pbtxt
new file mode 100644
index 00000000..b5a6c1ae
--- /dev/null
+++ b/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/config.pbtxt
@@ -0,0 +1,37 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Note: You do not need to change any fields in this configuration.
+
+backend: "vllm"
+
+# The usage of device is deferred to the vLLM engine
+instance_group [
+  {
+    count: 1
+    kind: KIND_MODEL
+  }
+]
diff --git a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
index d1d4619f..05618681 100644
--- a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
+++ b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
@@ -58,6 +58,13 @@ def test_vllm_triton_backend(self):
         self.triton_client.load_model(self.python_model_name)
         self.assertTrue(self.triton_client.is_model_ready(self.python_model_name))
 
+        # Test to ensure that ensemble models are supported in vllm container.
+        # If ensemble support not present, triton will error out at model loading stage.
+        # Ensemble Model is a pipeline consisting of 1 model (vllm_opt)
+        self.triton_client.load_model(self.ensemble_model_name)
+        self.assertTrue(self.triton_client.is_model_ready(self.ensemble_model_name))
+        self.triton_client.unload_model(self.ensemble_model_name)
+
         # Unload vllm model and test add_sub model
         self.triton_client.unload_model(self.vllm_load_test)
         self.assertFalse(self.triton_client.is_model_ready(self.vllm_load_test))
@@ -86,6 +93,8 @@ def test_vllm_triton_backend(self):
         )
         self.triton_client.unload_model(self.vllm_load_test)
         self.assertFalse(self.triton_client.is_model_ready(self.vllm_load_test))
+
+
     def test_model_with_invalid_attributes(self):
         model_name = "vllm_invalid_1"
@@ -164,16 +173,6 @@ def test_exclude_input_in_output_true(self):
             expected_output=expected_output,
         )
 
-    def test_ensemble_model(self):
-        # Test to ensure that ensemble models are supported in vllm container.
-        # If ensemble support not present, triton will error out at model loading stage.
-
-        # Before loading ensemble model, the dependency model is loaded.
-        self.triton_client.load_model(self.vllm_model_name)
-        self.assertTrue(self.triton_client.is_model_ready(self.vllm_model_name))
-
-        self.triton_client.load_model(self.enseble_model_name)
-        self.assertTrue(self.triton_client.is_model_ready(self.enseble_model_name))
-
     def _test_vllm_model(
         self,
         prompts,

From 9c4aeddb1a4c7f10a3a0fc7dc866f5f9dcccd629 Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth
Date: Mon, 23 Sep 2024 17:24:11 -0700
Subject: [PATCH 11/12] Condensed ensemble model testing into existing test
 function

---
 .../vllm_backend/models/add_sub/1/model.py    | 141 ------------------
 .../vllm_backend/models/add_sub/config.pbtxt  |  59 --------
 .../models/ensemble_model/config.pbtxt        |  59 --------
 .../models/vllm_invalid_1/1/model.json        |   6 -
 .../models/vllm_invalid_1/config.pbtxt        |  37 -----
 .../models/vllm_invalid_2/1/model.json        |   6 -
 .../models/vllm_invalid_2/config.pbtxt        |  37 -----
 .../models/vllm_load_test/1/model.json        |   6 -
 .../models/vllm_load_test/config.pbtxt        |  37 -----
 .../vllm_backend/models/vllm_opt/1/model.json |   6 -
 .../vllm_backend/models/vllm_opt/config.pbtxt |  37 -----
 11 files changed, 431 deletions(-)
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/models/add_sub/1/model.py
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/models/add_sub/config.pbtxt
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/models/ensemble_model/config.pbtxt
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/1/model.json
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/config.pbtxt
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/1/model.json
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/config.pbtxt
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/1/model.json
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/config.pbtxt
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_opt/1/model.json
 delete mode 100644 ci/L0_backend_vllm/vllm_backend/models/vllm_opt/config.pbtxt

diff --git a/ci/L0_backend_vllm/vllm_backend/models/add_sub/1/model.py b/ci/L0_backend_vllm/vllm_backend/models/add_sub/1/model.py
deleted file mode 100644
index f416e79d..00000000
--- a/ci/L0_backend_vllm/vllm_backend/models/add_sub/1/model.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json - -# triton_python_backend_utils is available in every Triton Python model. You -# need to use this module to create inference requests and responses. It also -# contains some utility functions for extracting information from model_config -# and converting Triton input/output types to numpy types. -import triton_python_backend_utils as pb_utils - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - - # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args["model_config"]) - - # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") - - # Get OUTPUT1 configuration - output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") - - # Convert Triton types to numpy types - self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config["data_type"] - ) - self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config["data_type"] - ) - - def execute(self, requests): - """`execute` MUST be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference request is made - for this model. 
Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - output0_dtype = self.output0_dtype - output1_dtype = self.output1_dtype - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for request in requests: - # Get INPUT0 - in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") - # Get INPUT1 - in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - - out_0, out_1 = ( - in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy(), - ) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0, out_tensor_1] - ) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is OPTIONAL. This function allows - the model to perform any necessary clean ups before exit. - """ - print("Cleaning up...") diff --git a/ci/L0_backend_vllm/vllm_backend/models/add_sub/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/add_sub/config.pbtxt deleted file mode 100644 index 0a932770..00000000 --- a/ci/L0_backend_vllm/vllm_backend/models/add_sub/config.pbtxt +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -name: "add_sub" -backend: "python" - -input [ - { - name: "INPUT0" - data_type: TYPE_FP32 - dims: [ 4 ] - } -] -input [ - { - name: "INPUT1" - data_type: TYPE_FP32 - dims: [ 4 ] - } -] -output [ - { - name: "OUTPUT0" - data_type: TYPE_FP32 - dims: [ 4 ] - } -] -output [ - { - name: "OUTPUT1" - data_type: TYPE_FP32 - dims: [ 4 ] - } -] - -instance_group [{ kind: KIND_CPU }] diff --git a/ci/L0_backend_vllm/vllm_backend/models/ensemble_model/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/ensemble_model/config.pbtxt deleted file mode 100644 index 07977d0d..00000000 --- a/ci/L0_backend_vllm/vllm_backend/models/ensemble_model/config.pbtxt +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "ensemble_model" -platform: "ensemble" -max_batch_size: 1 -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1 ] - } -] -ensemble_scheduling { - step [ - { - model_name: "vllm_opt" - model_version: -1 - input_map { - key: "text_input" - value: "text_input" - } - output_map { - key: "text_output" - value: "text_output" - } - } - ] -} \ No newline at end of file diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/1/model.json b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/1/model.json deleted file mode 100644 index c67b3d19..00000000 --- a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/1/model.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "model":"facebook/opt-125m", - "invalid_attribute": true, - "gpu_memory_utilization": 0.5, - "enforce_eager": true -} diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/config.pbtxt deleted file mode 100644 index b5a6c1ae..00000000 --- a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_1/config.pbtxt +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# Note: You do not need to change any fields in this configuration. 
- -backend: "vllm" - -# The usage of device is deferred to the vLLM engine -instance_group [ - { - count: 1 - kind: KIND_MODEL - } -] diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/1/model.json b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/1/model.json deleted file mode 100644 index 7418f17f..00000000 --- a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/1/model.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "model":"invalid_model", - "disable_log_requests": true, - "gpu_memory_utilization": 0.5, - "enforce_eager": true -} diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/config.pbtxt deleted file mode 100644 index b5a6c1ae..00000000 --- a/ci/L0_backend_vllm/vllm_backend/models/vllm_invalid_2/config.pbtxt +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# Note: You do not need to change any fields in this configuration. - -backend: "vllm" - -# The usage of device is deferred to the vLLM engine -instance_group [ - { - count: 1 - kind: KIND_MODEL - } -] diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/1/model.json b/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/1/model.json deleted file mode 100644 index 8fa8e151..00000000 --- a/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/1/model.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "model":"facebook/opt-125m", - "disable_log_requests": true, - "gpu_memory_utilization": 0.4, - "enforce_eager": true -} diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/config.pbtxt deleted file mode 100644 index b5a6c1ae..00000000 --- a/ci/L0_backend_vllm/vllm_backend/models/vllm_load_test/config.pbtxt +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# Note: You do not need to change any fields in this configuration. - -backend: "vllm" - -# The usage of device is deferred to the vLLM engine -instance_group [ - { - count: 1 - kind: KIND_MODEL - } -] diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/1/model.json b/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/1/model.json deleted file mode 100644 index 8fa8e151..00000000 --- a/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/1/model.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "model":"facebook/opt-125m", - "disable_log_requests": true, - "gpu_memory_utilization": 0.4, - "enforce_eager": true -} diff --git a/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/config.pbtxt b/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/config.pbtxt deleted file mode 100644 index b5a6c1ae..00000000 --- a/ci/L0_backend_vllm/vllm_backend/models/vllm_opt/config.pbtxt +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# Note: You do not need to change any fields in this configuration.
-
-backend: "vllm"
-
-# The usage of device is deferred to the vLLM engine
-instance_group [
-  {
-    count: 1
-    kind: KIND_MODEL
-  }
-]

From 4654f3501facc2a5b093082a5f3e8735a6e54c4a Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth
Date: Mon, 23 Sep 2024 17:26:21 -0700
Subject: [PATCH 12/12] Fix formatting

---
 ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
index 05618681..c53c391a 100644
--- a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
+++ b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
@@ -64,7 +64,7 @@ def test_vllm_triton_backend(self):
         self.triton_client.load_model(self.ensemble_model_name)
         self.assertTrue(self.triton_client.is_model_ready(self.ensemble_model_name))
         self.triton_client.unload_model(self.ensemble_model_name)
-        
+
         # Unload vllm model and test add_sub model
         self.triton_client.unload_model(self.vllm_load_test)
         self.assertFalse(self.triton_client.is_model_ready(self.vllm_load_test))
@@ -93,8 +93,6 @@ def test_vllm_triton_backend(self):
         )
         self.triton_client.unload_model(self.vllm_load_test)
         self.assertFalse(self.triton_client.is_model_ready(self.vllm_load_test))
-
-
 
     def test_model_with_invalid_attributes(self):
         model_name = "vllm_invalid_1"
@@ -173,7 +171,6 @@ def test_exclude_input_in_output_true(self):
             expected_output=expected_output,
         )
 
-
     def _test_vllm_model(
         self,
         prompts,