From de7da4a3283a7cffb44fd8b9361be5e2baab4ad7 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Mon, 23 Oct 2023 17:50:25 -0700 Subject: [PATCH] Hugging Face Transformer Deployment Tutorial (#49) * Initial Commit * Mount model repo so changes reflect, parameter tweaking, README file * Image name error * Incorporating review comments. Separate docker and model repo builds, add README, restructure repo * Tutorial restructuring. Using static model configurations * Bump triton container and update README * Remove client script * Incorporating review comments * Modify WIP line in vLLM tutorial * Remove trust_remote_code parameter from falcon model * Removing Mistral * Incorporating Feedback * Change input/output names * Pre-commit format * Different perf_analyzer example, config file format fixes * Deep dive changes to Triton tools section * Remove unused variable --- .../HuggingFaceTransformers/Dockerfile | 27 ++ .../HuggingFaceTransformers/README.md | 355 ++++++++++++++++++ .../falcon7b/1/model.py | 109 ++++++ .../falcon7b/config.pbtxt | 36 ++ .../persimmon8b/1/model.py | 103 +++++ .../persimmon8b/config.pbtxt | 36 ++ Quick_Deploy/vLLM/README.md | 3 +- 7 files changed, 667 insertions(+), 2 deletions(-) create mode 100644 Quick_Deploy/HuggingFaceTransformers/Dockerfile create mode 100644 Quick_Deploy/HuggingFaceTransformers/README.md create mode 100644 Quick_Deploy/HuggingFaceTransformers/falcon7b/1/model.py create mode 100644 Quick_Deploy/HuggingFaceTransformers/falcon7b/config.pbtxt create mode 100644 Quick_Deploy/HuggingFaceTransformers/persimmon8b/1/model.py create mode 100644 Quick_Deploy/HuggingFaceTransformers/persimmon8b/config.pbtxt diff --git a/Quick_Deploy/HuggingFaceTransformers/Dockerfile b/Quick_Deploy/HuggingFaceTransformers/Dockerfile new file mode 100644 index 00000000..285acde8 --- /dev/null +++ b/Quick_Deploy/HuggingFaceTransformers/Dockerfile @@ -0,0 +1,27 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
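+
+# Triton base image with the Python backend preinstalled. The pinned pip packages
+# below provide the Hugging Face runtime (transformers plus tokenizer and model
+# loading utilities) used by both tutorial models.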
+FROM nvcr.io/nvidia/tritonserver:23.09-py3
+RUN pip install transformers==4.34.0 protobuf==3.20.3 sentencepiece==0.1.99 accelerate==0.23.0 einops==0.6.1
diff --git a/Quick_Deploy/HuggingFaceTransformers/README.md b/Quick_Deploy/HuggingFaceTransformers/README.md
new file mode 100644
index 00000000..762a0b34
--- /dev/null
+++ b/Quick_Deploy/HuggingFaceTransformers/README.md
@@ -0,0 +1,355 @@
+
+
+# Deploying Hugging Face Transformer Models in Triton
+
+The following tutorial demonstrates how to deploy an arbitrary Hugging Face transformer
+model on the Triton Inference Server using Triton's [Python backend](https://github.com/triton-inference-server/python_backend). For the purposes of this example, two transformer
+models will be deployed:
+- [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b)
+- [adept/persimmon-8b-base](https://huggingface.co/adept/persimmon-8b-base)
+
+These models were selected because of their popularity and consistent response quality.
+However, this tutorial generalizes to any transformer model, provided sufficient
+infrastructure.
+
+*NOTE*: The tutorial is intended to be a reference example only. It may not be tuned for
+optimal performance.
+
+## Step 1: Create a Model Repository
+
+The first step is to create a model repository containing the models we want the Triton
+Inference Server to load and use for inference processing. To accomplish this, create a
+directory called `model_repository` and copy the `falcon7b` model folder into it:
+
+```
+mkdir -p model_repository
+cp -r falcon7b/ model_repository/
+```
+
+The `falcon7b/` folder we copied is organized in the way Triton expects and contains
+two important files needed to serve models in Triton:
+- **config.pbtxt** - Outlines the backend to use, model input/output details, and custom
+parameters to use for execution. More information on the full range of model configuration
+properties Triton supports can be found [here](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/model_configuration.html).
+- **model.py** - Implements how Triton should handle the model during the initialization,
+execution, and finalization stages. More information regarding Python backend usage
+can be found [here](https://github.com/triton-inference-server/python_backend#usage).
+
+
+## Step 2: Build a Triton Container Image
+
+The second step is to create an image that includes all the dependencies necessary
+to deploy Hugging Face transformer models on the Triton Inference Server. This can be done
+by building an image from the provided Dockerfile:
+
+```
+docker build -t triton_transformer_server .
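+# Optionally, confirm the image was created (a quick sanity check using the
+# standard Docker CLI, filtered by repository name):
+docker images triton_transformer_server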
+```
+
+## Step 3: Launch the Triton Inference Server
+
+Once the `triton_transformer_server` image is created, you can launch the Triton Inference
+Server in a container with the following command:
+
+```bash
+docker run --gpus all -it --rm --net=host --shm-size=1G --ulimit memlock=-1 --ulimit stack=67108864 -v ${PWD}/model_repository:/opt/tritonserver/model_repository triton_transformer_server tritonserver --model-repository=model_repository
+```
+
+The server has launched successfully when you see the following outputs in your console:
+
+```
+I0922 23:28:40.351809 1 grpc_server.cc:2451] Started GRPCInferenceService at 0.0.0.0:8001
+I0922 23:28:40.352017 1 http_server.cc:3558] Started HTTPService at 0.0.0.0:8000
+I0922 23:28:40.395611 1 http_server.cc:187] Started Metrics Service at 0.0.0.0:8002
+```
+
+## Step 4: Query the Server
+
+Now we can query the server using curl, specifying the server address and input details:
+
+```bash
+curl -X POST localhost:8000/v2/models/falcon7b/infer -d '{"inputs": [{"name":"text_input","datatype":"BYTES","shape":[1],"data":["I am going"]}]}'
+```
+In our testing, the server returned the following result (formatted for legibility):
+```json
+{
+  "model_name": "falcon7b",
+  "model_version": "1",
+  "outputs": [
+    {
+      "name": "text_output",
+      "datatype": "BYTES",
+      "shape": [
+        1
+      ],
+      "data": [
+        "I am going to be in the market for a new laptop soon. I"
+      ]
+    }
+  ]
+}
+```
+
+## Step 5: Host Multiple Models in Triton
+
+So far in this tutorial, we have only loaded a single model. However, Triton is capable
+of hosting many models simultaneously. To accomplish this, first ensure you have
+exited the Docker container by invoking `Ctrl+C` and waiting for the container to exit.
+
+Next, copy the remaining model provided into the model repository:
+```
+cp -r persimmon8b/ model_repository/
+```
+*NOTE*: The combined size of these two models is large. If your current hardware cannot
+support hosting both models simultaneously, consider loading a smaller model, such as
+[opt-125m](https://huggingface.co/facebook/opt-125m), by creating a folder for it
+using the templates provided and copying it into `model_repository`.
+
+Again, launch the server by invoking the `docker run` command from above and wait for
+confirmation that the server has launched successfully.
+
+Query the server, making sure to change the model name in the request URL for each model:
+```bash
+curl -X POST localhost:8000/v2/models/falcon7b/infer -d '{"inputs": [{"name":"text_input","datatype":"BYTES","shape":[1],"data":["How can you be"]}]}'
+curl -X POST localhost:8000/v2/models/persimmon8b/infer -d '{"inputs": [{"name":"text_input","datatype":"BYTES","shape":[1],"data":["Where is the nearest"]}]}'
+```
+In our testing, these queries returned the following parsed results:
+```bash
+# falcon7b
+"How can you be sure that you are getting the best deal on your car"
+
+# persimmon8b
+"Where is the nearest starbucks?"
+```
+
+## 'Day Zero' Support
+
+The latest transformer models may not always be supported in the most recent, official
+release of the `transformers` package. In such a case, you should still be able to
+load these 'bleeding edge' models in Triton by building `transformers` from source.
+This can be done by replacing the transformers install directive in the provided
+Dockerfile with:
+```docker
+RUN pip install git+https://github.com/huggingface/transformers.git
+```
+Using this technique, you should be able to serve any transformer model supported by
+Hugging Face with Triton.
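+
+## Querying with Triton's Python Client
+
+curl is not the only way to query the server. Triton also provides dedicated client
+libraries. The snippet below is a minimal sketch using the HTTP client from the
+[tritonclient](https://github.com/triton-inference-server/client) package; it assumes
+`pip install tritonclient[http]` in your client environment and sends the same request
+as the falcon7b example from Step 4.
+
+```python
+import numpy as np
+import tritonclient.http as httpclient
+
+client = httpclient.InferenceServerClient(url="localhost:8000")
+
+# Build the single-element BYTES tensor declared in falcon7b's config.pbtxt
+text_input = httpclient.InferInput("text_input", [1], "BYTES")
+text_input.set_data_from_numpy(np.array(["I am going"], dtype=np.object_))
+
+# Run inference and read back the generated text
+result = client.infer(model_name="falcon7b", inputs=[text_input])
+print(result.as_numpy("text_output"))
+```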
+
+
+# Next Steps
+The following sections expand on the base tutorial and provide guidance for further
+experimentation.
+
+## Loading Cached Models
+In the previous steps, we downloaded the falcon-7b model from Hugging Face when we
+launched the Triton server. We can avoid this lengthy download process in subsequent runs
+by loading cached models into Triton. By default, the provided `model.py` files will cache
+the falcon and persimmon models in their respective directories within the `model_repository`
+folder. This is accomplished by setting the `TRANSFORMERS_CACHE` environment variable.
+To set this environment variable for an arbitrary model, include the following lines in
+your `model.py` **before** importing the `transformers` module, making sure to replace
+`{MODEL}` with your target model.
+
+```python
+import os
+os.environ['TRANSFORMERS_CACHE'] = '/opt/tritonserver/model_repository/{MODEL}/hf_cache'
+```
+
+Alternatively, if your system has already cached a Hugging Face model you wish to deploy in Triton,
+you can mount it to the Triton container by adding the following mount option to the `docker run`
+command from earlier (making sure `${HOME}` expands to the home directory that holds your
+Hugging Face cache):
+
+```bash
+# Option to mount a specific cached model (falcon-7b in this case)
+-v ${HOME}/.cache/huggingface/hub/models--tiiuae--falcon-7b:/root/.cache/huggingface/hub/models--tiiuae--falcon-7b
+
+# Option to mount all cached models on the host system
+-v ${HOME}/.cache/huggingface:/root/.cache/huggingface
+```
+
+## Triton Tool Ecosystem
+Deploying models in Triton also comes with the benefit of access to a fully-supported suite
+of deployment analyzers to help you better understand and tailor your systems to fit your
+needs. Triton currently has two options for deployment analysis:
+- [Performance Analyzer](https://docs.nvidia.com/deeplearning/triton-inference-server/archives/triton-inference-server-2310/user-guide/docs/user_guide/perf_analyzer.html): An inference performance optimizer.
+- [Model Analyzer](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/model_analyzer.html): A GPU memory and compute utilization optimizer.
+
+### Performance Analyzer
+To use the performance analyzer, please remove the persimmon8b model from `model_repository` and restart
+the Triton server using the `docker run` command from above.
+
+Once Triton launches successfully, start a Triton SDK container by running the following in a separate window:
+
+```bash
+docker run -it --net=host nvcr.io/nvidia/tritonserver:23.09-py3-sdk bash
+```
+This container comes with all of Triton's deployment analyzers pre-installed, meaning
+we can simply enter the following to get feedback on our model's inference performance:
+
+```bash
+perf_analyzer -m falcon7b --collect-metrics
+```
+
+This command should run quickly and profile the performance of our falcon7b model.
+As the analyzer runs, it will output useful metrics such as latency percentiles,
+latency by stage of inference, and successful request count. A subset of the output
+data is shown below:
+
+```bash
+#Avg request latency
+46307 usec (overhead 25 usec + queue 25 usec + compute input 26 usec + compute infer 46161 usec + compute output 68 usec)
+
+#Avg GPU Utilization
+GPU-57c7b00e-ca04-3876-91e2-c1eae40a0733 : 66.0556%
+
+#Inferences/Second vs. Client Average Batch Latency
+Concurrency: 1, throughput: 21.3841 infer/sec, latency 46783 usec
+```
+
+These metrics tell us that we are not fully utilizing our hardware and that our
+throughput is low. We can immediately improve these results by batching our requests
+instead of computing inferences one at a time. The `model.py` file for the falcon model
+is already configured to handle batched requests. Enabling batching in Triton is as simple
+as adding the following to falcon's `config.pbtxt` file:
+
+```
+dynamic_batching { }
+max_batch_size: 8
+```
+The `max_batch_size` can be any integer of your choosing; for this example, we select 8.
+Now let's re-run the perf_analyzer with increasing levels of concurrency and see how it
+impacts GPU utilization and throughput by executing:
+```bash
+perf_analyzer -m falcon7b --collect-metrics --concurrency-range=2:16:2
+```
+After executing for a few minutes, the performance analyzer should return
+results similar to these (depending on hardware):
+```bash
+# Concurrency = 4
+GPU-57c7b00e-ca04-3876-91e2-c1eae40a0733 : 74.1111%
+Throughput: 31.8264 infer/sec, latency 125174 usec
+
+# Concurrency = 8
+GPU-57c7b00e-ca04-3876-91e2-c1eae40a0733 : 81.7895%
+Throughput: 46.2105 infer/sec, latency 172920 usec
+
+# Concurrency = 16
+GPU-57c7b00e-ca04-3876-91e2-c1eae40a0733 : 90.5556%
+Throughput: 53.6549 infer/sec, latency 299178 usec
+```
+Using the performance analyzer, we were able to quickly profile different model configurations
+to obtain better throughput and hardware utilization. In this case, we identified a
+configuration that yields roughly 2.5x our original throughput and increases GPU
+utilization by roughly 24 percentage points, all in less than 5 minutes.
+
+This is a single, simple use case for the performance analyzer. For more information and
+a more complete list of performance analyzer parameters and use cases, please see
+[this](https://docs.nvidia.com/deeplearning/triton-inference-server/archives/triton-inference-server-2310/user-guide/docs/user_guide/perf_analyzer.html)
+guide.
+
+For more information regarding dynamic batching in Triton, please see [this](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/model_configuration.html#dynamic-batcher)
+guide.
+
+### Model Analyzer
+
+In the performance analyzer section, we used intuition to increase our throughput by changing
+a subset of variables and measuring the difference in performance. However, we only changed
+a few variables across a wide search space.
+
+To sweep this parameter space in a more robust fashion, we can use Triton's model analyzer, which
+not only sweeps a large spectrum of configuration parameters, but also generates visual reports
+for post-execution analysis.
+
+To use the model analyzer, please terminate your Triton server by invoking `Ctrl+C`, then relaunch
+it with the following command (ensuring the dynamic_batching parameters from above have been added
+to the falcon model's config.pbtxt):
+```bash
+docker run --gpus all -it --rm --net=host --shm-size=1G --ulimit memlock=-1 --ulimit stack=67108864 -v ${PWD}/model_repository:/opt/tritonserver/model_repository triton_transformer_server
+```
+
+Next, to get the most accurate GPU metrics from the model analyzer, we will install and launch it from
+our local server container.
+To accomplish this, first install the model analyzer:
+```bash
+pip3 install triton-model-analyzer
+```
+
+Once the model analyzer is installed, enter the following command (modifying the instance
+count to something lower for your GPU, if necessary):
+```bash
+model-analyzer profile -m /opt/tritonserver/model_repository/ --profile-models falcon7b --run-config-search-max-instance-count=3 --run-config-search-min-model-batch-size=8
+```
+This tool will take longer to execute than the performance analyzer example (~40 minutes).
+If this execution time is too long, you can also run the analyzer with the
+`--run-config-search-mode quick` option. In our experimentation, enabling the quick search option
+yielded fewer results but took half the time. Regardless, once the model analyzer is complete,
+it will provide a full summary relating to throughput, latency, and hardware utilization
+in multiple formats. A snippet from the summary report produced by the model analyzer for
+our run is ranked by performance and shown below:
+
+| Model Config Name | Max Batch Size | Dynamic Batching | Total Instance Count | p99 Latency (ms) | Throughput (infer/sec) | Max GPU Memory Usage (MB) | Average GPU Utilization (%) |
+| :---: | :----: | :---: | :----: | :---: | :----: | :---: | :---: |
+| falcon7b_config_7 | 16 | Enabled | 3:GPU | 1412.581 | 71.944 | 46226 | 100.0 |
+| falcon7b_config_8 | 32 | Enabled | 3:GPU | 2836.225 | 63.9652 | 46268 | 100.0 |
+| falcon7b_config_4 | 16 | Enabled | 2:GPU | 7601.437 | 63.9454 | 31331 | 100.0 |
+| falcon7b_config_default | 8 | Enabled | 1:GPU | 4151.873 | 63.9384 | 16449 | 89.3 |
+
+We can examine the performance of any of these configurations with more granularity by viewing
+their detailed reports. This subset of reports focuses on a single configuration's latency
+and concurrency metrics as they relate to throughput and hardware utilization. A snippet from
+the top-performing configuration for our tests is shown below (abridged for brevity):
+
+| Request Concurrency | p99 Latency (ms) | Client Response Wait (ms) | Server Queue (ms) | Server Compute Input (ms) | Server Compute Infer (ms) | Throughput (infer/sec) | Max GPU Memory Usage (MB) | Average GPU Utilization (%) |
+| :---: | :----: | :---: | :----: | :---: | :----: | :---: | :---: | :---: |
+| 512 | 8689.491 | 8190.506 | 7397.975 | 0.166 | 778.565 | 63.954 | 46230.667264 | 100.0 |
+| | | | | ... | | | | |
+| 128 | 2289.118 | 2049.37 | 1277.34 | 0.159 | 770.771 | 61.2953 | 46230.667264 | 100.0 |
+| 64 | 1412.581 | 896.924 | 227.108 | 0.157 | 667.757 | 71.944 | 46226.47296 | 100.0 |
+| 32 | 781.362 | 546.35 | 86.078 | 0.103 | 459.257 | 57.7877 | 46226.47296 | 100.0 |
+| | | | | ... | | | | |
+| 1 | 67.12 | 49.707 | 0.049 | 0.024 | 49.121 | 20.0993 | 46207.598592 | 54.9 |
+
+Again, this is a single use case for the model analyzer. For more information and a more complete list
+of model analyzer parameters and run options, please see [this](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/model_analyzer.html) guide.
+
+*Please note that both the performance and model analyzer experiments were conducted
+on a system with an Intel i9 and an NVIDIA A6000 GPU. Your results may vary depending on
+your hardware.*
+
+## Customization
+
+The `model.py` files have been kept minimal in order to maximize generalizability.
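+
+For instance, suppose you want each prompt to yield several candidate sequences. The
+pipeline call in the falcon model's `generate` method is the natural place to start.
+The following is an illustrative sketch rather than part of the shipped `model.py`;
+`num_return_sequences` and `do_sample` are standard `transformers` generation
+parameters:
+
+```python
+# Sketch: sample three candidate sequences per prompt.
+# num_return_sequences > 1 requires do_sample=True (greedy search
+# cannot return multiple distinct sequences).
+sequences = self.pipeline(
+    prompts,
+    max_length=self.max_output_length,
+    do_sample=True,
+    num_return_sequences=3,
+    pad_token_id=self.tokenizer.eos_token_id,
+    batch_size=len(prompts),
+)
+# Each element of `sequences` is now a list of three generation dicts,
+# so the response-building loop would collect every candidate, e.g.:
+# texts = [gen["generated_text"] for gen in seq]
+```
+
+The variable-length output declared in the provided `config.pbtxt` files
+(`dims: [ -1 ]`) already accommodates returning more than one string per request.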
+Whenever you modify the behavior of the transformer models, such as by increasing the number
+of generated sequences to return, be sure to update the corresponding `config.pbtxt` and
+`model.py` files and copy them into the `model_repository`.
+
+The transformers used in this tutorial were all suited for text-generation tasks; however, this
+is not a limitation. The principles of this tutorial can be applied to serve models suited for
+any other transformer task.
+
+Triton offers a rich variety of server configuration options not covered in this tutorial.
+For a more custom deployment, please see our [model configuration guide](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/model_configuration.html) to see how the scope of this tutorial can be expanded to fit your needs.
diff --git a/Quick_Deploy/HuggingFaceTransformers/falcon7b/1/model.py b/Quick_Deploy/HuggingFaceTransformers/falcon7b/1/model.py
new file mode 100644
index 00000000..71bede0e
--- /dev/null
+++ b/Quick_Deploy/HuggingFaceTransformers/falcon7b/1/model.py
@@ -0,0 +1,109 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
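+
+# Python backend handler for tiiuae/falcon-7b. Incoming requests are flattened
+# into a single batch of prompts for the text-generation pipeline, and one
+# response is returned per request. TRANSFORMERS_CACHE below must be set before
+# `transformers` is imported so that model weights are cached inside the model
+# repository (see "Loading Cached Models" in the README).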
+import os
+
+os.environ[
+    "TRANSFORMERS_CACHE"
+] = "/opt/tritonserver/model_repository/falcon7b/hf_cache"
+import json
+
+import numpy as np
+import torch
+import transformers
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def initialize(self, args):
+        self.logger = pb_utils.Logger
+        self.model_config = json.loads(args["model_config"])
+        self.model_params = self.model_config.get("parameters", {})
+        default_hf_model = "tiiuae/falcon-7b"
+        default_max_gen_length = "15"
+        # Check for user-specified model name in model config parameters
+        hf_model = self.model_params.get("huggingface_model", {}).get(
+            "string_value", default_hf_model
+        )
+        # Check for user-specified max length in model config parameters
+        self.max_output_length = int(
+            self.model_params.get("max_output_length", {}).get(
+                "string_value", default_max_gen_length
+            )
+        )
+
+        self.logger.log_info(f"Max output length: {self.max_output_length}")
+        self.logger.log_info(f"Loading HuggingFace model: {hf_model}...")
+        # Assume a tokenizer is available for the same model
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(hf_model)
+        self.pipeline = transformers.pipeline(
+            "text-generation",
+            model=hf_model,
+            torch_dtype=torch.float16,
+            tokenizer=self.tokenizer,
+            device_map="auto",
+        )
+        self.pipeline.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+
+    def execute(self, requests):
+        prompts = []
+        # Track how many prompts each request contributed so that exactly one
+        # response can be returned per request below.
+        prompts_per_request = []
+        for request in requests:
+            input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input")
+            input_array = input_tensor.as_numpy()
+            if input_array.ndim > 1:
+                # Dynamically batched inputs arrive with shape [batch_size, 1]
+                request_prompts = [row[0].decode("utf-8") for row in input_array]
+            else:
+                request_prompts = [input_array[0].decode("utf-8")]
+            for prompt in request_prompts:
+                self.logger.log_info(f"Generating sequences for text_input: {prompt}")
+            prompts.extend(request_prompts)
+            prompts_per_request.append(len(request_prompts))
+
+        return self.generate(prompts, prompts_per_request)
+
+    def generate(self, prompts, prompts_per_request):
+        sequences = self.pipeline(
+            prompts,
+            max_length=self.max_output_length,
+            pad_token_id=self.tokenizer.eos_token_id,
+            batch_size=len(prompts),
+        )
+        responses = []
+        start = 0
+        # Slice the flat list of generations back into per-request groups so that
+        # each request receives only the text generated for its own prompts.
+        for count in prompts_per_request:
+            texts = [seq[0]["generated_text"] for seq in sequences[start : start + count]]
+            tensor = pb_utils.Tensor("text_output", np.array(texts, dtype=np.object_))
+            responses.append(pb_utils.InferenceResponse(output_tensors=[tensor]))
+            start += count
+
+        return responses
+
+    def finalize(self):
+        print("Cleaning up...")
diff --git a/Quick_Deploy/HuggingFaceTransformers/falcon7b/config.pbtxt b/Quick_Deploy/HuggingFaceTransformers/falcon7b/config.pbtxt
new file mode 100644
index 00000000..9949472d
--- /dev/null
+++ b/Quick_Deploy/HuggingFaceTransformers/falcon7b/config.pbtxt
@@ -0,0 +1,36 @@
+# Triton backend to use
+backend: "python"
+
+# Hugging Face model path. Parameters must follow this
+# key/value structure
+parameters: {
+  key: "huggingface_model",
+  value: {string_value: "tiiuae/falcon-7b"}
+}
+
+# The maximum length, in tokens, of the generated sequence
+# (the input prompt counts toward this limit)
+parameters: {
+  key: "max_output_length",
+  value: {string_value: "15"}
+}
+
+# Triton should expect as input a single string named
+# 'text_input'
+input [
+  {
+    name: "text_input"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+  }
+]
+
+# Triton should expect to respond with a variable-length
+# string output named 'text_output'
+output [
+  {
+    name: "text_output"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
diff --git a/Quick_Deploy/HuggingFaceTransformers/persimmon8b/1/model.py b/Quick_Deploy/HuggingFaceTransformers/persimmon8b/1/model.py
new file mode 100644
index 00000000..5119d406
--- /dev/null
+++ b/Quick_Deploy/HuggingFaceTransformers/persimmon8b/1/model.py
@@ -0,0 +1,103 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
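+
+# Python backend handler for adept/persimmon-8b-base. Unlike the falcon7b
+# handler, this implementation assumes exactly one prompt per request and
+# returns a single generated sequence in response. TRANSFORMERS_CACHE below
+# must be set before `transformers` is imported so that model weights are
+# cached inside the model repository.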
+import os
+
+os.environ[
+    "TRANSFORMERS_CACHE"
+] = "/opt/tritonserver/model_repository/persimmon8b/hf_cache"
+
+import json
+
+import numpy as np
+import torch
+import transformers
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def initialize(self, args):
+        self.logger = pb_utils.Logger
+        self.model_config = json.loads(args["model_config"])
+        self.model_params = self.model_config.get("parameters", {})
+        default_hf_model = "adept/persimmon-8b-base"
+        default_max_gen_length = "15"
+        # Check for user-specified model name in model config parameters
+        hf_model = self.model_params.get("huggingface_model", {}).get(
+            "string_value", default_hf_model
+        )
+        # Check for user-specified max length in model config parameters
+        self.max_output_length = int(
+            self.model_params.get("max_output_length", {}).get(
+                "string_value", default_max_gen_length
+            )
+        )
+
+        self.logger.log_info(f"Max output length: {self.max_output_length}")
+        self.logger.log_info(f"Loading HuggingFace model: {hf_model}...")
+        # Assume a tokenizer is available for the same model
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(hf_model)
+        self.pipeline = transformers.pipeline(
+            "text-generation",
+            model=hf_model,
+            torch_dtype=torch.float16,
+            tokenizer=self.tokenizer,
+            device_map="auto",
+        )
+
+    def execute(self, requests):
+        responses = []
+        for request in requests:
+            # Assume an input named "text_input", as declared in config.pbtxt
+            input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input")
+            prompt = input_tensor.as_numpy()[0].decode("utf-8")
+
+            self.logger.log_info(f"Generating sequences for text_input: {prompt}")
+            response = self.generate(prompt)
+            responses.append(response)
+
+        return responses
+
+    def generate(self, prompt):
+        sequences = self.pipeline(
+            prompt,
+            max_length=self.max_output_length,
+            pad_token_id=self.tokenizer.eos_token_id,
+        )
+
+        output_tensors = []
+        texts = []
+        for i, seq in enumerate(sequences):
+            text = seq["generated_text"]
+            self.logger.log_info(f"Sequence {i+1}: {text}")
+            texts.append(text)
+
+        tensor = pb_utils.Tensor("text_output", np.array(texts, dtype=np.object_))
+        output_tensors.append(tensor)
+        response = pb_utils.InferenceResponse(output_tensors=output_tensors)
+        return response
+
+    def finalize(self):
+        print("Cleaning up...")
diff --git a/Quick_Deploy/HuggingFaceTransformers/persimmon8b/config.pbtxt b/Quick_Deploy/HuggingFaceTransformers/persimmon8b/config.pbtxt
new file mode 100644
index 00000000..5098c2a6
--- /dev/null
+++ b/Quick_Deploy/HuggingFaceTransformers/persimmon8b/config.pbtxt
@@ -0,0 +1,36 @@
+# Triton backend to use
+backend: "python"
+
+# Hugging Face model path. Parameters must follow this
+# key/value structure
+parameters: {
+  key: "huggingface_model",
+  value: {string_value: "adept/persimmon-8b-base"}
+}
+
+# The maximum length, in tokens, of the generated sequence
+# (the input prompt counts toward this limit)
+parameters: {
+  key: "max_output_length",
+  value: {string_value: "15"}
+}
+
+# Triton should expect as input a single string named
+# 'text_input'
+input [
+  {
+    name: "text_input"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+  }
+]
+
+# Triton should expect to respond with a variable-length
+# string output named 'text_output'
+output [
+  {
+    name: "text_output"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }
+]
diff --git a/Quick_Deploy/vLLM/README.md b/Quick_Deploy/vLLM/README.md
index 5d292511..a0fb6b26 100644
--- a/Quick_Deploy/vLLM/README.md
+++ b/Quick_Deploy/vLLM/README.md
@@ -34,8 +34,7 @@ The following tutorial demonstrates how to deploy a simple
 Triton Inference Server using Triton's
 [Python backend](https://github.com/triton-inference-server/python_backend) and the
 [vLLM](https://github.com/vllm-project/vllm) library.
 
-*NOTE*: The tutorial is intended to be a reference example only. It is a work in progress with
-[known limitations](#limitations).
+*NOTE*: The tutorial is intended to be a reference example only and has [known limitations](#limitations).
 
 ## Step 1: Build a Triton Container Image with vLLM