build: Upgrade to 24.07, TRT-LLM 0.11.0, and Triton CLI v0.0.10 (#81)
rmccorm4 authored Aug 6, 2024
1 parent 449f6b8 · commit a050ec1
Showing 20 changed files with 1,544 additions and 543 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yaml
@@ -36,7 +36,7 @@ jobs:
build:
runs-on: ${{ matrix.os }}
container:
image: nvcr.io/nvidia/tritonserver:24.06-py3
image: nvcr.io/nvidia/tritonserver:24.07-py3
strategy:
fail-fast: false
matrix:
17 changes: 9 additions & 8 deletions README.md
@@ -22,8 +22,8 @@ and running the CLI from within the latest corresponding `tritonserver`
container image, which should have all necessary system dependencies installed.

For vLLM and TRT-LLM, you can use their respective images:
- `nvcr.io/nvidia/tritonserver:24.06-vllm-python-py3`
- `nvcr.io/nvidia/tritonserver:24.06-trtllm-python-py3`
- `nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3`
- `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`

If you decide to run the CLI on the host or in a custom image, please
see this list of [additional dependencies](#additional-dependencies-for-custom-environments)
@@ -38,6 +38,7 @@ matrix below:

| Triton CLI Version | TRT-LLM Version | Triton Container Tag |
|:------------------:|:---------------:|:--------------------:|
| 0.0.10 | v0.11.0 | 24.07 |
| 0.0.9 | v0.10.0 | 24.06 |
| 0.0.8 | v0.9.0 | 24.05 |
| 0.0.7 | v0.9.0 | 24.04 |
@@ -56,7 +57,7 @@ It is also possible to install from a specific branch name, a commit hash
or a tag name. For example to install `triton_cli` with a specific tag:

```bash
GIT_REF="0.0.9"
GIT_REF="0.0.10"
pip install git+https://github.com/triton-inference-server/triton_cli.git@${GIT_REF}
```

@@ -91,7 +92,7 @@ triton -h
triton import -m gpt2

# Start server pointing at the default model repository
triton start --image nvcr.io/nvidia/tritonserver:24.06-vllm-python-py3
triton start --image nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3

# Infer with CLI
triton infer -m gpt2 --prompt "machine learning is"
@@ -145,10 +146,10 @@ docker run -ti \
--shm-size=1g --ulimit memlock=-1 \
-v ${HOME}/models:/root/models \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
nvcr.io/nvidia/tritonserver:24.06-vllm-python-py3
nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3

# Install the Triton CLI
pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.9
pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.10

# Authenticate with huggingface for restricted models like Llama-2 and Llama-3
huggingface-cli login
@@ -214,10 +215,10 @@ docker run -ti \
-v /tmp:/tmp \
-v ${HOME}/models:/root/models \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
nvcr.io/nvidia/tritonserver:24.06-trtllm-python-py3
nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3

# Install the Triton CLI
pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.9
pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.10

# Authenticate with huggingface for restricted models like Llama-2 and Llama-3
huggingface-cli login
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
"grpcio>=1.64.0",
"directory-tree == 0.0.4", # may remove in future
"docker == 6.1.3",
"genai-perf @ git+https://github.com/triton-inference-server/client.git@r24.06#subdirectory=src/c++/perf_analyzer/genai-perf",
"genai-perf @ git+https://github.com/triton-inference-server/client.git@r24.07#subdirectory=src/c++/perf_analyzer/genai-perf",
# TODO: rely on tritonclient to pull in protobuf and numpy dependencies?
"numpy >=1.21,<2",
"protobuf>=3.7.0",
@@ -59,7 +59,7 @@ dependencies = [
"rich == 13.5.2",
# TODO: Test on cpu-only machine if [cuda] dependency is an issue,
# Use explicit client version matching genai-perf version for tagged release
"tritonclient[all] == 2.47",
"tritonclient[all] == 2.48",
"huggingface-hub >= 0.19.4",
# Testing
"pytest >= 8.1.1", # may remove later
2 changes: 1 addition & 1 deletion src/triton_cli/__init__.py
@@ -24,4 +24,4 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

__version__ = "0.0.9"
__version__ = "0.0.10"
20 changes: 14 additions & 6 deletions src/triton_cli/client/client.py
@@ -197,6 +197,13 @@ def generate_data(self, config: dict, data_mode: str):
infer_inputs.append(
self.__create_triton_input(name, shape, triton_dtype, data)
)
json_input = {
"name": name,
"shape": str(data.shape),
"dtype": triton_dtype,
"value": np.array_str(data),
}
logger.info(f"Input:\n{json.dumps(json_input, indent=4)}")

return infer_inputs

@@ -294,17 +301,18 @@ def __process_infer_result(self, result):
for output in response["outputs"]:
name = output["name"]
# TODO: Need special logic for string/bytes type
np_data = result.as_numpy(name)
data = result.as_numpy(name)
# WAR for LLMs
if np_data.dtype == np.object_:
if data.dtype == np.object_:
# Assume 2D-output (batch_size, texts)
texts = np_data.flatten()
np_data = np.array([text.decode("utf-8") for text in texts])
texts = data.flatten()
data = np.array([text.decode("utf-8") for text in texts])

output_data_str = np.array_str(np_data)
output_data_str = np.array_str(data)
json_output = {
"name": name,
"shape": str(np_data.shape),
"shape": str(data.shape),
"dtype": output["datatype"],
"value": output_data_str,
}
logger.info(f"Output:\n{json.dumps(json_output, indent=4)}")
6 changes: 3 additions & 3 deletions src/triton_cli/docker/Dockerfile
@@ -1,9 +1,9 @@
# TRT-LLM image contains engine building and runtime dependencies
FROM nvcr.io/nvidia/tritonserver:24.06-trtllm-python-py3
FROM nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3

# Setup vLLM Triton backend
RUN mkdir -p /opt/tritonserver/backends/vllm && \
wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/r24.06/src/model.py
wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/r24.07/src/model.py

# vLLM runtime dependencies
RUN pip install "vllm==0.4.3"
RUN pip install "vllm==0.5.0.post1"
110 changes: 63 additions & 47 deletions src/triton_cli/templates/trt_llm/postprocessing/1/model.py
@@ -52,22 +52,46 @@ def initialize(self, args):
* model_name: Model name
"""
# Parse model configs
model_config = json.loads(args["model_config"])
tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"]
self.skip_special_tokens = model_config["parameters"].get(
"skip_special_tokens", {"string_value": "true"}
)["string_value"].lower() in ["true", "1", "t", "y", "yes"]
model_config = json.loads(args['model_config'])
tokenizer_dir = model_config['parameters']['tokenizer_dir'][
'string_value']

skip_special_tokens = model_config['parameters'].get(
'skip_special_tokens')
if skip_special_tokens is not None:
skip_special_tokens_str = skip_special_tokens[
'string_value'].lower()
if skip_special_tokens_str in [
'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
]:
self.skip_special_tokens = skip_special_tokens_str in [
'true', '1', 't', 'y', 'yes'
]
else:
print(
f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default."
)
self.skip_special_tokens = True
else:
print(
f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default."
)
self.skip_special_tokens = True

self.tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True
)
self.tokenizer.pad_token = self.tokenizer.eos_token
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
legacy=False,
padding_side='left',
trust_remote_code=True)
if not self.tokenizer.pad_token:
self.tokenizer.pad_token = self.tokenizer.eos_token

# Parse model output configs
output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT")
output_config = pb_utils.get_output_config_by_name(
model_config, "OUTPUT")

# Convert Triton types to numpy types
self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])
self.output_dtype = pb_utils.triton_string_to_numpy(
output_config['data_type'])

def execute(self, requests):
"""`execute` must be implemented in every Python model. `execute`
@@ -96,31 +120,27 @@ def execute(self, requests):
for idx, request in enumerate(requests):
# Get input tensors
tokens_batch = pb_utils.get_input_tensor_by_name(
request, "TOKENS_BATCH"
).as_numpy()
request, 'TOKENS_BATCH').as_numpy()

# Get sequence length
sequence_lengths = pb_utils.get_input_tensor_by_name(
request, "SEQUENCE_LENGTH"
).as_numpy()
request, 'SEQUENCE_LENGTH').as_numpy()

# Get cum log probs
cum_log_probs = pb_utils.get_input_tensor_by_name(request, "CUM_LOG_PROBS")
cum_log_probs = pb_utils.get_input_tensor_by_name(
request, 'CUM_LOG_PROBS')

# Get sequence length
output_log_probs = pb_utils.get_input_tensor_by_name(
request, "OUTPUT_LOG_PROBS"
)
request, 'OUTPUT_LOG_PROBS')

# Get context logits
context_logits = pb_utils.get_input_tensor_by_name(
request, "CONTEXT_LOGITS"
)
request, 'CONTEXT_LOGITS')

# Get generation logits
generation_logits = pb_utils.get_input_tensor_by_name(
request, "GENERATION_LOGITS"
)
request, 'GENERATION_LOGITS')

# Reshape Input
# tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
@@ -132,54 +152,49 @@ def execute(self, requests):
# Create output tensors. You need pb_utils.Tensor
# objects to create pb_utils.InferenceResponse.
output_tensor = pb_utils.Tensor(
"OUTPUT", np.array(outputs).astype(self.output_dtype)
)
'OUTPUT',
np.array(outputs).astype(self.output_dtype))

outputs = []
outputs.append(output_tensor)

if cum_log_probs:
out_cum_log_probs = pb_utils.Tensor(
"OUT_CUM_LOG_PROBS", cum_log_probs.as_numpy()
)
out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
cum_log_probs.as_numpy())
outputs.append(out_cum_log_probs)
else:
out_cum_log_probs = pb_utils.Tensor(
"OUT_CUM_LOG_PROBS", np.array([[0.0]], dtype=np.float32)
)
'OUT_CUM_LOG_PROBS', np.array([[0.0]], dtype=np.float32))
outputs.append(out_cum_log_probs)

if output_log_probs:
out_output_log_probs = pb_utils.Tensor(
"OUT_OUTPUT_LOG_PROBS", output_log_probs.as_numpy()
)
'OUT_OUTPUT_LOG_PROBS', output_log_probs.as_numpy())
outputs.append(out_output_log_probs)
else:
out_output_log_probs = pb_utils.Tensor(
"OUT_OUTPUT_LOG_PROBS", np.array([[[0.0]]], dtype=np.float32)
)
'OUT_OUTPUT_LOG_PROBS',
np.array([[[0.0]]], dtype=np.float32))
outputs.append(out_output_log_probs)

if context_logits:
out_context_logits = pb_utils.Tensor(
"OUT_CONTEXT_LOGITS", context_logits.as_numpy()
)
out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
context_logits.as_numpy())
outputs.append(out_context_logits)
else:
out_context_logits = pb_utils.Tensor(
"OUT_CONTEXT_LOGITS", np.array([[[0.0]]], dtype=np.float32)
)
'OUT_CONTEXT_LOGITS', np.array([[[0.0]]],
dtype=np.float32))
outputs.append(out_context_logits)

if generation_logits:
out_generation_logits = pb_utils.Tensor(
"OUT_GENERATION_LOGITS", generation_logits.as_numpy()
)
'OUT_GENERATION_LOGITS', generation_logits.as_numpy())
outputs.append(out_generation_logits)
else:
out_generation_logits = pb_utils.Tensor(
"OUT_GENERATION_LOGITS", np.array([[[[0.0]]]], dtype=np.float32)
)
'OUT_GENERATION_LOGITS',
np.array([[[[0.0]]]], dtype=np.float32))
outputs.append(out_generation_logits)

# Create InferenceResponse. You can set an error here in case
@@ -189,7 +204,8 @@
#
# pb_utils.InferenceResponse(
# output_tensors=..., TritonError("An error occurred"))
inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
inference_response = pb_utils.InferenceResponse(
output_tensors=outputs)
responses.append(inference_response)

# You should return a list of pb_utils.InferenceResponse. Length
@@ -201,15 +217,15 @@ def finalize(self):
Implementing `finalize` function is optional. This function allows
the model to perform any necessary clean ups before exit.
"""
print("Cleaning up...")
print('Cleaning up...')

def _postprocessing(self, tokens_batch, sequence_lengths):
outputs = []
for batch_idx, beam_tokens in enumerate(tokens_batch):
for beam_idx, tokens in enumerate(beam_tokens):
seq_len = sequence_lengths[batch_idx][beam_idx]
output = self.tokenizer.decode(
tokens[:seq_len], skip_special_tokens=self.skip_special_tokens
)
outputs.append(output.encode("utf8"))
tokens[:seq_len],
skip_special_tokens=self.skip_special_tokens)
outputs.append(output.encode('utf8'))
return outputs
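The reworked `initialize` above now validates the `skip_special_tokens` parameter and warns on missing or unrecognized values instead of silently coercing them. A minimal sketch of that boolean-parameter parsing pattern as a standalone helper, assuming the usual Triton `{'string_value': ...}` parameter shape — the helper name and warning wording are illustrative, not part of the template:

```python
TRUTHY = {"true", "1", "t", "y", "yes"}
FALSY = {"false", "0", "f", "n", "no"}


def parse_bool_parameter(parameters: dict, key: str, default: bool = True) -> bool:
    """Read a Triton config parameter like {'string_value': 'true'} as a bool."""
    entry = parameters.get(key)
    if entry is None:
        print(f"[WARNING] '{key}' is not set; defaulting to {default}.")
        return default
    value = entry["string_value"].strip().lower()
    if value in TRUTHY:
        return True
    if value in FALSY:
        return False
    print(f"[WARNING] Unrecognized value '{entry['string_value']}' for '{key}'; defaulting to {default}.")
    return default


# Example usage against a parsed model_config:
params = {"skip_special_tokens": {"string_value": "yes"}}
assert parse_bool_parameter(params, "skip_special_tokens") is True
assert parse_bool_parameter({}, "skip_special_tokens") is True  # falls back to the default
```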
3 changes: 2 additions & 1 deletion src/triton_cli/templates/trt_llm/postprocessing/config.pbtxt
@@ -24,6 +24,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "postprocessing"
backend: "python"
max_batch_size: ${triton_max_batch_size}
input [
@@ -100,7 +101,7 @@ parameters {
parameters {
key: "skip_special_tokens"
value: {
string_value: "True"
string_value: "${skip_special_tokens}"
}
}
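The new `${skip_special_tokens}` placeholder follows the same `${...}` convention as the other templated fields (e.g. `${triton_max_batch_size}`) and gets filled in when the config template is rendered. A minimal sketch of what that substitution amounts to, using Python's `string.Template` purely for illustration — the actual rendering is done by the CLI/TRT-LLM tooling, not this snippet, and the filled-in values are assumed:

```python
from string import Template

# A trimmed-down stand-in for the templated config.pbtxt shown above.
config_template = Template(
    'name: "postprocessing"\n'
    'backend: "python"\n'
    "max_batch_size: ${triton_max_batch_size}\n"
    "parameters {\n"
    '  key: "skip_special_tokens"\n'
    "  value: {\n"
    '    string_value: "${skip_special_tokens}"\n'
    "  }\n"
    "}\n"
)

rendered = config_template.substitute(
    triton_max_batch_size=64,        # assumed value
    skip_special_tokens="true",      # assumed value
)
print(rendered)
```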
