Enable calls to GenAI-Perf for profile subcommand #52

Merged · 26 commits · May 10, 2024

Commits
499b44a
Draft initial code
dyastremsky Apr 29, 2024
c6b987e
Run pre-commit, exclude dir in gitignore
dyastremsky Apr 29, 2024
d1121b1
Remove unused profiler
dyastremsky Apr 30, 2024
5d49147
Fix linter type errors
dyastremsky Apr 30, 2024
734218b
Fix import package
dyastremsky Apr 30, 2024
c691448
Fix help messages and errors for subparser args
dyastremsky Apr 30, 2024
6ce383b
Correct parsing behavior
dyastremsky Apr 30, 2024
79d20e4
Update parser to work with sub-commands and GenAI-Perf.
dyastremsky May 1, 2024
e7234c8
Be more specific with test file exclusions in gitignore
dyastremsky May 1, 2024
895f24b
Remove noop line, clarify prune comment
dyastremsky May 1, 2024
87432d9
Remove MA/PA calls
dyastremsky May 8, 2024
501403c
Document and unit test GenAI-Perf integration (#53)
dyastremsky May 8, 2024
553f5fb
Merge branch 'main' into dyas-profile
dyastremsky May 9, 2024
1dc570a
Update tests to not use submodules
dyastremsky May 9, 2024
0f43327
Update tests to not use protocol for profile
dyastremsky May 9, 2024
6f3738e
Require Python 3.10
dyastremsky May 9, 2024
6d42883
Move helper functions to other file, update README
dyastremsky May 9, 2024
cc4976a
Remove outdated comment
dyastremsky May 9, 2024
ed9c5e4
Update tritonclient version, add comment to explain mocks.
dyastremsky May 9, 2024
b580003
Support both tensorrtllm and trtllm backend options
dyastremsky May 9, 2024
67c7763
Fix backend if statement
dyastremsky May 9, 2024
3d31012
Bump version to 0.0.8
dyastremsky May 9, 2024
d9ae248
Add comment clarifying special handling
dyastremsky May 9, 2024
6630b73
Add dev suffix to version
dyastremsky May 9, 2024
07bd337
Revert test deletion
dyastremsky May 9, 2024
b79f56c
Remove unnecessary raises block in test.
dyastremsky May 10, 2024
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yaml
@@ -41,7 +41,7 @@ jobs:
fail-fast: false
matrix:
os: ["ubuntu-22.04"]
python-version: ["3.8", "3.10"]
python-version: ["3.10"]

steps:
- uses: actions/checkout@v3
9 changes: 9 additions & 0 deletions .gitignore
@@ -8,6 +8,7 @@ __pycache__/

# Distribution / packaging
.Python
artifacts/
build/
develop-eggs/
dist/
@@ -17,12 +18,20 @@ eggs/
lib64/
parts/
sdist/
tests/checkpoints/
tests/output_dir/
tests/output_model_repository/
tests/plots/
tests/reports/
tests/results/
tmp/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
llm_inputs.json
MANIFEST

# PyInstaller
10 changes: 3 additions & 7 deletions README.md
@@ -16,7 +16,7 @@ Server.
## Pre-requisites

When using Triton and related tools on your host (outside of a Triton container
image) there are a number of additional dependencies that may be required for
image), there are a number of additional dependencies that may be required for
various workflows. Most system dependency issues can be resolved by installing
and running the CLI from within the latest corresponding `tritonserver`
container image, which should have all necessary system dependencies installed.
@@ -162,7 +162,7 @@ triton start
# Interact with model
triton infer -m llama-3-8b-instruct --prompt "machine learning is"

# Profile model with Perf Analyzer
# Profile model with GenAI-Perf
triton profile -m llama-3-8b-instruct --backend vllm
```

@@ -232,7 +232,7 @@ triton start
# Interact with model
triton infer -m llama-3-8b-instruct --prompt "machine learning is"

# Profile model with Perf Analyzer
# Profile model with GenAI-Perf
triton profile -m llama-3-8b-instruct --backend tensorrtllm
```
## Additional Dependencies for Custom Environments
@@ -269,10 +269,6 @@ sudo apt install libopenmpi-dev
```

## Known Limitations
- Triton CLI's `profile` command currently only supports TRT-LLM and vLLM models.
- Triton CLI's `profile` command will be migrating to use
[genai-perf](https://github.com/triton-inference-server/client/tree/main/src/c++/perf_analyzer/genai-perf)
as the backbone for LLM profiling soon.
- Models and configurations generated by Triton CLI are focused on ease-of-use,
and may not be as optimized as possible for your system or use case.
- Triton CLI currently uses the TRT-LLM dependencies installed in its environment
11 changes: 7 additions & 4 deletions pyproject.toml
@@ -37,32 +37,32 @@ classifiers = [
"Topic :: Scientific/Engineering",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Operating System :: Unix",
]
authors = []
maintainers = []
keywords = []
requires-python = ">=3.8,<4"
requires-python = ">=3.10,<4"
# TODO: Add [gpu] set of dependencies for trtllm once it's available on pypi
dependencies = [
"directory-tree == 0.0.4", # may remove in future
"docker == 6.1.3",
"genai-perf @ git+https://github.com/triton-inference-server/[email protected]#subdirectory=src/c++/perf_analyzer/genai-perf",
# TODO: rely on tritonclient to pull in protobuf and numpy dependencies?
"numpy >= 1.21",
"protobuf>=3.7.0",
"prometheus-client == 0.19.0",
"psutil >= 5.9.5", # may remove later
"rich == 13.5.2",
# TODO: Test on cpu-only machine if [cuda] dependency is an issue
"tritonclient[all] >= 2.38",
"tritonclient[all] >= 2.45",
"huggingface-hub >= 0.19.4",
# Testing
"pytest >= 8.1.1", # may remove later
"pytest-timeout", # may remove later
"pytest-mock >= 3.13.0", # may remove later
]

# CLI Entrypoint
@@ -81,6 +81,9 @@ build-backend = "hatchling.build"
[tool.hatch.version]
path = "src/triton_cli/__init__.py"

[tool.hatch.metadata]
allow-direct-references = true

# Pre-commit hook tool configs
[tool.codespell]
# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override -
93 changes: 17 additions & 76 deletions src/triton_cli/parser.py
@@ -26,6 +26,8 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import subprocess
import sys
import time
import logging
import argparse
@@ -41,9 +43,9 @@
)
from triton_cli.client.client import InferenceServerException, TritonClient
from triton_cli.metrics import MetricsClient
from triton_cli.profile import add_unknown_args_to_args, build_command
from triton_cli.repository import ModelRepository
from triton_cli.server.server_factory import TritonServerFactory
from triton_cli.profiler import Profiler

logger = logging.getLogger(LOGGER_NAME)

@@ -159,41 +161,6 @@ def add_model_args(subcommands):
)


def add_profile_args(subcommands):
for subcommand in subcommands:
subcommand.add_argument(
"-b",
"--batch-size",
type=int,
default=1,
required=False,
help="The batch size / concurrency to benchmark. (Default: 1)",
)
subcommand.add_argument(
"--input-length",
type=int,
default=128,
required=False,
help="The input length (tokens) to use for benchmarking LLMs. (Default: 128)",
)
subcommand.add_argument(
"--output-length",
type=int,
default=128,
required=False,
help="The output length (tokens) to use for benchmarking LLMs. (Default: 128)",
)
# TODO: Revisit terminology here. Online/offline vs streaming, etc.
subcommand.add_argument(
"--profile-mode",
type=str,
choices=["online", "offline"],
default="online",
required=False,
help="Profiling mode: offline means one full response will be generated, online means response will be streaming tokens as they are generated.",
)


def add_client_args(subcommands):
# Add protocol/url/port to all client-based subcommands
for subcommand in subcommands:
@@ -396,49 +363,17 @@ def handle_infer(args: argparse.Namespace):
# Profile
# ================================================
def parse_args_profile(parser):
profile = parser.add_parser(
"profile", help="Profile LLM models using Perf Analyzer"
)
profile = parser.add_parser("profile", help="Profile models", add_help=False)
profile.set_defaults(func=handle_profile)
add_model_args([profile])
add_profile_args([profile])
add_backend_args([profile])
add_client_args([profile])
profile.add_argument(
"--help", action="store_true", help="Show help message and exit"
)


def handle_profile(args: argparse.Namespace):
client = TritonClient(url=args.url, port=args.port, protocol=args.protocol)
profile_model(args, client)


# TODO: Move to utils? <-- Delete?
def profile_model(args: argparse.Namespace, client: TritonClient):
if args.protocol != "grpc":
raise Exception("Profiler only supports 'grpc' protocol at this time.")

if not args.port:
args.port = 8001 if args.protocol == "grpc" else 8000

# TODO: Consider python(BLS)/ensemble case for the model
# receiving requests in the case of TRT-LLM. For now, TRT-LLM
# should be manually specified.
backend = args.backend
if not args.backend:
# Profiler needs to know TRT-LLM vs vLLM to form correct payload
backend = client.get_model_backend(args.model)

logger.info(f"Running Perf Analyzer profiler on '{args.model}'...")
Profiler.profile(
model=args.model,
backend=backend,
batch_size=args.batch_size,
url=f"{args.url}:{args.port}",
input_length=args.input_length,
output_length=args.output_length,
# Should be "online" for IFB / streaming, and "offline" for non-streaming
offline=(args.profile_mode == "offline"),
verbose=args.verbose,
)
cmd = build_command(args, "genai-perf")
logger.info(f"Running: '{' '.join(cmd)}'")
subprocess.run(cmd, check=True)


# ================================================
Expand Down Expand Up @@ -502,5 +437,11 @@ def parse_args(argv=None):
parse_args_profile(subcommands)
parse_args_utils(subcommands)
add_verbose_args([parser])
args = parser.parse_args(argv)

argv_ = argv if argv is not None else sys.argv[1:]
if argv_[0] == "profile":
args, unknown_args = parser.parse_known_args(argv_)
args = add_unknown_args_to_args(args, unknown_args)
else:
args = parser.parse_args(argv_)
return args
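
The `profile` branch above relies on `argparse.parse_known_args`, which returns leftover tokens instead of erroring on unrecognized flags; that is what lets flags the CLI does not define pass through to GenAI-Perf. A minimal, self-contained sketch of the pattern (not the CLI's actual parser; `--concurrency` is just a stand-in for any GenAI-Perf-only flag):

```python
import argparse

# Toy parser that only knows --model; anything else is "unknown".
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model")

# parse_known_args returns (namespace, leftovers) rather than raising
# on unrecognized flags, so the leftovers can be forwarded downstream.
args, unknown = parser.parse_known_args(
    ["-m", "llama-3-8b-instruct", "--concurrency", "4"]
)
print(args.model)  # llama-3-8b-instruct
print(unknown)     # ['--concurrency', '4']
```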
91 changes: 91 additions & 0 deletions src/triton_cli/profile.py
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
from typing import List


# ================================================
# Helper functions
# ================================================
def build_command(args: argparse.Namespace, executable: str):
skip_args = ["func"]
cmd = [executable]
for arg, value in vars(args).items():
if arg in skip_args:
pass
elif value is False:
pass
elif value is True:
if len(arg) == 1:
cmd += [f"-{arg}"]
else:
cmd += [f"--{arg}"]
# [DLIS-6656] - Remove backend renaming.
# This allows "tensorrtllm" to be used as the backend for consistency.
# Once GenAI-Perf releases 24.05, "tensorrtllm" as the backend value
# will be supported by default.
elif arg == "backend" and value in ["tensorrtllm", "trtllm"]:
cmd += ["--backend", "trtllm"]
else:
if len(arg) == 1:
cmd += [f"-{arg}", f"{value}"]
else:
cmd += [f"--{arg}", f"{value}"]
return cmd


def add_unknown_args_to_args(args: argparse.Namespace, unknown_args: List[str]):
"""Add unknown args to args list"""
unknown_args_dict = turn_unknown_args_into_dict(unknown_args)
for key, value in unknown_args_dict.items():
setattr(args, key, value)
return args


def turn_unknown_args_into_dict(unknown_args: List[str]):
"""Convert list of unknown args to dictionary"""
it = iter(unknown_args)
unknown_args_dict = {}
try:
while True:
arg = next(it)
if arg.startswith(("-", "--")):
key = arg.lstrip("-")
# Peek to see if next item is a value or another flag
next_arg = next(it, None)
if next_arg and not next_arg.startswith(("-", "--")):
unknown_args_dict[key] = next_arg
else:
unknown_args_dict[key] = True
if next_arg:
it = iter([next_arg] + list(it))
else:
raise ValueError(f"Argument does not start with a '-' or '--': {arg}")
except StopIteration:
pass
return unknown_args_dict
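
Putting these helpers together, a rough usage sketch (the flag values are illustrative, not from the PR): leftover tokens become namespace attributes via `add_unknown_args_to_args`, and `build_command` renders the namespace back into a `genai-perf` invocation, mapping `tensorrtllm` to `trtllm` along the way.

```python
import argparse

from triton_cli.profile import add_unknown_args_to_args, build_command

# Hypothetical leftovers from parse_known_args: one valued flag, one bare flag.
unknown = ["--concurrency", "4", "--streaming"]

args = argparse.Namespace(model="llama-3-8b-instruct", backend="tensorrtllm")
args = add_unknown_args_to_args(args, unknown)

print(build_command(args, "genai-perf"))
# ['genai-perf', '--model', 'llama-3-8b-instruct', '--backend', 'trtllm',
#  '--concurrency', '4', '--streaming']
```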