Enable calls to GenAI-Perf for profile subcommand #52

Merged
26 commits merged on May 10, 2024
Changes from 16 commits

Commits (26)
499b44a
Draft initial code
dyastremsky Apr 29, 2024
c6b987e
Run pre-commit, exclude dir in gitignore
dyastremsky Apr 29, 2024
d1121b1
Remove unused profiler
dyastremsky Apr 30, 2024
5d49147
Fix linter type errors
dyastremsky Apr 30, 2024
734218b
Fix import package
dyastremsky Apr 30, 2024
c691448
Fix help messages and errors for subparser args
dyastremsky Apr 30, 2024
6ce383b
Correct parsing behavior
dyastremsky Apr 30, 2024
79d20e4
Update parser to work with sub-commands and GenAi-Perf.
dyastremsky May 1, 2024
e7234c8
Be more specific with test file exclusions in gitignore
dyastremsky May 1, 2024
895f24b
Remove noop line, clarify prune comment
dyastremsky May 1, 2024
87432d9
Remove MA/PA calls
dyastremsky May 8, 2024
501403c
Document and unit test GenAI-Perf integration (#53)
dyastremsky May 8, 2024
553f5fb
Merge branch 'main' into dyas-profile
dyastremsky May 9, 2024
1dc570a
Update tests to not use submodules
dyastremsky May 9, 2024
0f43327
Update tests to not use protocol for profile
dyastremsky May 9, 2024
6f3738e
Require Python 3.10
dyastremsky May 9, 2024
6d42883
Move helper functions to other file, update README
dyastremsky May 9, 2024
cc4976a
Remove outdated comment
dyastremsky May 9, 2024
ed9c5e4
Update tritonclient version, add comment to explain mocks.
dyastremsky May 9, 2024
b580003
Support both tensorrtllm and trtllm backend options
dyastremsky May 9, 2024
67c7763
Fix backend if statement
dyastremsky May 9, 2024
3d31012
Bump version to 0.0.8
dyastremsky May 9, 2024
d9ae248
Add comment clarifying special handling
dyastremsky May 9, 2024
6630b73
Add dev suffix to version
dyastremsky May 9, 2024
07bd337
Revert test deletion
dyastremsky May 9, 2024
b79f56c
Remove unnecessary raises block in test.
dyastremsky May 10, 2024
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yaml
@@ -41,7 +41,7 @@ jobs:
      fail-fast: false
      matrix:
        os: ["ubuntu-22.04"]
        python-version: ["3.8", "3.10"]
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v3
9 changes: 9 additions & 0 deletions .gitignore
@@ -8,6 +8,7 @@ __pycache__/

# Distribution / packaging
.Python
artifacts/
build/
develop-eggs/
dist/
@@ -17,12 +18,20 @@ eggs/
lib64/
parts/
sdist/
tests/checkpoints/
tests/output_dir/
tests/output_model_repository/
tests/plots/
tests/reports/
tests/results/
tmp/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
llm_inputs.json
MANIFEST

# PyInstaller
16 changes: 7 additions & 9 deletions README.md
@@ -16,7 +16,7 @@ Server.
## Pre-requisites

When using Triton and related tools on your host (outside of a Triton container
image) there are a number of additional dependencies that may be required for
image), there are a number of additional dependencies that may be required for
various workflows. Most system dependency issues can be resolved by installing
and running the CLI from within the latest corresponding `tritonserver`
container image, which should have all necessary system dependencies installed.
@@ -96,6 +96,8 @@ triton infer -m gpt2 --prompt "machine learning is"

# Infer with curl using the generate endpoint
curl -X POST localhost:8000/v2/models/gpt2/generate -d '{"text_input": "machine learning is", "max_tokens": 128}'


```

## Serving LLM Models
@@ -162,7 +164,7 @@ triton start
# Interact with model
triton infer -m llama-3-8b-instruct --prompt "machine learning is"

# Profile model with Perf Analyzer
# Profile model with GenAI-Perf
triton profile -m llama-3-8b-instruct --backend vllm
```
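
Because the updated `profile` subcommand forwards its arguments to the GenAI-Perf CLI, extra GenAI-Perf options can be appended to the same command. A hypothetical example (flag availability depends on the installed GenAI-Perf version; check `genai-perf --help`):

```
# Pass additional flags straight through to GenAI-Perf (illustrative only)
triton profile -m llama-3-8b-instruct --backend vllm --concurrency 2
```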

@@ -224,16 +226,16 @@ huggingface-login

# Build TRT LLM engine and generate a Triton model repository pointing at it
triton remove -m all
triton import -m llama-3-8b-instruct --backend tensorrtllm
triton import -m llama-3-8b-instruct --backend trtllm

# Start Triton pointing at the default model repository
triton start

# Interact with model
triton infer -m llama-3-8b-instruct --prompt "machine learning is"

# Profile model with Perf Analyzer
triton profile -m llama-3-8b-instruct --backend tensorrtllm
# Profile model with GenAI-Perf
triton profile -m llama-3-8b-instruct --backend trtllm
```
## Additional Dependencies for Custom Environments

@@ -269,10 +271,6 @@ sudo apt install libopenmpi-dev
```

## Known Limitations
- Triton CLI's `profile` command currently only supports TRT-LLM and vLLM models.
- Triton CLI's `profile` command will be migrating to use
[genai-perf](https://github.com/triton-inference-server/client/tree/main/src/c++/perf_analyzer/genai-perf)
as the backbone for LLM profiling soon.
- Models and configurations generated by Triton CLI are focused on ease-of-use,
and may not be as optimized as possible for your system or use case.
- Triton CLI currently uses the TRT-LLM dependencies installed in its environment
9 changes: 6 additions & 3 deletions pyproject.toml
@@ -37,20 +37,19 @@ classifiers = [
"Topic :: Scientific/Engineering",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Operating System :: Unix",
]
authors = []
maintainers = []
keywords = []
requires-python = ">=3.8,<4"
requires-python = ">=3.10,<4"
# TODO: Add [gpu] set of dependencies for trtllm once it's available on pypi
dependencies = [
"directory-tree == 0.0.4", # may remove in future
"docker == 6.1.3",
"genai-perf @ git+https://github.com/triton-inference-server/[email protected]#subdirectory=src/c++/perf_analyzer/genai-perf",
    # TODO: rely on tritonclient to pull in protobuf and numpy dependencies?
    "numpy >= 1.21",
    "protobuf>=3.7.0",
@@ -63,6 +62,7 @@ dependencies = [
    # Testing
    "pytest >= 8.1.1", # may remove later
    "pytest-timeout", # may remove later
    "pytest-mock >= 3.13.0", # may remove later
]

# CLI Entrypoint
@@ -81,6 +81,9 @@ build-backend = "hatchling.build"
[tool.hatch.version]
path = "src/triton_cli/__init__.py"

[tool.hatch.metadata]
allow-direct-references = true

# Pre-commit hook tool configs
[tool.codespell]
# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override -
158 changes: 82 additions & 76 deletions src/triton_cli/parser.py
@@ -26,7 +26,10 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import subprocess
import sys
import time
from typing import List
import logging
import argparse
from pathlib import Path
@@ -43,7 +46,6 @@
from triton_cli.metrics import MetricsClient
from triton_cli.repository import ModelRepository
from triton_cli.server.server_factory import TritonServerFactory
from triton_cli.profiler import Profiler

logger = logging.getLogger(LOGGER_NAME)

@@ -159,41 +161,6 @@ def add_model_args(subcommands):
        )


def add_profile_args(subcommands):
    for subcommand in subcommands:
        subcommand.add_argument(
            "-b",
            "--batch-size",
            type=int,
            default=1,
            required=False,
            help="The batch size / concurrency to benchmark. (Default: 1)",
        )
        subcommand.add_argument(
            "--input-length",
            type=int,
            default=128,
            required=False,
            help="The input length (tokens) to use for benchmarking LLMs. (Default: 128)",
        )
        subcommand.add_argument(
            "--output-length",
            type=int,
            default=128,
            required=False,
            help="The output length (tokens) to use for benchmarking LLMs. (Default: 128)",
        )
        # TODO: Revisit terminology here. Online/offline vs streaming, etc.
        subcommand.add_argument(
            "--profile-mode",
            type=str,
            choices=["online", "offline"],
            default="online",
            required=False,
            help="Profiling mode: offline means one full response will be generated, online means response will be streaming tokens as they are generated.",
        )


def add_client_args(subcommands):
    # Add protocol/url/port to all client-based subcommands
    for subcommand in subcommands:
@@ -396,49 +363,17 @@ def handle_infer(args: argparse.Namespace):
# Profile
# ================================================
def parse_args_profile(parser):
    profile = parser.add_parser(
        "profile", help="Profile LLM models using Perf Analyzer"
    )
    profile = parser.add_parser("profile", help="Profile models", add_help=False)
    profile.set_defaults(func=handle_profile)
    add_model_args([profile])
    add_profile_args([profile])
    add_backend_args([profile])
    add_client_args([profile])
    profile.add_argument(
        "--help", action="store_true", help="Show help message and exit"
    )


def handle_profile(args: argparse.Namespace):
    client = TritonClient(url=args.url, port=args.port, protocol=args.protocol)
    profile_model(args, client)


# TODO: Move to utils? <-- Delete?
def profile_model(args: argparse.Namespace, client: TritonClient):
    if args.protocol != "grpc":
        raise Exception("Profiler only supports 'grpc' protocol at this time.")

    if not args.port:
        args.port = 8001 if args.protocol == "grpc" else 8000

    # TODO: Consider python(BLS)/ensemble case for the model
    # receiving requests in the case of TRT-LLM. For now, TRT-LLM
    # should be manually specified.
    backend = args.backend
    if not args.backend:
        # Profiler needs to know TRT-LLM vs vLLM to form correct payload
        backend = client.get_model_backend(args.model)

    logger.info(f"Running Perf Analyzer profiler on '{args.model}'...")
    Profiler.profile(
        model=args.model,
        backend=backend,
        batch_size=args.batch_size,
        url=f"{args.url}:{args.port}",
        input_length=args.input_length,
        output_length=args.output_length,
        # Should be "online" for IFB / streaming, and "offline" for non-streaming
        offline=(args.profile_mode == "offline"),
        verbose=args.verbose,
    )
    cmd = build_command(args, "genai-perf")
    logger.info(f"Running: '{' '.join(cmd)}'")
    subprocess.run(cmd, check=True)


# ================================================
@@ -502,5 +437,76 @@ def parse_args(argv=None):
    parse_args_profile(subcommands)
    parse_args_utils(subcommands)
    add_verbose_args([parser])
    args = parser.parse_args(argv)

    argv_ = argv if argv is not None else sys.argv[1:]
    # If a passthrough command is passed as the first arg,
    # special handling is needed.
    if argv_[0] == "profile":
        args, unknown_args = parser.parse_known_args(argv_)
        args = add_unknown_args_to_args(args, unknown_args)
    else:
        args = parser.parse_args(argv_)
    return args
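
A minimal sketch of what this passthrough parsing yields, assuming the `triton_cli` package from this revision is installed and `parse_args` is importable from `triton_cli.parser`; `--concurrency` is an arbitrary extra flag used only for illustration:

```python
# Sketch: unknown flags after "profile" are parsed leniently and re-attached
# to the args namespace instead of raising an argparse error.
from triton_cli.parser import parse_args

args = parse_args(["profile", "-m", "gpt2", "--concurrency", "4"])
print(args.model)        # "gpt2" -- known arg, handled by the profile subparser
print(args.concurrency)  # "4"    -- unknown arg, attached by add_unknown_args_to_args
```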


# ================================================
# Helper functions
# ================================================
def build_command(args: argparse.Namespace, executable: str):
    skip_args = ["func"]
    cmd = [executable]
    for arg, value in vars(args).items():
        if arg in skip_args:
            pass
        elif value is False:
            pass
        elif value is True:
            if len(arg) == 1:
                cmd += [f"-{arg}"]
            else:
                cmd += [f"--{arg}"]
        # [DLIS-6656] - Remove backend renaming.
        # This allows "tensorrtllm" to be used as the backend for consistency.
        # Once GenAI-Perf releases 24.05, "tensorrtllm" as the backend value
        # will be supported by default.
        elif arg == "backend":
            if value == "tensorrtllm":
                cmd += ["--backend", "trtllm"]
        else:
            if len(arg) == 1:
                cmd += [f"-{arg}", f"{value}"]
            else:
                cmd += [f"--{arg}", f"{value}"]
    return cmd
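
As a rough illustration of the renaming above, this is roughly what `build_command` produces for a hand-built namespace (a sketch; real invocations get their namespace from `parse_args`, and the import assumes the helpers still live in `triton_cli.parser` as shown in this revision):

```python
# Sketch: build_command flattens the namespace back into CLI flags, skipping
# "func", dropping False values, and renaming tensorrtllm -> trtllm.
from argparse import Namespace
from triton_cli.parser import build_command

args = Namespace(model="llama-3-8b-instruct", backend="tensorrtllm",
                 url="localhost", verbose=True, func=None)
print(build_command(args, "genai-perf"))
# ['genai-perf', '--model', 'llama-3-8b-instruct', '--backend', 'trtllm',
#  '--url', 'localhost', '--verbose']
```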


def add_unknown_args_to_args(args: argparse.Namespace, unknown_args: List[str]):
"""Add unknown args to args list"""
unknown_args_dict = turn_unknown_args_into_dict(unknown_args)
for key, value in unknown_args_dict.items():
setattr(args, key, value)
return args


def turn_unknown_args_into_dict(unknown_args: List[str]):
"""Convert list of unknown args to dictionary"""
it = iter(unknown_args)
unknown_args_dict = {}
try:
while True:
arg = next(it)
if arg.startswith(("-", "--")):
key = arg.lstrip("-")
# Peek to see if next item is a value or another flag
next_arg = next(it, None)
if next_arg and not next_arg.startswith(("-", "--")):
unknown_args_dict[key] = next_arg
else:
unknown_args_dict[key] = True
if next_arg:
it = iter([next_arg] + list(it))
else:
raise ValueError(f"Argument does not start with a '-' or '--': {arg}")
except StopIteration:
pass
return unknown_args_dict
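
For illustration, a quick sketch of how a trailing flag list is converted (the flag names are arbitrary examples, and the import assumes the helper lives in `triton_cli.parser` as in this revision):

```python
# Sketch: flags followed by a value keep that value; bare flags become True.
from triton_cli.parser import turn_unknown_args_into_dict

extra = ["--concurrency", "4", "--streaming", "--measurement-interval", "5000"]
print(turn_unknown_args_into_dict(extra))
# {'concurrency': '4', 'streaming': True, 'measurement-interval': '5000'}
```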