triton-inference-server · rmccorm4 · Dec 14, 2023 · Dec 14, 2023 · Dec 14, 2023 · Dec 14, 2023
diff --git a/src/triton_cli/parser.py b/src/triton_cli/parser.py
@@ -242,6 +242,13 @@ def parse_args_server(subcommands):
         default="nvcr.io/nvidia/tritonserver:23.11-vllm-python-py3",
         help="Image to use when starting Triton with 'docker' mode",
     )
+    server_start.add_argument(
+        "--world-size",
+        type=int,
+        required=False,
+        default=-1,
+        help="Number of devices to deploy a tensorrtllm model.",
+    )
     add_repo_args([server_start])
 
     server_metrics = server_commands.add_parser(

diff --git a/src/triton_cli/server/server_docker.py b/src/triton_cli/server/server_docker.py
@@ -18,6 +18,7 @@
 import logging
 
 from .server import TritonServer
+from .server_utils import TritonServerUtils
 from triton_cli.constants import LOGGER_NAME
 
 logger = logging.getLogger(LOGGER_NAME)
@@ -29,7 +30,7 @@ class TritonServerDocker(TritonServer):
     triton in a docker container.
     """
 
-    def __init__(self, image, config, gpus, mounts, labels, shm_size, args):
+    def __init__(self, image, world_size, config, gpus, mounts, labels, shm_size, args):
         """
         Parameters
         ----------
@@ -60,6 +61,7 @@ def __init__(self, image, config, gpus, mounts, labels, shm_size, args):
         self._gpus = gpus
         self._shm_size = shm_size
         self._args = args if args else {}
+        self._world_size = world_size
 
         assert self._server_config[
             "model-repository"
@@ -105,11 +107,19 @@ def start(self, env=None):
             server_grpc_port: server_grpc_port,
             server_metrics_port: server_metrics_port,
         }
-
         # Construct run command
-        command = " ".join(
-            env_cmds + ["tritonserver", self._server_config.to_cli_string()]
-        )
+        # TRTLLM models require special handling. For now,
+        # we will 'spell-out' the command.
+        if self._world_size >= 1:
+            command = " ".join(
+                TritonServerUtils.mpi_run(
+                    self._world_size, self._server_config["model-repository"]
+                )
+            )
+        else:
+            command = " ".join(
+                env_cmds + ["tritonserver", self._server_config.to_cli_string()]
+            )
         try:
             # Run the docker container and run the command in the container
             self._tritonserver_container = self._docker_client.containers.run(

diff --git a/src/triton_cli/server/server_factory.py b/src/triton_cli/server/server_factory.py
@@ -38,6 +38,7 @@ class TritonServerFactory:
     @staticmethod
     def create_server_docker(
         image,
+        world_size,
         config,
         gpus,
         mounts=None,
@@ -50,6 +51,8 @@ def create_server_docker(
         ----------
         image : str
             The tritonserver docker image to pull and run
+        world_size : int
+            Number of devices to deploy a tensorrtllm model.
         config : TritonServerConfig
             the config object containing arguments for this server instance
         gpus : list of str
@@ -71,6 +74,7 @@ def create_server_docker(
 
         return TritonServerDocker(
             image=image,
+            world_size=world_size,
             config=config,
             gpus=gpus,
             mounts=mounts,
@@ -148,10 +152,10 @@ def _get_local_server_handle(config, gpus):
     def _get_docker_server_handle(config, gpus):
         triton_config = TritonServerConfig()
         triton_config["model-repository"] = os.path.abspath(config.model_repository)
-
         logger.info("Starting a Triton Server using docker")
         server = TritonServerFactory.create_server_docker(
             image=config.image,
+            world_size=config.world_size,
             config=triton_config,
             gpus=gpus,
             mounts=None,

diff --git a/src/triton_cli/server/server_utils.py b/src/triton_cli/server/server_utils.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class TritonServerUtils:
+    @staticmethod
+    def mpi_run(world_size: int, model_repo: str) -> str:
+        """Build the mpirun argv that starts `world_size` tritonservers.
+        Parameters
+        ----------
+        world_size : int
+            Number of "-n 1" tritonserver segments to emit (indexed by i)
+        model_repo : str
+            The path to the model repository
+        Returns
+        -------
+        list of str (despite the `-> str` annotation): "mpirun" plus one
+        ":"-terminated segment per i, each with shm prefix "prefix{i}_"
+        """
+        cmd = ["mpirun", "--allow-run-as-root"]
+        for i in range(world_size):
+            cmd += ["-n", "1", "/opt/tritonserver/bin/tritonserver"]
+            cmd += [
+                f"--model-repository={model_repo}",
+                "--disable-auto-complete-config",
+                f"--backend-config=python,shm-region-prefix-name=prefix{i}_",
+                ":",
+            ]
+        return cmd