NVIDIA · TaekyungHeo · Oct 22, 2024 · Oct 22, 2024 · Oct 23, 2024 · Oct 23, 2024
diff --git a/conf/common/plugin/nccl_test_epilogue.toml b/conf/common/plugin/nccl_test_epilogue.toml
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nccl_test_epilogue"
+
+[[Tests]]
+id = "Tests.1"
+test_name = "nccl_test_all_gather"
+time_limit = "00:20:00"
diff --git a/conf/common/plugin/nccl_test_prologue.toml b/conf/common/plugin/nccl_test_prologue.toml
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nccl_test_prologue"
+
+[[Tests]]
+id = "Tests.1"
+test_name = "nccl_test_all_reduce"
+time_limit = "00:20:00"
diff --git a/conf/common/plugin/test/nccl_test_all_gather.toml b/conf/common/plugin/test/nccl_test_all_gather.toml
@@ -0,0 +1,33 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nccl_test_all_gather"
+description = "all_gather"
+test_template_name = "NcclTest"
+
+[cmd_args]
+"subtest_name" = "all_gather_perf_mpi"
+"ngpus" = "1"
+"minbytes" = "128"
+"maxbytes" = "4G"
+"iters" = "100"
+"warmup_iters" = "50"
+
+[extra_cmd_args]
+"--stepfactor" = "2"
+
+[extra_env_vars]
+"NCCL_TEST_SPLIT_MASK" = "0x7"
diff --git a/conf/common/plugin/test/nccl_test_all_reduce.toml b/conf/common/plugin/test/nccl_test_all_reduce.toml
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nccl_test_all_reduce"
+description = "all_reduce"
+test_template_name = "NcclTest"
+
+[cmd_args]
+"subtest_name" = "all_reduce_perf_mpi"
+"ngpus" = "1"
+"minbytes" = "128"
+"maxbytes" = "16G"
+"iters" = "100"
+"warmup_iters" = "50"
+
+[extra_cmd_args]
+"--stepfactor" = "2"
diff --git a/conf/common/test_scenario/nccl_test.toml b/conf/common/test_scenario/nccl_test.toml
@@ -15,6 +15,10 @@
 # limitations under the License.
 
 name = "nccl-test"
+
+prologue = "nccl_test_prologue"
+epilogue = "nccl_test_epilogue"
+
 [[Tests]]
 id = "Tests.1"
 test_name = "nccl_test_all_reduce"

diff --git a/src/cloudai/_core/command_gen_strategy.py b/src/cloudai/_core/command_gen_strategy.py
@@ -39,3 +39,29 @@ def gen_exec_command(self, tr: TestRun) -> str:
             str: The generated execution command.
         """
         pass
+
+    @abstractmethod
+    def gen_srun_command(self, tr: TestRun) -> str:
+        """
+        Generate the Slurm srun command for a test based on the given parameters.
+
+        Args:
+            tr (TestRun): Contains the test and its run-specific configurations.
+
+        Returns:
+            str: The generated Slurm srun command.
+        """
+        pass  # abstract: no default behavior here; subclasses must override this method
+
+    @abstractmethod
+    def gen_srun_success_check(self, tr: TestRun) -> str:
+        """
+        Generate the Slurm success check command to verify if a test run was successful.
+
+        Args:
+            tr (TestRun): Contains the test and its run-specific configurations.
+
+        Returns:
+            str: The generated command to check the success of the test run.
+        """
+        pass  # abstract: no default behavior here; subclasses must override this method
diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py
@@ -58,6 +58,8 @@ class TestRun:
     weight: float = 0.0
     ideal_perf: float = 1.0
     dependencies: dict[str, TestDependency] = field(default_factory=dict)
+    prologue: Optional["TestScenario"] = None
+    epilogue: Optional["TestScenario"] = None
 
     def __hash__(self) -> int:
         return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration))

diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py
@@ -54,6 +54,8 @@ class _TestScenarioTOML(BaseModel):
     name: str
     job_status_check: bool = True
     tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1)
+    prologue: str = ""
+    epilogue: str = ""
 
     @model_validator(mode="after")
     def check_no_self_dependency(self):
@@ -99,9 +101,10 @@ class TestScenarioParser:
 
     __test__ = False
 
-    def __init__(self, file_path: Path, test_mapping: Dict[str, Test]) -> None:
+    def __init__(self, file_path: Path, test_mapping: Dict[str, Test], plugin_mapping: Dict[str, TestScenario]) -> None:
         self.file_path = file_path
         self.test_mapping = test_mapping
+        self.plugin_mapping = plugin_mapping  # scenarios keyed by name; looked up to resolve prologue/epilogue
 
     def parse(self) -> TestScenario:
         """
@@ -136,8 +139,14 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario:
         total_weight = sum(tr.weight for tr in ts_model.tests)
         normalized_weight = 0 if total_weight == 0 else 100 / total_weight
 
+        prologue_name = data.get("prologue", "")
+        epilogue_name = data.get("epilogue", "")
+
+        prologue = self.plugin_mapping.get(prologue_name, None) if prologue_name else None
+        epilogue = self.plugin_mapping.get(epilogue_name, None) if epilogue_name else None
+
         testruns_by_id: dict[str, TestRun] = {
-            tr.id: self._create_section_test_run(tr, normalized_weight) for tr in ts_model.tests
+            tr.id: self._create_section_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests
         }
 
         tests_data: dict[str, _TestRunTOML] = {tr.id: tr for tr in ts_model.tests}
@@ -153,13 +162,21 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario:
             job_status_check=ts_model.job_status_check,
         )
 
-    def _create_section_test_run(self, test_info: _TestRunTOML, normalized_weight: float) -> TestRun:
+    def _create_section_test_run(
+        self,
+        test_info: _TestRunTOML,
+        normalized_weight: float,
+        prologue: Optional[TestScenario],
+        epilogue: Optional[TestScenario],
+    ) -> TestRun:
         """
         Create a section-specific Test object by copying from the test mapping.
 
         Args:
             test_info (Dict[str, Any]): Information of the test.
             normalized_weight (float): Normalized weight for the test.
+            prologue (Optional[TestScenario]): TestScenario object representing the prologue sequence.
+            epilogue (Optional[TestScenario]): TestScenario object representing the epilogue sequence.
 
         Returns:
             Test: Copied and updated Test object for the section.
@@ -192,5 +209,7 @@ def _create_section_test_run(self, test_info: _TestRunTOML, normalized_weight: f
             sol=test_info.sol,
             weight=test_info.weight * normalized_weight,
             ideal_perf=test_info.ideal_perf,
+            prologue=prologue if prologue is not None else TestScenario(name="default_prologue", test_runs=[]),
+            epilogue=epilogue if epilogue is not None else TestScenario(name="default_epilogue", test_runs=[]),
         )
         return tr
diff --git a/src/cloudai/_core/test_template.py b/src/cloudai/_core/test_template.py
@@ -133,6 +133,40 @@ def gen_exec_command(self, tr: TestRun) -> str:
             )
         return self.command_gen_strategy.gen_exec_command(tr)
 
+    def gen_srun_command(self, tr: TestRun) -> str:
+        """
+        Generate a Slurm srun command for a test using the provided command generation strategy.
+
+        Args:
+            tr (TestRun): Contains the test and its run-specific configurations.
+
+        Returns:
+            str: The generated Slurm srun command.
+        """
+        if self.command_gen_strategy is None:  # fail with guidance instead of an AttributeError on the call below
+            raise ValueError(
+                "command_gen_strategy is missing. Ensure the strategy is registered in the Registry "
+                "by calling the appropriate registration function for the system type."
+            )
+        return self.command_gen_strategy.gen_srun_command(tr)
+
+    def gen_srun_success_check(self, tr: TestRun) -> str:
+        """
+        Generate a Slurm success check command for a test using the provided command generation strategy.
+
+        Args:
+            tr (TestRun): Contains the test and its run-specific configurations.
+
+        Returns:
+            str: The generated command to check the success of the test run.
+        """
+        if self.command_gen_strategy is None:  # fail with guidance instead of an AttributeError on the call below
+            raise ValueError(
+                "command_gen_strategy is missing. Ensure the strategy is registered in the Registry "
+                "by calling the appropriate registration function for the system type."
+            )
+        return self.command_gen_strategy.gen_srun_success_check(tr)
+
     def gen_json(self, tr: TestRun) -> Dict[Any, Any]:
         """
         Generate a JSON string representing the Kubernetes job specification for this test using this template.

diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
@@ -114,7 +114,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
         args (argparse.Namespace): The parsed command-line arguments.
     """
     parser = Parser(args.system_config)
-    system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario)
+    system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, Path("conf/common/plugin"))
     assert test_scenario is not None
 
     if args.output_dir: