
Commit

Fix bug by removing trailing newline from +cluster.nodelist in command args (#282)
TaekyungHeo authored Oct 23, 2024
1 parent ec716df commit e30e230
Showing 2 changed files with 31 additions and 24 deletions.
@@ -193,7 +193,7 @@ def _generate_cmd_args_str(self, args: Dict[str, str], nodes: List[str]) -> str:

         if nodes:
             nodes_str = ",".join(nodes)
-            cmd_arg_str_parts.append(f"+cluster.nodelist=\\'{nodes_str}\\'\n")
+            cmd_arg_str_parts.append(f"+cluster.nodelist=\\'{nodes_str}\\'")
 
         return " ".join(cmd_arg_str_parts + env_var_str_parts)
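Why the trailing newline mattered: _generate_cmd_args_str joins all of its parts with spaces into a single command-argument string, so a "\n" embedded in the nodelist part splits that string partway through, and everything joined after it (the env-var parts) lands on a second line. A minimal standalone sketch of the before/after join behavior, using stand-in part lists rather than the real strategy object:

# Repro sketch; cmd_arg_str_parts and env_var_str_parts are stand-in
# values, not the real strategy's state.
cmd_arg_str_parts = ["training.trainer.num_nodes=2"]
env_var_str_parts = ["+env_vars.TEST_VAR_1=value1"]
nodes_str = ",".join(["node1", "node2"])

# Before the fix: the embedded "\n" breaks the single-line command string.
buggy = " ".join(cmd_arg_str_parts + [f"+cluster.nodelist=\\'{nodes_str}\\'\n"] + env_var_str_parts)
print(repr(buggy))  # "... +cluster.nodelist=\\'node1,node2\\'\n +env_vars.TEST_VAR_1=value1"

# After the fix: everything stays on one shell line.
fixed = " ".join(cmd_arg_str_parts + [f"+cluster.nodelist=\\'{nodes_str}\\'"] + env_var_str_parts)
print(repr(fixed))  # "... +cluster.nodelist=\\'node1,node2\\' +env_vars.TEST_VAR_1=value1"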

@@ -55,39 +55,46 @@ def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> NeMoLauncherSlurmCommandGenStrategy:
return NeMoLauncherSlurmCommandGenStrategy(slurm_system, {})

     @pytest.mark.parametrize(
-        "expected_content",
+        "expected_content, nodes",
         [
-            [
-                "TEST_VAR_1=value1",
-                "+env_vars.TEST_VAR_1=value1",
-                'stages=["training"]',
-                "cluster.gpus_per_node=null",
-                "cluster.partition=main",
-                "numa_mapping.enable=True",
-                "training.exp_manager.create_checkpoint_callback=False",
-                "training.model.data.data_impl=mock",
-                "training.model.data.data_prefix=[]",
-                "training.model.global_batch_size=128",
-                "training.model.micro_batch_size=2",
-                "training.model.pipeline_model_parallel_size=4",
-                "training.model.tensor_model_parallel_size=4",
-                "training.run.time_limit=3:00:00",
-                "training.trainer.enable_checkpointing=False",
-                "training.trainer.log_every_n_steps=1",
-                "training.trainer.max_steps=400",
-                "training.trainer.num_nodes=2",
-                "training.trainer.val_check_interval=100",
-                "training=gpt3/40b_improved",
-            ]
+            (
+                [
+                    "TEST_VAR_1=value1",
+                    "+env_vars.TEST_VAR_1=value1",
+                    'stages=["training"]',
+                    "cluster.gpus_per_node=null",
+                    "cluster.partition=main",
+                    "numa_mapping.enable=True",
+                    "training.exp_manager.create_checkpoint_callback=False",
+                    "training.model.data.data_impl=mock",
+                    "training.model.data.data_prefix=[]",
+                    "training.model.global_batch_size=128",
+                    "training.model.micro_batch_size=2",
+                    "training.model.pipeline_model_parallel_size=4",
+                    "training.model.tensor_model_parallel_size=4",
+                    "training.run.time_limit=3:00:00",
+                    "training.trainer.enable_checkpointing=False",
+                    "training.trainer.log_every_n_steps=1",
+                    "training.trainer.max_steps=400",
+                    "training.trainer.num_nodes=2",
+                    "training.trainer.val_check_interval=100",
+                    "training=gpt3/40b_improved",
+                    "+cluster.nodelist=\\'node1,node2\\'",
+                ],
+                ["node1", "node2"],
+            ),
         ],
     )
     def test_generate_exec_command(
         self,
         cmd_gen_strategy: NeMoLauncherSlurmCommandGenStrategy,
         test_run: TestRun,
         expected_content: List[str],
+        nodes: List[str],
     ) -> None:
+        test_run.nodes = nodes
         cmd = cmd_gen_strategy.gen_exec_command(test_run)
 
         for content in expected_content:
             assert any(content in part for part in cmd.split())
         assert "training.run.name=" in cmd
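For reference, the two-name parametrize string makes pytest unpack each tuple in the parameter list positionally into the test's arguments; a minimal standalone illustration with hypothetical values (test_param_unpacking is not part of the real suite):

import pytest

@pytest.mark.parametrize(
    "expected_content, nodes",
    [
        (["+cluster.nodelist=\\'node1,node2\\'"], ["node1", "node2"]),
    ],
)
def test_param_unpacking(expected_content, nodes):
    # Each tuple element binds positionally to the comma-separated names,
    # so one case supplies both the expected strings and the node list
    # that should produce them.
    assert expected_content == ["+cluster.nodelist=\\'node1,node2\\'"]
    assert nodes == ["node1", "node2"]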