Skip to content

Commit

Permalink
stash
Browse files (browse the repository at this point in the history)
  • Loading branch information
BalaBalaYi committed Oct 16, 2024
1 parent cc8c8f0 commit c123b9c
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 2 deletions.
4 changes: 2 additions & 2 deletions dlrover/python/master/node/event_callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def _stop_job_if_needed(self, node: Node):
reason=job_exit_reason,
msg=(
"The number of worker failure exceeds the "
- f"worker count {self._total_worker_num} "
+ f"max failure limit: {max_failure_num}."
),
)
elif self._available_worker_num < self._min_node:
Expand All @@ -343,6 +343,6 @@ def _stop_job_if_needed(self, node: Node):
reason=job_exit_reason,
msg=(
"The available number of worker is less than the minimum"
- f"number {self._min_node} of redzv "
+ f"number {self._min_node} of rdzv."
),
)
1 change: 1 addition & 0 deletions dlrover/python/master/node/training_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ def relaunch_node(self, node: Node, remove_exited_node=False):
name=self._new_node_name_fn(node.type, new_id),
service_addr=node.service_addr,
relaunch_count=relaunch_node.relaunch_count,
+ max_relaunch_count=relaunch_node.max_relaunch_count,
)
)
if remove_exited_node and not node.is_released and node.exited():
Expand Down
2 changes: 2 additions & 0 deletions dlrover/python/tests/test_worker_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,12 @@ def test_relaunch_node(self):
)
failed_worker = self._job_nodes[NodeType.WORKER][4]
failed_worker.status = NodeStatus.FAILED
+ failed_worker.max_relaunch_count = 3
plan = worker_manager.relaunch_node(
failed_worker, remove_exited_node=True
)
self.assertEqual(plan.launch_nodes[0].config_resource.cpu, 16)
+ self.assertEqual(plan.launch_nodes[0].max_relaunch_count, 3)
self.assertEqual(worker_manager._nodes[5].id, 5)
self.assertEqual(plan.remove_nodes[0].config_resource.cpu, 16)

Expand Down

0 comments on commit c123b9c

Please sign in to comment.