Skip to content

Commit

Permalink
add exception handler in _get_master_addr_port since the port might b…
Browse files Browse the repository at this point in the history
…e null
  • Loading branch information
majieyue committed Oct 12, 2024
1 parent f10ba6c commit 94106cd
Showing 1 changed file with 12 additions and 1 deletion.
13 changes: 12 additions & 1 deletion dlrover/python/elastic_agent/torch/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ def _rendezvous(self, worker_group: WorkerGroup) -> None:
spec.master_port,
)

master_addr, master_port = self._get_master_addr_port(store)
master_addr, master_port = self._safe_get_master_addr_port(store)

# compatible with torch 2.4
if not version_less_than_240():
Expand Down Expand Up @@ -543,6 +543,17 @@ def _get_master_addr_port(self, store: Store) -> Tuple[str, int]:
master_port = int(store.get("MASTER_PORT").decode(encoding="UTF-8"))
return (master_addr, master_port)

def _safe_get_master_addr_port(
    self,
    store: Store,
    *,
    max_attempts: int = 4,
    retry_interval: float = 10,
) -> Tuple[str, int]:
    """Call ``_get_master_addr_port`` with retries.

    The lookup is retried because it can raise while the values in the
    store are not usable yet (the originating commit mentions that the
    port might be null).

    Args:
        store: The store handed through to ``_get_master_addr_port``.
        max_attempts: Total number of lookups to try before giving up.
        retry_interval: Seconds to sleep between two attempts.

    Returns:
        The ``(master_addr, master_port)`` tuple from the first
        successful lookup.

    Raises:
        ValueError: If every attempt failed. The last underlying
            exception is attached as ``__cause__``.
    """
    # NOTE(review): Codecov reported that the retry (except) path and the
    # final raise below were not covered by tests when this was added.
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            addr, port = self._get_master_addr_port(store)
            return (addr, port)
        except Exception as e:
            last_error = e
            if attempt >= max_attempts:
                # No retry follows, so do not sleep before failing.
                logger.warning(
                    f"_get_master_addr_port failed with exception {e}, "
                    f"giving up after {attempt} attempts"
                )
                break
            logger.warning(
                f"_get_master_addr_port failed with exception {e}, "
                "will try again"
            )
            time.sleep(retry_interval)

    # Chain the last failure so the root cause shows up in the traceback.
    raise ValueError(
        "invalid value in _get_master_addr_port"
    ) from last_error

Check warning on line 555 in dlrover/python/elastic_agent/torch/training.py

View check run for this annotation

Codecov / codecov/patch

dlrover/python/elastic_agent/torch/training.py#L555

Added line #L555 was not covered by tests

def _get_socket_with_port(self) -> socket.socket:
"""Return a free port on localhost.
Expand Down

0 comments on commit 94106cd

Please sign in to comment.