Skip to content

Commit

Permalink
Update EpochTimeTracker call sites to the renamed keyword arguments (series_name → strategy_name, csv_file → save_path)
Browse files Browse the repository at this point in the history
  • Loading branch information
jarlsondre committed Oct 30, 2024
1 parent 929c4cb commit 0257693
Show file tree
Hide file tree
Showing 9 changed files with 16 additions and 10 deletions.
4 changes: 4 additions & 0 deletions src/itwinai/scalability.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ def create_absolute_plot(avg_epoch_time_df: pd.DataFrame) -> None:


def create_relative_plot(avg_epoch_time_df: pd.DataFrame, gpus_per_node: int = 4):
"""Creates a plot showing the relative training times for the different
distributed strategies and different number of GPUs. In particular, it shows the
speedup when adding more GPUs, compared to the baseline of using a single node.
"""
sns.set_theme()

fig, ax = plt.subplots(figsize=(6, 4))
Expand Down
5 changes: 3 additions & 2 deletions tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,9 @@ def main():
print('--------------------------------------------------------')
nnod = os.environ.get('SLURM_NNODES', 'unk')
epoch_time_tracker = EpochTimeTracker(
series_name="ddp-bl",
csv_file=f"epochtime_ddp-bl_{nnod}N.csv"
strategy_name="ddp-bl",
save_path=f"epochtime_ddp-bl_{nnod}N.csv",
num_nodes=int(nnod)
)

et = timer()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -222,8 +222,9 @@ def main():
print('--------------------------------------------------------')
nnod = os.environ.get('SLURM_NNODES', 'unk')
epoch_time_tracker = EpochTimeTracker(
series_name="deepspeed-bl",
csv_file=f"epochtime_deepspeed-bl_{nnod}N.csv"
strategy_name="deepspeed-bl",
save_path=f"epochtime_deepspeed-bl_{nnod}N.csv",
num_nodes=int(nnod)
)

et = timer()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,8 @@ def main():
print('--------------------------------------------------------')
nnod = os.environ.get('SLURM_NNODES', 'unk')
epoch_time_tracker = EpochTimeTracker(
series_name="horovod-bl",
csv_file=f"epochtime_horovod-bl_{nnod}N.csv"
strategy_name="horovod-bl",
save_path=f"epochtime_horovod-bl_{nnod}N.csv"
)

et = timer()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -269,8 +269,8 @@ def main():
nnod = os.environ.get('SLURM_NNODES', 'unk')
s_name = f"{args.strategy}-it"
epoch_time_tracker = EpochTimeTracker(
series_name=s_name,
csv_file=f"epochtime_{s_name}_{nnod}N.csv"
strategy_name=s_name,
save_path=f"epochtime_{s_name}_{nnod}N.csv"
)

et = timer()
Expand Down
Binary file removed use-cases/eurac/plots/comm_plot.png
Binary file not shown.
Binary file modified use-cases/eurac/plots/communication_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file removed use-cases/eurac/plots/gpu_energy_plot.png
Binary file not shown.
4 changes: 2 additions & 2 deletions use-cases/virgo/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,8 @@ def train(self):
nnod = os.environ.get('SLURM_NNODES', 'unk')
s_name = f"{os.environ.get('DIST_MODE', 'unk')}-torch"
epoch_time_tracker = EpochTimeTracker(
series_name=s_name,
csv_file=f"epochtime_{s_name}_{nnod}N.csv"
strategy_name=s_name,
save_path=f"epochtime_{s_name}_{nnod}N.csv"
)
loss_plot = []
val_loss_plot = []
Expand Down

0 comments on commit 0257693

Please sign in to comment.