Mark training complete after last checkpoint saving is completed.

PiperOrigin-RevId: 689279915
google-research · Oct 24, 2024 · 99deeca · 99deeca
1 parent 87f7474
commit 99deeca
Showing 1 changed file with 4 additions and 3 deletions.
diff --git a/kauldron/train/train_lib.py b/kauldron/train/train_lib.py
@@ -141,15 +141,16 @@ def train_impl(
             log_summaries=log_summaries,
         )
 
+  # Ensure all hosts exit together. See section in dm/jax-faqs.
+  _sync()
+  ckpt.wait_until_finished()
+
   # Notify the eval job training is complete
   if trainer.workdir.exists():  # `TrainEvaluator` does not have a workdir
     epath.Path(trainer.workdir).joinpath(
         eval_impl.TRAIN_COMPLETE_FILENAME
     ).touch()
 
-  # Ensure all hosts exit together. See section in dm/jax-faqs.
-  _sync()
-  ckpt.wait_until_finished()
   # Returning the final state is convenient for interactive training in colab
   return state, aux