Skip to content

Commit

Permalink
Improve error message. (#275)
Browse files (browse the repository at this point in the history)
* Improve error message.

* Formatting.
  • Loading branch information
GeorgiosSmyrnis authored May 14, 2024
1 parent 3b4a063 commit b864d15
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion open_lm/main.py
Original file line number Diff line number Diff line change
@@ -843,10 +843,14 @@ def main(args):
expected_steps = data["train"].dataloader.num_batches
if steps_done_epoch < (1 - args.data_tolerate_error_p) * expected_steps and not done_training:
num_ckpt_too_few_tokens += 1
if is_master(args):
logging.warning(
f"Epoch {epoch}, tokens seen: {steps_done_epoch * args.global_batch_size * args.seq_len}, tokens expected: {expected_steps * args.global_batch_size * args.seq_len}, ratio: {steps_done_epoch / expected_steps}"
)

if num_ckpt_too_few_tokens > args.data_tolerate_num_ckpts:
raise RuntimeError(
f"{num_ckpt_too_few_tokens} checkpoints happened where the number of tokens seen was less than {1 - args.data_tolerate_error_p} of expected. This is likely due to transient errors e.g. reading from S3."
f"{num_ckpt_too_few_tokens} checkpoints happened where the number of tokens seen was {1 - args.data_tolerate_error_p} of expected. This is likely due to transient errors e.g. reading from S3."
)

epoch = epoch + 1
Expand Down

0 comments on commit b864d15

Please sign in to comment.