Fix step iteration bug in finetuning scripts (#1794)
rasbt authored Oct 21, 2024
1 parent 3ea3d93 commit e14e39f
Showing 4 changed files with 14 additions and 4 deletions.
5 changes: 4 additions & 1 deletion litgpt/finetune/adapter.py
@@ -268,10 +268,13 @@ def fit(
     total_lengths = 0
     total_t0 = time.perf_counter()
 
-    while step_count < max_steps and train_iterator.epoch < train.epochs:
+    while step_count < max_steps:
         iter_num += 1
         iter_t0 = time.perf_counter()
         batch = next(train_iterator)
+        if train_iterator.epoch >= train.epochs:
+            break
+
         input_ids, targets = batch["input_ids"], batch["labels"]
 
         is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0
5 changes: 4 additions & 1 deletion litgpt/finetune/adapter_v2.py
@@ -268,10 +268,13 @@ def fit(
     total_lengths = 0
     total_t0 = time.perf_counter()
 
-    while step_count < max_steps and train_iterator.epoch < train.epochs:
+    while step_count < max_steps:
         iter_num += 1
         iter_t0 = time.perf_counter()
         batch = next(train_iterator)
+        if train_iterator.epoch >= train.epochs:
+            break
+
         input_ids, targets = batch["input_ids"], batch["labels"]
 
         is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0
4 changes: 3 additions & 1 deletion litgpt/finetune/full.py
@@ -244,10 +244,12 @@ def fit(
     )
     fabric.barrier()
 
-    while state["step_count"] < max_steps and train_iterator.epoch < train.epochs:
+    while state["step_count"] < max_steps:
         state["iter_num"] += 1
         iter_t0 = time.perf_counter()
         batch = next(train_iterator)
+        if train_iterator.epoch >= train.epochs:
+            break
         input_ids, targets = batch["input_ids"], batch["labels"]
 
         is_accumulating = state["iter_num"] % train.gradient_accumulation_iters(devices) != 0
4 changes: 3 additions & 1 deletion litgpt/finetune/lora.py
@@ -299,10 +299,12 @@ def fit(
     total_lengths = 0
     total_t0 = time.perf_counter()
 
-    while step_count < max_steps and train_iterator.epoch < train.epochs:
+    while step_count < max_steps:
         iter_num += 1
         iter_t0 = time.perf_counter()
         batch = next(train_iterator)
+        if train_iterator.epoch >= train.epochs:
+            break
         input_ids, targets = batch["input_ids"], batch["labels"]
 
         is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0
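The loop pattern behind the fix, sketched minimally below: assuming a cycle-style train iterator whose `.epoch` counter only advances inside `next()` when the underlying dataloader wraps around, a check placed in the `while` condition is evaluated before that rollover, so the first batch of an epoch beyond the budget would still be trained on. Checking right after `next()` and breaking drops that batch instead. `CycleIterator`, `run`, and the plain iteration-count budget are illustrative stand-ins, not the LitGPT implementation.

# Minimal sketch of the corrected loop pattern; CycleIterator and run() are
# illustrative stand-ins under the assumptions stated above, not LitGPT code.

class CycleIterator:
    """Cycle over an iterable indefinitely, counting completed passes in `epoch`."""

    def __init__(self, iterable):
        self.iterable = iterable
        self.epoch = 0
        self._iterator = iter(iterable)

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self._iterator)
        except StopIteration:
            # The rollover happens here, inside next(); a `while` condition that
            # reads `epoch` before calling next() always sees the stale value.
            self.epoch += 1
            self._iterator = iter(self.iterable)
            return next(self._iterator)


def run(batches, max_steps, epochs):
    """Consume batches until `max_steps` iterations or `epochs` complete passes."""
    train_iterator = CycleIterator(batches)
    iter_num = 0
    consumed = []
    while iter_num < max_steps:
        batch = next(train_iterator)
        # Check after next(): if that call rolled over into an epoch beyond the
        # budget, drop the batch and stop instead of training on it.
        if train_iterator.epoch >= epochs:
            break
        iter_num += 1
        consumed.append(batch)
    return consumed


if __name__ == "__main__":
    # Two epochs over three batches yields exactly six batches, never a seventh.
    assert run([1, 2, 3], max_steps=100, epochs=2) == [1, 2, 3, 1, 2, 3]
    print("epoch budget respected")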
