
Add epochs to levanter #768

Open · ahmeda14960 wants to merge 22 commits into main
Conversation

ahmeda14960 (Contributor):

Adds epoch support behind a boolean flag: when enabled, training keeps epoching over the dataset and the current epoch is tracked throughout training. Should be backwards compatible with existing checkpoints.
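A toy sketch of the index-recycling idea behind the flag (plain Python with made-up numbers, not levanter code):

# Recycle global indices over a finite dataset so sampling can continue past one pass,
# while the epoch number stays recoverable from the global index.
data = list(range(10))            # pretend dataset of 10 sequences
for global_idx in range(25):      # keep drawing well past the end of one pass
    epoch, local_idx = divmod(global_idx, len(data))
    sample = data[local_idx]      # wraps around; epoch goes 0, 1, 2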

ahmeda14960 marked this pull request as ready for review October 16, 2024 23:34
@@ -63,6 +63,57 @@

DEFAULT_IGNORE_INDEX = -100 # Mirrors pytorch's default ignore index

class TokenSeqEpochDataset(AsyncDataset[np.ndarray]):
Member:

let's just make EpochDataset that wraps an arbitrary dataset.

Member:

ChatGPT and I made this:

from typing import Optional, Sequence, TypeVar

# Assumes AsyncDataset is importable from levanter's data module; adjust the path if needed.
from levanter.data import AsyncDataset

T_co = TypeVar("T_co", covariant=True)

class EpochDataset(AsyncDataset[T_co]):
    """
    A dataset that wraps another dataset, providing infinite epochs by recycling indices.
    If `max_epochs` is specified, it limits the number of cycles before raising StopIteration.
    
    :param dataset: The dataset to wrap.
    :param max_epochs: The maximum number of epochs to cycle through. If None, cycle indefinitely.
    """
    def __init__(self, dataset: AsyncDataset[T_co], max_epochs: Optional[int] = None):
        if dataset.is_finite():
            raise ValueError("Cannot apply epoching to a finite dataset.")

        self.dataset = dataset
        self.max_epochs = max_epochs

    async def async_len(self) -> int:
        if self.max_epochs is None:
            raise ValueError("Cannot determine length of an infinite dataset without max_epochs.")
        # Return the total number of samples: max_epochs * length of the dataset
        return self.max_epochs * await self.dataset.async_len()

    async def final_length_is_known(self) -> bool:
        return await self.dataset.final_length_is_known()

    def is_finite(self) -> bool:
        # EpochDataset can be finite if max_epochs is set.
        return self.max_epochs is not None

    async def current_len(self) -> Optional[int]:
        # If max_epochs is None, the dataset is effectively infinite.
        if self.max_epochs is None:
            return None

        # If the final length of the dataset is not known, return the current length of the underlying dataset.
        if not await self.dataset.final_length_is_known():
            return await self.dataset.current_len()

        # If the final length is known, return the max_epochs * async_len of the dataset.
        return self.max_epochs * await self.dataset.async_len()

    async def get_batch(self, indices: Sequence[int]) -> Sequence[T_co]:
        # Wait until the underlying dataset is long enough (or its final length is known)
        # so we can wrap indices into its true length.
        max_index = max(indices)
        ds_len = await self.dataset.wait_until_len_at_least(max_index + 1)

        # Determine the epoch based on the largest index
        epoch = max_index // ds_len

        # If max_epochs is specified, raise an error if the epoch exceeds the allowed number of epochs
        if self.max_epochs is not None and epoch >= self.max_epochs:
            raise StopIteration(f"Reached maximum number of epochs: epoch {epoch} exceeds the maximum allowed {self.max_epochs}")

        # Wrap the indices within the bounds of the dataset length
        wrapped_indices = [idx % ds_len for idx in indices]

        # Delegate to the underlying dataset's get_batch
        return await self.dataset.get_batch(wrapped_indices)

    async def wait_until_len_at_least(self, length: int) -> int:
        """
        Returns the length of the dataset once it is at least `length` or if the dataset has a known (finished) length.

        If the dataset's actual length is less than `length`, it returns the minimum of async_len and the current length.
        """
        # Wait until the underlying dataset's length is at least `length`
        if not self.is_finite(): return length
        
        if await self.dataset.final_length_is_known(): 
            base_length = await self.dataset.async_len()
        else:
            base_length = await self.dataset.wait_until_len_at_least(length)

        if base_length < length:  # hit epoch boundary
            assert self.max_epochs is not None
            return self.max_epochs * base_length

        return base_length
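For reference, a minimal usage sketch of the wrapper above, assuming token_dataset is some existing finite AsyncDataset and that the finite-dataset check in __init__ is dropped (as discussed in the follow-up comment below):

import asyncio

async def demo(token_dataset):
    # Cycle the underlying dataset for at most two passes.
    epoched = EpochDataset(token_dataset, max_epochs=2)

    base_len = await token_dataset.async_len()
    # Indices past the end of the base dataset wrap back into it (second epoch).
    return await epoched.get_batch([0, base_len - 1, base_len])

# asyncio.run(demo(token_dataset))  # token_dataset: any finite AsyncDataset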

ahmeda14960 (Contributor, author):

FYI I removed the "cannot apply epoching to a finite dataset" check, since it seems like a bug: epoching is exactly what you'd want to do with a finite dataset.

@@ -27,6 +27,7 @@

from levanter.tensorstore_serialization import tree_deserialize_leaves_tensorstore, tree_serialize_leaves_tensorstore
from levanter.types import FilterSpec
# from levanter.trainer import StepInfo
Member:

rm

return # Can't calculate epochs without dataset size

# Calculate current epoch from steps without modifying StepInfo
current_epoch = (step_info.step * self.batch_size) // self.total_dataset_size
Member:

we should probably just be tracking this explicitly in StepInfo, but this is fine right now
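A quick sanity check of the arithmetic in that snippet (illustrative numbers only):

# Hypothetical numbers: a dataset of 1,000,000 sequences, 1,024 sequences per training step.
step, batch_size, total_dataset_size = 2_000, 1_024, 1_000_000
current_epoch = (step * batch_size) // total_dataset_size
assert current_epoch == 2  # 2,048,000 sequences consumed, i.e. the third pass (epoch index 2)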

@@ -27,6 +27,7 @@

from levanter.tensorstore_serialization import tree_deserialize_leaves_tensorstore, tree_serialize_leaves_tensorstore
from levanter.types import FilterSpec
# from levanter.trainer import StepInfo
Member:

Suggested change: delete the line "# from levanter.trainer import StepInfo"

dlwh mentioned this pull request Oct 25, 2024