fix for EOS/PAD tokens when not gpt-neox
Jeffrey committed May 24, 2024
1 parent c65b430 commit dee37d0
Showing 2 changed files with 38 additions and 10 deletions.
46 changes: 37 additions & 9 deletions open_lm/datapreprocess/ray/tokenize_shuffle.py
@@ -257,10 +257,9 @@ def preprocess(
sources: enum.Enum = None,
source_counter: GlobalCounter = None,
):
-tokenizer_fn, vocab_size = tokenizer
+tokenizer_fn, EOS, PAD = tokenizer
rng = random.Random(hash(key) + seed)
-EOT = SpecialTokens.END_OF_TEXT.value % (vocab_size + len(SpecialTokens))
-PAD = SpecialTokens.PAD.value % (vocab_size + len(SpecialTokens))
+
if do_sample:
assert sources is not None
sample_freq = sources.get_sampling_frequency(key)
@@ -274,7 +273,7 @@ def preprocess(
pbar.set_description(key)
for string in pbar:
tokens = tokenizer_fn(string)
-tokens.append(EOT)
+tokens.append(EOS)
buffer += tokens
idx = 0
while idx < len(buffer) - seqlen:
@@ -442,7 +441,7 @@ def write_to_location(folder, tar_name, bio):
assert False, f"error is {path} and {e}"


-def load_tokenizer(tokenizer):
+def load_tokenizer(tokenizer, eos_overwrite=None, pad_overwrite=None):
enc = None
if pathlib.Path(tokenizer).exists() and pathlib.Path(tokenizer).is_file():
enc = PreTrainedTokenizerFast(tokenizer_file=tokenizer)
@@ -453,7 +452,24 @@ def load_tokenizer(tokenizer):
print(str(e))
raise ValueError(f"Unknown Tokenizer: {tokenizer}")

-return (lambda x: enc(x).input_ids, enc.vocab_size)
+eos_token_id, pad_token_id = enc.eos_token_id, enc.pad_token_id
+
+if eos_overwrite is not None:
+    if eos_token_id is not None and eos_overwrite != eos_token_id:
+        logger.warning(f"Default EOS id for {tokenizer} is {eos_token_id} and you are overriding it to be {eos_overwrite}. This may cause issues during training.")
+    eos_token_id = eos_overwrite
+
+if pad_overwrite is not None:
+    if pad_overwrite != pad_token_id:
+        logger.warning(f"Default PAD id for {tokenizer} is {pad_token_id} and you are overriding it to be {pad_overwrite}. This may cause issues during training.")
+    pad_token_id = pad_overwrite
+
+if eos_token_id is None:
+    raise ValueError("Tokenizer does not have a specified EOS token id. Please manually pass one in via --eos_overwrite")
+if pad_token_id is None:
+    raise ValueError("Tokenizer does not have a specified PAD token id. Please manually pass one in via --pad_overwrite")
+
+return (lambda x: enc(x).input_ids, eos_token_id, pad_token_id)


def glob_files(path, suffixes):
@@ -557,7 +573,7 @@ def main(args):
parser.add_argument("--content_key", type=str, default="text")
parser.add_argument("--seqlen", type=int, default=2048)
parser.add_argument("--tokenizer", type=str, default="EleutherAI/gpt-neox-20b")
parser.add_argument("--vocab_size", type=int, default=None) # for pre-tokenized data, don't load tokenizer
parser.add_argument("--pretokenized", action='store_true') # For pre-tokenized data, don't load tokenizer
parser.add_argument("--wds_chunk_size", type=int, default=8192)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--subset", type=int, default=None)
@@ -576,6 +592,8 @@ def main(args):
parser.add_argument("--suffixes", nargs="+", default=[".json", ".jsonl", ".zst", ".zstd", ".tar", ".gz"])
parser.add_argument("--presort", action="store_true")
parser.add_argument("--allow_imbalanced_write", action="store_true")
parser.add_argument("--eos_overwrite", type=int, default=None)
parser.add_argument("--pad_overwrite", type=int, default=None)

args = parser.parse_args(args)
if args.do_sample:
@@ -608,6 +626,11 @@ def main(args):
dashboard_host=args.ray_dashboard_host,
)
num_nodes = len(ray.nodes())
+
+
+SpecialTokens = enum.Enum
+Sources = enum.Enum("Sources", {item["source"]: index for index, item in enumerate(data["sources"])})
+
input_folders = args.input.split(",")
input_paths = []
for inp_folder in input_folders:
@@ -637,7 +660,12 @@ def main(args):
ctx.execution_options.resource_limits.object_store_memory = float("inf")
ray.data.DataContext.get_current().execution_options.verbose_progress = True
start_time = time.time()
-tokenizer = load_tokenizer(args.tokenizer) if args.vocab_size is None else (lambda x: x, args.vocab_size)
+
+if args.pretokenized:
+    tokenizer = (lambda x: x, args.eos_overwrite, args.pad_overwrite)
+else:
+    tokenizer = load_tokenizer(args.tokenizer, args.eos_overwrite, args.pad_overwrite)
+
logger.info(f"Total number of keys = {len(input_paths)}")
df = pd.DataFrame(input_paths, columns=["path"])
ds = ray.data.from_pandas(pd.DataFrame(input_paths, columns=["path"])).repartition(parallelism)
@@ -687,7 +715,7 @@ def main(args):
ds = ds.repartition(1)
ds = ds.sort(key="shard")
jsonl_lines = ds.take_all()
-token_count_from_manifest = sum([x["num_sequences"][0] for x in jsonl_lines] * seqlen)
+token_count_from_manifest = sum([x["num_sequences"] for x in jsonl_lines] * seqlen)
write_manifest(jsonl_lines, args)
else:
write_status = ds.map_batches(
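For orientation, here is a minimal, illustrative sketch of the new loader contract introduced in tokenize_shuffle.py above: load_tokenizer now returns a (tokenize_fn, eos_token_id, pad_token_id) tuple instead of (tokenize_fn, vocab_size), and preprocess appends the tokenizer's own EOS id after each document. The demo_load_tokenizer name and the pad id value below are assumptions for illustration, not code from the repository.

from transformers import AutoTokenizer

def demo_load_tokenizer(name, eos_overwrite=None, pad_overwrite=None):
    # Mirrors the (tokenize_fn, eos_token_id, pad_token_id) contract shown above;
    # this is not the repository's implementation.
    enc = AutoTokenizer.from_pretrained(name)
    eos_token_id = eos_overwrite if eos_overwrite is not None else enc.eos_token_id
    pad_token_id = pad_overwrite if pad_overwrite is not None else enc.pad_token_id
    if eos_token_id is None or pad_token_id is None:
        raise ValueError("EOS/PAD id missing; pass eos_overwrite / pad_overwrite explicitly")
    return (lambda x: enc(x).input_ids, eos_token_id, pad_token_id)

# The pad id of 1 is an assumed placeholder; use whatever your tokenizer defines.
tokenize_fn, EOS, PAD = demo_load_tokenizer("EleutherAI/gpt-neox-20b", pad_overwrite=1)
tokens = tokenize_fn("hello world")
tokens.append(EOS)  # as in the updated preprocess(), each document ends with the tokenizer's EOS id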
2 changes: 1 addition & 1 deletion tests/test_tokenize_shuffle.py
@@ -40,7 +40,7 @@ def test_tokenize_shuffle_tar(content_key, NUM_TOKENS):

params = f"--content_key {content_key}"
if content_key == "npy":
-params += " --vocab_size 16384"
+params += " --pretokenized"

exit_value = os.system(
f"python open_lm/datapreprocess/ray/tokenize_shuffle.py --input s3://dcnlp-west-test/tokenize_shuffle_test/webvid_tiny/ {params} --output test_output/ --seqlen {content_len}"
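Below is a hedged usage sketch of the new command-line flags, written in the same os.system style as the test above. The bucket path, output directory, and id values are placeholders, not values from the repository or its tests.

import os

# Hypothetical invocation for pre-tokenized input: skip the tokenizer and pass
# the EOS/PAD ids explicitly via the new flags. All paths and ids are placeholders.
exit_value = os.system(
    "python open_lm/datapreprocess/ray/tokenize_shuffle.py"
    " --input s3://my-bucket/pretokenized/ --output test_output/"
    " --content_key npy --pretokenized --eos_overwrite 0 --pad_overwrite 1"
    " --seqlen 2048"
)
assert exit_value == 0

# For a tokenizer whose config defines no PAD token, only --pad_overwrite is needed;
# otherwise load_tokenizer raises a ValueError asking for it.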
