From 082b4005dcf4abd179716e5f2d57c186306be531 Mon Sep 17 00:00:00 2001
From: Jacob Morrison
Date: Tue, 27 Aug 2024 09:11:00 -0700
Subject: [PATCH] 70B training + other tweaks (#295)

Adding 70B training config, plus a few small fixes/tweaks for finetune.py
---
 .../default_finetune_multinode.yaml          |  4 ---
 .../sft/tulu3_L3.1_70b_preview_mix_v3.3.yaml | 30 +++++++++++++++++++
 open_instruct/finetune.py                    | 13 ++++++--
 scripts/submit_finetune_job.py               | 28 +++++++++++++++++
 4 files changed, 68 insertions(+), 7 deletions(-)
 create mode 100644 configs/train_configs/sft/tulu3_L3.1_70b_preview_mix_v3.3.yaml

diff --git a/configs/beaker_configs/default_finetune_multinode.yaml b/configs/beaker_configs/default_finetune_multinode.yaml
index 8c7058eef..91960348c 100644
--- a/configs/beaker_configs/default_finetune_multinode.yaml
+++ b/configs/beaker_configs/default_finetune_multinode.yaml
@@ -60,10 +60,6 @@ tasks:
         value: false
       - name: WANDB_DISABLED
         value: true
-      - name: NCCL_NET
-        value: IB
-      - name: NCCL_DEBUG
-        value: INFO
       - name: HF_TOKEN
         secret: HF_TOKEN
     result:
diff --git a/configs/train_configs/sft/tulu3_L3.1_70b_preview_mix_v3.3.yaml b/configs/train_configs/sft/tulu3_L3.1_70b_preview_mix_v3.3.yaml
new file mode 100644
index 000000000..6cd422f37
--- /dev/null
+++ b/configs/train_configs/sft/tulu3_L3.1_70b_preview_mix_v3.3.yaml
@@ -0,0 +1,30 @@
+model_name_or_path: meta-llama/Meta-Llama-3.1-70B
+model_revision: main
+use_flash_attn: true
+tokenizer_name: meta-llama/Meta-Llama-3.1-70B
+use_slow_tokenizer: true
+dataset_mixer:
+  # Tulu V2 datasets
+  ai2-adapt-dev/llama-3-tulu-v2-sft-mixture-with-subset-llama-405b-completions-code_alpaca-open_orca-gpt4_alpaca: 326154
+  # Tulu V3 datasets (WIP)
+  HuggingFaceH4/no_robots: 9500 # all
+  ai2-adapt-dev/metamath-qa-reformat: 100000
+  ai2-adapt-dev/codefeedback-single-turn-reformat: 156526 # all
+  nvidia/Daring-Anteater: 99532 # all
+max_seq_length: 4096
+preprocessing_num_workers: 128
+per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
+gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes
+learning_rate: 5.0e-06 # best LR so far
+lr_scheduler_type: linear
+warmup_ratio: 0.03
+weight_decay: 0.0
+num_train_epochs: 2
+output_dir: /output/
+with_tracking: true
+report_to:
+  - wandb
+logging_steps: 1
+checkpointing_steps: epoch
+dataset_mix_dir: /output/
+gradient_checkpointing: true
diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py
index f8cf10a28..3119fddd3 100644
--- a/open_instruct/finetune.py
+++ b/open_instruct/finetune.py
@@ -318,6 +318,12 @@ class FlatArguments:
     """The url of the saved model in the Hugging Face Hub (will be autoset)"""
     try_launch_beaker_eval_jobs: bool = True
     """Whether to launch beaker evaluation jobs after training"""
+    fused_optimizer: bool = field(
+        default=True,
+        metadata={
+            "help": "Whether to use fused AdamW or not.",
+        },
+    )
 
     def __post_init__(self):
         if self.reduce_loss not in ["mean", "sum"]:
@@ -598,7 +604,7 @@ def main(args: FlatArguments):
             device_map=device_map,
             trust_remote_code=args.trust_remote_code,
             torch_dtype=torch.bfloat16,
-            use_flash_attention_2=True if args.use_flash_attn else False,
+            attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
             revision=args.model_revision,
             token=os.getenv("HF_TOKEN", None),
         )
@@ -609,7 +615,8 @@ def main(args: FlatArguments):
                 config=config,
                 trust_remote_code=args.trust_remote_code,
                 low_cpu_mem_usage=args.low_cpu_mem_usage,
-                use_flash_attention_2=True if args.use_flash_attn else False,
+                torch_dtype=torch.bfloat16,
+                attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
                 revision=args.model_revision,
                 token=os.getenv("HF_TOKEN", None),
             )
@@ -780,7 +787,7 @@ def main(args: FlatArguments):
             is_paged=True,
         )
     else:
-        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, fused=args.fused_optimizer)
 
     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False
diff --git a/scripts/submit_finetune_job.py b/scripts/submit_finetune_job.py
index d3538c128..ef4e87138 100644
--- a/scripts/submit_finetune_job.py
+++ b/scripts/submit_finetune_job.py
@@ -170,6 +170,34 @@ def parse_args(args):
     d['description'] = exp_name
     d['tasks'][0]['name'] = exp_name
 
+    # add cluster-specific env vars
+    if args.cluster == "ai2/jupiter-cirrascale-2":
+        d['tasks'][0]['envVars'] += [
+            {
+                "name": "NCCL_SOCKET_IFNAME",
+                "value": "ib",
+            },
+            {
+                "name": "NCCL_IB_HCA",
+                "value": "^=mlx5_bond_0",
+            },
+            {
+                "name": "NCCL_DEBUG",
+                "value": "INFO",
+            },
+        ]
+    elif args.cluster == "ai2/pluto-cirrascale":
+        d['tasks'][0]['envVars'] += [
+            {
+                "name": "NCCL_IB_HCA",
+                "value": "^=mlx5_1,mlx5_2",
+            },
+            {
+                "name": "NCCL_DEBUG",
+                "value": "INFO",
+            },
+        ]
+
     # WANDB settings
     for env in d['tasks'][0]['envVars']:
         if env['name'] == "WANDB_DISABLED":