Commit

Update CodeLlama configs (and fix a couple Phi3 ones) (#1358)
joecummings authored Aug 17, 2024
1 parent 367e9ab commit 8bb3a6f
Showing 7 changed files with 61 additions and 65 deletions.
36 changes: 17 additions & 19 deletions recipes/configs/code_llama2/7B_full_low_memory.yaml
@@ -1,5 +1,5 @@
# Config for single device full finetuning in full_finetune_single_device.py
# using a Code-Llama2 7B model
# using a CodeLlama 7B model
#
# This config assumes that you've run the following command before launching
# this run:
@@ -19,22 +19,16 @@
#
# This config works only for training on single device.

# Model arguments
model:
_component_: torchtune.models.code_llama2.code_llama2_7b

# Tokenizer
tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer
path: /tmp/CodeLlama-7b-hf/tokenizer.model

# Dataset
dataset:
_component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

# Model Arguments
model:
_component_: torchtune.models.code_llama2.code_llama2_7b

# Checkpointer
checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
checkpoint_dir: /tmp/CodeLlama-7b-hf
@@ -48,32 +42,36 @@ checkpointer:
model_type: LLAMA2
resume_from_checkpoint: False

# Dataset
dataset:
_component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 2
epochs: 3
gradient_accumulation_steps: 1
optimizer:
_component_: bitsandbytes.optim.PagedAdamW
lr: 2e-5
optimizer_in_bwd: True
loss:
_component_: torch.nn.CrossEntropyLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False

# Training environment
# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True

# Reduced precision
dtype: bf16

# Logging
output_dir: /tmp/codellama_finetune_output
metric_logger:
_component_: torchtune.utils.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/code_llama2_finetune
log_dir: /tmp/CodeLlama-7b-hf/logs
log_every_n_steps: 1
log_peak_memory_stats: False
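
For orientation, the _component_ entries in these configs are dotted import paths that the recipe resolves into Python objects, with the sibling keys passed as constructor arguments. Below is a minimal sketch of that pattern, assuming a hypothetical instantiate() helper built on importlib; it is not torchtune's actual config loader, and the optimizer line is left as a comment because it needs model parameters (and an installed bitsandbytes) at call time.

import importlib

def instantiate(spec: dict, **extra):
    # Resolve {"_component_": "pkg.mod.Attr", ...} into Attr(**remaining_keys, **extra).
    # Hypothetical helper for illustration; torchtune ships its own config utilities.
    spec = dict(spec)
    module_path, _, attr = spec.pop("_component_").rpartition(".")
    cls = getattr(importlib.import_module(module_path), attr)
    return cls(**spec, **extra)

# Mirrors the loss and optimizer nodes of 7B_full_low_memory.yaml.
loss_fn = instantiate({"_component_": "torch.nn.CrossEntropyLoss"})
# optimizer = instantiate({"_component_": "bitsandbytes.optim.PagedAdamW", "lr": 2e-5},
#                         params=model.parameters())
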
38 changes: 19 additions & 19 deletions recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -1,5 +1,5 @@
# Config for single device full finetuning in full_finetune_single_device.py
# using a Code-Llama2 7B model
# using a CodeLlama 7B model
#
# This config assumes that you've run the following command before launching
# this run:
@@ -29,13 +29,7 @@ tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer
path: /tmp/CodeLlama-7b-hf/tokenizer.model

# Dataset
dataset:
_component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True


# Checkpointer
checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
checkpoint_dir: /tmp/CodeLlama-7b-hf
@@ -51,45 +45,51 @@ checkpointer:
resume_from_checkpoint: False
save_adapter_weights_only: False

# Dataset
dataset:
_component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

# Fine-tuning arguments
batch_size: 2
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 64
compile: False

batch_size: 2
gradient_accumulation_steps: 16
optimizer:
_component_: torch.optim.AdamW
weight_decay: 0.01
lr: 3e-4

lr_scheduler:
_component_: torchtune.modules.get_cosine_schedule_with_warmup
num_warmup_steps: 100

loss:
_component_: torch.nn.CrossEntropyLoss
compile: False


# Training environment
# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
dtype: bf16

# Logging
output_dir: /tmp/codellama_lora_finetune_output
metric_logger:
_component_: torchtune.utils.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/lora_code_llama2_finetune_output
log_dir: /tmp/CodeLlama-7b-hf/logs
log_every_n_steps: 1
log_peak_memory_stats: False

# Showcase the usage of PyTorch profiler
# Set enabled to False as it's only needed for debugging training
profiler:
_component_: torchtune.utils.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs
output_dir: /tmp/CodeLlama-7b-hf/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
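
Both LoRA configs pair AdamW with a cosine schedule and 100 warmup steps. As a rough sketch of what such a schedule does, here is a minimal warmup-plus-cosine-decay re-implementation on top of torch.optim.lr_scheduler.LambdaLR; it is illustrative only, not torchtune's get_cosine_schedule_with_warmup, and the total step count is an assumed placeholder.

import math
import torch

def warmup_cosine(optimizer, num_warmup_steps, num_training_steps):
    # Linear warmup from 0 to the base LR, then cosine decay back toward 0.
    def lr_lambda(step):
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)
        progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
        return 0.5 * (1.0 + math.cos(math.pi * progress))
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

# lr, weight_decay, and warmup match the config; 1000 total steps is a made-up example.
params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.AdamW(params, lr=3e-4, weight_decay=0.01)
sched = warmup_cosine(opt, num_warmup_steps=100, num_training_steps=1000)
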
42 changes: 20 additions & 22 deletions recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -1,5 +1,5 @@
# Config for single device QLoRA finetuning in lora_finetune_single_device.py
# using a Code-Llama2 7B model
# using a CodeLlama 7B model
#
# This config assumes that you've run the following command before launching
# this run:
@@ -18,8 +18,8 @@
# Model Arguments
model:
_component_: torchtune.models.code_llama2.qlora_code_llama2_7b
lora_attn_modules: ['q_proj', 'v_proj']
apply_lora_to_mlp: False
lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
apply_lora_to_mlp: True
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
@@ -29,13 +29,7 @@ tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer
path: /tmp/CodeLlama-7b-hf/tokenizer.model

# Dataset
dataset:
_component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True


# Checkpointer
checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
checkpoint_dir: /tmp/CodeLlama-7b-hf
@@ -51,36 +45,40 @@ checkpointer:
resume_from_checkpoint: False
save_adapter_weights_only: False

# Fine-tuning arguments and training
batch_size: 2
# Dataset
dataset:
_component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 64
compile: False

batch_size: 2
gradient_accumulation_steps: 16
optimizer:
_component_: torch.optim.AdamW
weight_decay: 0.01
lr: 3e-4

lr_scheduler:
_component_: torchtune.modules.get_cosine_schedule_with_warmup
num_warmup_steps: 100

loss:
_component_: torch.nn.CrossEntropyLoss
compile: False


# Training environment
# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
dtype: bf16

# Logging
output_dir: /tmp/codellama_qlora_finetune_output
metric_logger:
_component_: torchtune.utils.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/qlora_code_llama2_finetune_output
log_dir: /tmp/CodeLlama-7b-hf/logs
log_every_n_steps: 1
log_peak_memory_stats: False

@@ -91,7 +89,7 @@ profiler:
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs
output_dir: /tmp/CodeLlama-7b-hf/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
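
The QLoRA config now attaches adapters to all four attention projections and the MLP instead of just q_proj and v_proj, which raises the trainable-parameter count but typically improves adaptation quality. Conceptually, each listed module becomes a frozen linear layer plus a trainable low-rank update scaled by lora_alpha / lora_rank. The sketch below is a generic LoRA linear layer for illustration only; it is not torchtune's implementation and omits the 4-bit base-weight quantization that QLoRA adds.

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    # Frozen base projection plus a trainable low-rank update (illustrative sketch).
    def __init__(self, in_features, out_features, rank=8, alpha=16):
        super().__init__()
        self.base = nn.Linear(in_features, out_features, bias=False)
        self.base.weight.requires_grad_(False)   # base weight stays frozen
        self.lora_a = nn.Linear(in_features, rank, bias=False)
        self.lora_b = nn.Linear(rank, out_features, bias=False)
        nn.init.zeros_(self.lora_b.weight)        # adapter starts as a no-op
        self.scaling = alpha / rank               # lora_alpha / lora_rank

    def forward(self, x):
        return self.base(x) + self.scaling * self.lora_b(self.lora_a(x))

# rank/alpha match the config above; the 4096 width is only an example dimension.
layer = LoRALinear(4096, 4096, rank=8, alpha=16)
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
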
2 changes: 1 addition & 1 deletion recipes/configs/phi3/mini_full.yaml
@@ -64,7 +64,7 @@ enable_activation_checkpointing: True
dtype: bf16

# Logging
output_dir: /tmp/phi3_full_finetune_output
output_dir: /tmp/phi3_finetune_output
metric_logger:
_component_: torchtune.utils.metric_logging.DiskLogger
log_dir: /tmp/Phi-3-mini-4k-instruct/logs
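
The Phi3 fixes in this commit only correct output_dir values so artifacts from different recipes do not collide. For context, the DiskLogger configured alongside it writes scalar metrics to files under log_dir; the stand-in below shows the idea only (file naming and record format are assumptions, this is not torchtune's DiskLogger).

import json
import time
from pathlib import Path

class SimpleDiskLogger:
    # Append step/metric records as JSON lines under log_dir (illustrative only).
    def __init__(self, log_dir: str):
        self.dir = Path(log_dir)
        self.dir.mkdir(parents=True, exist_ok=True)
        self.file = self.dir / f"metrics_{int(time.time())}.jsonl"

    def log_dict(self, metrics: dict, step: int) -> None:
        with self.file.open("a") as f:
            f.write(json.dumps({"step": step, **metrics}) + "\n")

logger = SimpleDiskLogger("/tmp/Phi-3-mini-4k-instruct/logs")
logger.log_dict({"loss": 2.31, "lr": 2e-5}, step=1)
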
2 changes: 1 addition & 1 deletion recipes/configs/phi3/mini_full_low_memory.yaml
@@ -68,7 +68,7 @@ enable_activation_checkpointing: True
dtype: bf16

# Logging
output_dir: /tmp/phi3_lora_finetune_output
output_dir: /tmp/phi3_finetune_output
metric_logger:
_component_: torchtune.utils.metric_logging.DiskLogger
log_dir: /tmp/Phi-3-mini-4k-instruct/logs
2 changes: 1 addition & 1 deletion recipes/configs/phi3/mini_lora.yaml
@@ -55,7 +55,7 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 32
gradient_accumulation_steps: 16
optimizer:
_component_: torch.optim.AdamW
weight_decay: 0.01
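
Dropping gradient_accumulation_steps from 32 to 16 halves the effective batch size: with batch_size: 2, each optimizer step now averages gradients over 2 x 16 = 32 samples instead of 64. A minimal sketch of the accumulation pattern these recipes rely on (generic PyTorch, not the recipe's actual training loop):

import torch

batch_size, grad_accum_steps = 2, 16                   # values from mini_lora.yaml
effective_batch_size = batch_size * grad_accum_steps   # 32 samples per optimizer step

model = torch.nn.Linear(8, 1)
opt = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)

for micro_step in range(grad_accum_steps):
    x, y = torch.randn(batch_size, 8), torch.randn(batch_size, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    (loss / grad_accum_steps).backward()   # scale so gradients average over the window
opt.step()                                 # single parameter update per window
opt.zero_grad()
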
4 changes: 2 additions & 2 deletions recipes/configs/phi3/mini_lora_single_device.yaml
@@ -52,8 +52,8 @@ shuffle: True
# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 64
batch_size: 2
gradient_accumulation_steps: 16
optimizer:
_component_: torch.optim.AdamW
weight_decay: 0.01
@@ -69,8 +69,8 @@ compile: False
device: cuda

# Memory management
dtype: bf16
enable_activation_checkpointing: True
dtype: bf16

# Logging
output_dir: /tmp/phi3_lora_finetune_output
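
The last config swaps the order of dtype and enable_activation_checkpointing (cosmetic) and lowers gradient accumulation from 64 to 16 like the others. As a reminder of what enable_activation_checkpointing: True buys, checkpointed blocks recompute their activations during the backward pass instead of caching them, trading extra compute for lower peak memory. A generic sketch with torch.utils.checkpoint, not torchtune's wiring:

import torch
from torch.utils.checkpoint import checkpoint

# A toy block run in bf16, loosely matching dtype: bf16 in the config.
block = torch.nn.Sequential(
    torch.nn.Linear(256, 256), torch.nn.GELU(), torch.nn.Linear(256, 256)
).to(dtype=torch.bfloat16)

x = torch.randn(4, 256, dtype=torch.bfloat16, requires_grad=True)

# Activations inside the block are recomputed during backward rather than stored.
out = checkpoint(block, x, use_reentrant=False)
out.sum().backward()
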
