From 994c4d9086d7a8481b98e97eb8c0fb8204f11453 Mon Sep 17 00:00:00 2001
From: Felipe Mello
Date: Fri, 25 Oct 2024 20:21:34 -0700
Subject: [PATCH] updated configs + qlora

---
 recipes/configs/llama3_2_vision/11B_lora.yaml |  2 +-
 .../11B_lora_single_device.yaml               |  2 +-
 .../configs/llama3_2_vision/11B_qlora.yaml    | 13 ++-
 .../11B_qlora_single_device.yaml              | 15 +--
 recipes/configs/llama3_2_vision/90B_full.yaml |  2 -
 recipes/configs/llama3_2_vision/90B_lora.yaml |  4 +-
 .../configs/llama3_2_vision/90B_qlora.yaml    | 91 +++++++++++++++++++
 torchtune/_recipe_registry.py                 |  8 +-
 8 files changed, 117 insertions(+), 20 deletions(-)
 create mode 100644 recipes/configs/llama3_2_vision/90B_qlora.yaml

diff --git a/recipes/configs/llama3_2_vision/11B_lora.yaml b/recipes/configs/llama3_2_vision/11B_lora.yaml
index 6f18eaa08..3f542fa25 100644
--- a/recipes/configs/llama3_2_vision/11B_lora.yaml
+++ b/recipes/configs/llama3_2_vision/11B_lora.yaml
@@ -47,7 +47,7 @@ checkpointer:
   output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
   model_type: LLAMA3_VISION
 resume_from_checkpoint: False
-save_adapter_weights_only: True # Set to false to save the whole model + adapter merged
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
 
 # Dataset
 dataset:
diff --git a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
index 8b815144c..6bec024e3 100644
--- a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
+++ b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
@@ -45,7 +45,7 @@ checkpointer:
   output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
   model_type: LLAMA3_VISION
 resume_from_checkpoint: False
-save_adapter_weights_only: True # Set to false to save the whole model + adapter merged
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
 
 # Dataset
 dataset:
diff --git a/recipes/configs/llama3_2_vision/11B_qlora.yaml b/recipes/configs/llama3_2_vision/11B_qlora.yaml
index 1217fb367..dd3cf601c 100644
--- a/recipes/configs/llama3_2_vision/11B_qlora.yaml
+++ b/recipes/configs/llama3_2_vision/11B_qlora.yaml
@@ -2,7 +2,7 @@
 # using a Llama3.2 11B Vision Instruct model
 #
 # This config assumes that you've run the following command before launching:
-# tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct
+# tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct --ignore-patterns "original/consolidated*"
 #
 # To launch on 2 devices, run the following command from root:
 # tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_2_vision/11B_qlora
@@ -37,13 +37,16 @@ tokenizer:
 
 # Checkpointer
 checkpointer:
-  _component_: torchtune.training.FullModelMetaCheckpointer
-  checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/original/
-  checkpoint_files: [consolidated.pth]
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00005"
   recipe_checkpoint: null
   output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
   model_type: LLAMA3_VISION
 resume_from_checkpoint: False
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
 
 # Dataset
 dataset:
@@ -62,7 +65,7 @@ optimizer:
   _component_: torch.optim.AdamW
   fused: True
   weight_decay: 0.01
-  lr: 2e-5
+  lr: 1e-4
 lr_scheduler:
   _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
   num_warmup_steps: 100
diff --git a/recipes/configs/llama3_2_vision/11B_qlora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_qlora_single_device.yaml
index b12d51237..6131abe8b 100644
--- a/recipes/configs/llama3_2_vision/11B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_2_vision/11B_qlora_single_device.yaml
@@ -2,7 +2,7 @@
 # using a Llama3.2 11B Vision Instruct model
 #
 # This config assumes that you've run the following command before launching:
-# tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct
+# tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct --ignore-patterns "original/consolidated*"
 #
 # To launch on a single device, run the following command from root:
 # tune run lora_finetune_single_device --config llama3_2_vision/11B_qlora_single_device
@@ -36,13 +36,16 @@ tokenizer:
 
 # Checkpointer
 checkpointer:
-  _component_: torchtune.training.FullModelMetaCheckpointer
-  checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/original/
-  checkpoint_files: [consolidated.pth]
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00005"
   recipe_checkpoint: null
   output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
   model_type: LLAMA3_VISION
 resume_from_checkpoint: False
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
 
 # Dataset
 dataset:
@@ -61,8 +64,8 @@ optimizer:
   _component_: torch.optim.AdamW
   fused: True
   weight_decay: 0.01
-  lr: 2e-5
-optimizer_in_bwd: False
+  lr: 1e-4
+
 lr_scheduler:
   _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
   num_warmup_steps: 100
diff --git a/recipes/configs/llama3_2_vision/90B_full.yaml b/recipes/configs/llama3_2_vision/90B_full.yaml
index 5575785dd..9eee41e49 100644
--- a/recipes/configs/llama3_2_vision/90B_full.yaml
+++ b/recipes/configs/llama3_2_vision/90B_full.yaml
@@ -12,8 +12,6 @@
 # tune run --nproc_per_node 4 full_finetune_distributed --config llama3_2_vision/90B_full checkpointer.checkpoint_dir=
 #
 # This config works best when the model is being fine-tuned on 2+ GPUs.
-# Single device full finetuning requires more memory optimizations. It's
-# best to use 90B_full_single_device.yaml for those cases.
 
 # Model arguments
 model:
diff --git a/recipes/configs/llama3_2_vision/90B_lora.yaml b/recipes/configs/llama3_2_vision/90B_lora.yaml
index 95e166e75..8083357a2 100644
--- a/recipes/configs/llama3_2_vision/90B_lora.yaml
+++ b/recipes/configs/llama3_2_vision/90B_lora.yaml
@@ -12,8 +12,6 @@
 # tune run --nproc_per_node 8 lora_finetune_distributed --config llama3_2_vision/90B_lora checkpointer.checkpoint_dir=
 #
 # This config works best when the model is being fine-tuned on 2+ GPUs.
-# For single device LoRA finetuning please use 90B_lora_single_device.yaml
-# or 90B_qlora_single_device.yaml
 
 # Model arguments
 model:
@@ -47,7 +45,7 @@ checkpointer:
   output_dir: /tmp/Llama-3.2-90B-Vision-Instruct/
   model_type: LLAMA3_VISION
 resume_from_checkpoint: False
-save_adapter_weights_only: True # Set to false to save the whole model + adapter merged
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
 
 # Dataset
 dataset:
diff --git a/recipes/configs/llama3_2_vision/90B_qlora.yaml b/recipes/configs/llama3_2_vision/90B_qlora.yaml
new file mode 100644
index 000000000..868b47da8
--- /dev/null
+++ b/recipes/configs/llama3_2_vision/90B_qlora.yaml
@@ -0,0 +1,91 @@
+# Config for multi-device QLoRA finetuning in lora_finetune_distributed.py
+# using a Llama3.2 90B Vision Instruct model
+#
+# This config assumes that you've run the following command before launching:
+# tune download meta-llama/Llama-3.2-90B-Vision-Instruct --output-dir /tmp/Llama-3.2-90B-Vision-Instruct --ignore-patterns "original/consolidated*"
+#
+# To launch on 2 devices, run the following command from root:
+# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_2_vision/90B_qlora
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training:
+# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_2_vision/90B_qlora checkpointer.checkpoint_dir=
+#
+# This config works best when the model is being fine-tuned on 2+ GPUs.
+# For single device QLoRA finetuning please use 90B_qlora_single_device.yaml
+
+# Model arguments
+model:
+  _component_: torchtune.models.llama3_2_vision.qlora_llama3_2_vision_90b
+  decoder_trainable: "frozen"
+  encoder_trainable: "lora"
+  fusion_trainable: "lora"
+  lora_attn_modules: ['q_proj', 'v_proj']
+  apply_lora_to_mlp: False
+  apply_lora_to_output: False
+  lora_rank: 8
+  lora_alpha: 16
+  lora_dropout: 0.0
+  image_size: 560 # Make sure this matches the image_size in tokenizer
+
+# Transform
+tokenizer:
+  _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
+  path: /tmp/Llama-3.2-90B-Vision-Instruct/original/tokenizer.model
+  image_size: 560
+  max_seq_len: 8192
+
+# Checkpointer
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-3.2-90B-Vision-Instruct/
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00037"
+  recipe_checkpoint: null
+  output_dir: /tmp/Llama-3.2-90B-Vision-Instruct/
+  model_type: LLAMA3_VISION
+resume_from_checkpoint: False
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
+
+# Dataset
+dataset:
+  _component_: torchtune.datasets.multimodal.the_cauldron_dataset
+  subset: ocrvqa
+seed: null
+shuffle: True
+collate_fn: torchtune.data.padded_collate_tiled_images_and_mask
+
+# Fine-tuning arguments
+epochs: 1
+max_steps_per_epoch: null
+batch_size: 2
+gradient_accumulation_steps: 4
+optimizer:
+  _component_: torch.optim.AdamW
+  fused: True
+  weight_decay: 0.01
+  lr: 1e-4
+lr_scheduler:
+  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: 1.0
+compile: False # set it to True for better memory and performance
+
+# Training env
+device: cuda
+
+# Memory management
+enable_activation_checkpointing: True
+enable_activation_offloading: False
+dtype: bf16
+
+# Logging
+output_dir: /tmp/qlora-llama3.2-vision-finetune
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: /tmp/Llama-3.2-90B-Vision-Instruct/logs
+log_every_n_steps: 1
+log_peak_memory_stats: False
diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py
index 5058d9b9e..7602d8e8c 100644
--- a/torchtune/_recipe_registry.py
+++ b/torchtune/_recipe_registry.py
@@ -297,13 +297,17 @@ class Recipe:
             name="llama3_2_vision/11B_lora",
             file_path="llama3_2_vision/11B_lora.yaml",
         ),
+        Config(
+            name="llama3_2_vision/11B_qlora",
+            file_path="llama3_2_vision/11B_qlora.yaml",
+        ),
         Config(
             name="llama3_2_vision/90B_lora",
             file_path="llama3_2_vision/90B_lora.yaml",
         ),
         Config(
-            name="llama3_2_vision/11B_qlora",
-            file_path="llama3_2_vision/11B_qlora.yaml",
+            name="llama3_2_vision/90B_qlora",
+            file_path="llama3_2_vision/90B_qlora.yaml",
         ),
     ],
     supports_distributed=True,